From 5037a00992e5fcb3d8509964313565a3dab6697c Mon Sep 17 00:00:00 2001 From: Sunil Dutt Date: Thu, 25 Jan 2018 17:13:37 +0200 Subject: nl80211: Introduce scan flags to emphasize requested scan behavior This commit defines new scan flags (LOW_SPAN, LOW_POWER, HIGH_LATENCY) to emphasize the requested scan behavior for the driver. These flags are optional and are mutually exclusive. The implementation of the respective functionality can be driver/hardware specific. These flags can be used to control the compromise between how long a scan takes, how much power it uses, and high accurate/complete the scan is in finding the BSSs. Signed-off-by: Sunil Dutt Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ab0c687d0c44..dd249ec9f228 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6715,8 +6715,17 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, *flags = nla_get_u32(attrs[NL80211_ATTR_SCAN_FLAGS]); - if ((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && - !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) + if (((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) && + !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) || + ((*flags & NL80211_SCAN_FLAG_LOW_SPAN) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_LOW_SPAN_SCAN)) || + ((*flags & NL80211_SCAN_FLAG_LOW_POWER) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_LOW_POWER_SCAN)) || + ((*flags & NL80211_SCAN_FLAG_HIGH_ACCURACY) && + !wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN))) return -EOPNOTSUPP; if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { -- cgit v1.2.3-70-g09d2 From 40cbfa90218bc570a7959b436b9d48a18c361041 Mon Sep 17 00:00:00 2001 From: Srinivas Dasari Date: Thu, 25 Jan 2018 17:13:38 +0200 Subject: cfg80211/nl80211: Optional authentication offload to userspace This interface allows the host driver to offload the authentication to user space. This is exclusively defined for host drivers that do not define separate commands for authentication and association, but rely on userspace SME (e.g., in wpa_supplicant for the ~WPA_DRIVER_FLAGS_SME case) for the authentication to happen. This can be used to implement SAE without full implementation in the kernel/firmware while still being able to use NL80211_CMD_CONNECT with driver-based BSS selection. Host driver sends NL80211_CMD_EXTERNAL_AUTH event to start/abort authentication to the port on which connect is triggered and status of authentication is further indicated by user space to host driver through the same command response interface. User space entities advertise this capability through the NL80211_ATTR_EXTERNAL_AUTH_SUPP flag in the NL80211_CMD_CONNECT request. Host drivers shall look at this capability to offload the authentication. Signed-off-by: Srinivas Dasari Signed-off-by: Jouni Malinen [add socket connection ownership check] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 54 +++++++++++++++++++++++-- include/uapi/linux/nl80211.h | 47 ++++++++++++++++++++++ net/wireless/nl80211.c | 94 ++++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 15 +++++++ net/wireless/trace.h | 23 +++++++++++ 5 files changed, 230 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 81174f9b8d14..68def3e5b013 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1905,11 +1905,16 @@ struct cfg80211_auth_request { * @ASSOC_REQ_DISABLE_HT: Disable HT (802.11n) * @ASSOC_REQ_DISABLE_VHT: Disable VHT * @ASSOC_REQ_USE_RRM: Declare RRM capability in this association + * @CONNECT_REQ_EXTERNAL_AUTH_SUPPORT: User space indicates external + * authentication capability. Drivers can offload authentication to + * userspace if this flag is set. Only applicable for cfg80211_connect() + * request (connect callback). */ enum cfg80211_assoc_req_flags { - ASSOC_REQ_DISABLE_HT = BIT(0), - ASSOC_REQ_DISABLE_VHT = BIT(1), - ASSOC_REQ_USE_RRM = BIT(2), + ASSOC_REQ_DISABLE_HT = BIT(0), + ASSOC_REQ_DISABLE_VHT = BIT(1), + ASSOC_REQ_USE_RRM = BIT(2), + CONNECT_REQ_EXTERNAL_AUTH_SUPPORT = BIT(3), }; /** @@ -2600,6 +2605,33 @@ struct cfg80211_pmk_conf { const u8 *pmk_r0_name; }; +/** + * struct cfg80211_external_auth_params - Trigger External authentication. + * + * Commonly used across the external auth request and event interfaces. + * + * @action: action type / trigger for external authentication. Only significant + * for the authentication request event interface (driver to user space). + * @bssid: BSSID of the peer with which the authentication has + * to happen. Used by both the authentication request event and + * authentication response command interface. + * @ssid: SSID of the AP. Used by both the authentication request event and + * authentication response command interface. + * @key_mgmt_suite: AKM suite of the respective authentication. Used by the + * authentication request event interface. + * @status: status code, %WLAN_STATUS_SUCCESS for successful authentication, + * use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space cannot give you + * the real status code for failures. Used only for the authentication + * response command interface (user space to driver). + */ +struct cfg80211_external_auth_params { + enum nl80211_external_auth_action action; + u8 bssid[ETH_ALEN] __aligned(2); + struct cfg80211_ssid ssid; + unsigned int key_mgmt_suite; + u16 status; +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -2923,6 +2955,9 @@ struct cfg80211_pmk_conf { * (invoked with the wireless_dev mutex held) * @del_pmk: delete the previously configured PMK for the given authenticator. * (invoked with the wireless_dev mutex held) + * + * @external_auth: indicates result of offloaded authentication processing from + * user space */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3216,6 +3251,8 @@ struct cfg80211_ops { const struct cfg80211_pmk_conf *conf); int (*del_pmk)(struct wiphy *wiphy, struct net_device *dev, const u8 *aa); + int (*external_auth)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_external_auth_params *params); }; /* @@ -6202,6 +6239,17 @@ void cfg80211_nan_func_terminated(struct wireless_dev *wdev, /* ethtool helper */ void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info); +/** + * cfg80211_external_auth_request - userspace request for authentication + * @netdev: network device + * @params: External authentication parameters + * @gfp: allocation flags + * Returns: 0 on success, < 0 on error + */ +int cfg80211_external_auth_request(struct net_device *netdev, + struct cfg80211_external_auth_params *params, + gfp_t gfp); + /* Logging, debugging and troubleshooting/diagnostic helpers. */ /* wiphy_printk helpers, similar to dev_printk */ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6f60503fa617..c2342456cf16 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -992,6 +992,27 @@ * * @NL80211_CMD_RELOAD_REGDB: Request that the regdb firmware file is reloaded. * + * @NL80211_CMD_EXTERNAL_AUTH: This interface is exclusively defined for host + * drivers that do not define separate commands for authentication and + * association, but rely on user space for the authentication to happen. + * This interface acts both as the event request (driver to user space) + * to trigger the authentication and command response (userspace to + * driver) to indicate the authentication status. + * + * User space uses the %NL80211_CMD_CONNECT command to the host driver to + * trigger a connection. The host driver selects a BSS and further uses + * this interface to offload only the authentication part to the user + * space. Authentication frames are passed between the driver and user + * space through the %NL80211_CMD_FRAME interface. Host driver proceeds + * further with the association after getting successful authentication + * status. User space indicates the authentication status through + * %NL80211_ATTR_STATUS_CODE attribute in %NL80211_CMD_EXTERNAL_AUTH + * command interface. + * + * Host driver reports this status on an authentication failure to the + * user space through the connect result as the user space would have + * initiated the connection through the connect request. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1198,6 +1219,8 @@ enum nl80211_commands { NL80211_CMD_RELOAD_REGDB, + NL80211_CMD_EXTERNAL_AUTH, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2153,6 +2176,16 @@ enum nl80211_commands { * @NL80211_ATTR_PMKR0_NAME: PMK-R0 Name for offloaded FT. * @NL80211_ATTR_PORT_AUTHORIZED: (reserved) * + * @NL80211_ATTR_EXTERNAL_AUTH_ACTION: Identify the requested external + * authentication operation (u32 attribute with an + * &enum nl80211_external_auth_action value). This is used with the + * &NL80211_CMD_EXTERNAL_AUTH request event. + * @NL80211_ATTR_EXTERNAL_AUTH_SUPPORT: Flag attribute indicating that the user + * space supports external authentication. This attribute shall be used + * only with %NL80211_CMD_CONNECT request. The driver may offload + * authentication processing to user space if this capability is indicated + * in NL80211_CMD_CONNECT requests from the user space. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2579,6 +2612,9 @@ enum nl80211_attrs { NL80211_ATTR_PMKR0_NAME, NL80211_ATTR_PORT_AUTHORIZED, + NL80211_ATTR_EXTERNAL_AUTH_ACTION, + NL80211_ATTR_EXTERNAL_AUTH_SUPPORT, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5495,4 +5531,15 @@ enum nl80211_nan_match_attributes { NL80211_NAN_MATCH_ATTR_MAX = NUM_NL80211_NAN_MATCH_ATTR - 1 }; +/** + * nl80211_external_auth_action - Action to perform with external + * authentication request. Used by NL80211_ATTR_EXTERNAL_AUTH_ACTION. + * @NL80211_EXTERNAL_AUTH_START: Start the authentication. + * @NL80211_EXTERNAL_AUTH_ABORT: Abort the ongoing authentication. + */ +enum nl80211_external_auth_action { + NL80211_EXTERNAL_AUTH_START, + NL80211_EXTERNAL_AUTH_ABORT, +}; + #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index dd249ec9f228..aa6b64069c80 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -420,6 +420,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_FILS_CACHE_ID] = { .len = 2 }, [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN }, [NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG }, + [NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -9161,6 +9162,15 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } + if (nla_get_flag(info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])) { + if (!info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + GENL_SET_ERR_MSG(info, + "external auth requires connection ownership"); + return -EINVAL; + } + connect.flags |= CONNECT_REQ_EXTERNAL_AUTH_SUPPORT; + } + wdev_lock(dev->ieee80211_ptr); err = cfg80211_connect(rdev, dev, &connect, connkeys, @@ -12469,6 +12479,41 @@ static int nl80211_del_pmk(struct sk_buff *skb, struct genl_info *info) return ret; } +static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct cfg80211_external_auth_params params; + + if (rdev->ops->external_auth) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_SSID]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_BSSID]) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_STATUS_CODE]) + return -EINVAL; + + memset(¶ms, 0, sizeof(params)); + + params.ssid.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + if (params.ssid.ssid_len == 0 || + params.ssid.ssid_len > IEEE80211_MAX_SSID_LEN) + return -EINVAL; + memcpy(params.ssid.ssid, nla_data(info->attrs[NL80211_ATTR_SSID]), + params.ssid.ssid_len); + + memcpy(params.bssid, nla_data(info->attrs[NL80211_ATTR_BSSID]), + ETH_ALEN); + + params.status = nla_get_u16(info->attrs[NL80211_ATTR_STATUS_CODE]); + + return rdev_external_auth(rdev, dev, ¶ms); +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -13364,6 +13409,14 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_EXTERNAL_AUTH, + .doit = nl80211_external_auth, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; @@ -15375,6 +15428,47 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev) nlmsg_free(msg); } +int cfg80211_external_auth_request(struct net_device *dev, + struct cfg80211_external_auth_params *params, + gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct sk_buff *msg; + void *hdr; + + if (!wdev->conn_owner_nlportid) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_EXTERNAL_AUTH); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || + nla_put_u32(msg, NL80211_ATTR_AKM_SUITES, params->key_mgmt_suite) || + nla_put_u32(msg, NL80211_ATTR_EXTERNAL_AUTH_ACTION, + params->action) || + nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, params->bssid) || + nla_put(msg, NL80211_ATTR_SSID, params->ssid.ssid_len, + params->ssid.ssid)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, + wdev->conn_owner_nlportid); + return 0; + + nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} +EXPORT_SYMBOL(cfg80211_external_auth_request); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 0c06240d25af..84f23ae015fc 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1190,4 +1190,19 @@ static inline int rdev_del_pmk(struct cfg80211_registered_device *rdev, trace_rdev_return_int(&rdev->wiphy, ret); return ret; } + +static inline int +rdev_external_auth(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_external_auth_params *params) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_external_auth(&rdev->wiphy, dev, params); + if (rdev->ops->external_auth) + ret = rdev->ops->external_auth(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index bcfedd39e7a3..5152938b358d 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2319,6 +2319,29 @@ TRACE_EVENT(rdev_del_pmk, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(aa)) ); +TRACE_EVENT(rdev_external_auth, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_external_auth_params *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry(WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(bssid) + __array(u8, ssid, IEEE80211_MAX_SSID_LEN + 1) + __field(u16, status) + ), + TP_fast_assign(WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(bssid, params->bssid); + memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1); + memcpy(__entry->ssid, params->ssid.ssid, + params->ssid.ssid_len); + __entry->status = params->status; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT + ", ssid: %s, status: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->bssid, __entry->ssid, __entry->status) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ -- cgit v1.2.3-70-g09d2 From 10773a7c09b327d02144c7d181e6544b7015ffc7 Mon Sep 17 00:00:00 2001 From: Srinivas Dasari Date: Thu, 25 Jan 2018 17:13:39 +0200 Subject: nl80211: Allow SAE Authentication for NL80211_CMD_CONNECT This commit allows SAE Authentication for NL80211_CMD_CONNECT interface, provided host driver advertises the support. Host drivers may offload the SAE authentication to user space through NL80211_CMD_EXTERNAL_AUTH interface and thus expect the user space to advertise support to handle offload through NL80211_ATTR_EXTERNAL_AUTH_SUPPORT in NL80211_CMD_CONNECT request. Such drivers should reject the connect request on no offload support from user space. Signed-off-by: Srinivas Dasari Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index aa6b64069c80..bdb70fe74e3c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3921,9 +3921,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, return false; return true; case NL80211_CMD_CONNECT: - /* SAE not supported yet */ - if (auth_type == NL80211_AUTHTYPE_SAE) + if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) && + auth_type == NL80211_AUTHTYPE_SAE) return false; + /* FILS with SK PFS or PK not supported yet */ if (auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || auth_type == NL80211_AUTHTYPE_FILS_PK) -- cgit v1.2.3-70-g09d2 From 25b0ba7ebbc5ffa5a64d752b85af78e6e517e3b2 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Fri, 26 Jan 2018 17:14:19 -0800 Subject: mac80211: Add txq flags to debugfs Might help one figure out why aqm drivers may fail to transmit frames sometimes. Signed-off-by: Ben Greear Signed-off-by: Johannes Berg --- net/mac80211/debugfs_sta.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 444ea8d127fe..4105081dc1df 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -160,12 +160,12 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, sta->cparams.ecn ? "yes" : "no"); p += scnprintf(p, bufsz+buf-p, - "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets\n"); + "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n"); for (i = 0; i < IEEE80211_NUM_TIDS; i++) { txqi = to_txq_info(sta->sta.txq[i]); p += scnprintf(p, bufsz+buf-p, - "%d %d %u %u %u %u %u %u %u %u %u\n", + "%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s)\n", txqi->txq.tid, txqi->txq.ac, txqi->tin.backlog_bytes, @@ -176,7 +176,11 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, txqi->tin.overlimit, txqi->tin.collisions, txqi->tin.tx_bytes, - txqi->tin.tx_packets); + txqi->tin.tx_packets, + txqi->flags, + txqi->flags & (1<flags & (1<flags & (1< Date: Wed, 31 Jan 2018 16:24:49 +0530 Subject: cfg80211: Add support to notify station's opmode change to userspace ht/vht action frames will be sent to AP from station to notify change of its ht/vht opmode(max bandwidth, smps mode or nss) modified values. Currently these valuse used by driver/firmware for rate control algorithm. This patch introduces NL80211_CMD_STA_OPMODE_CHANGED command to notify those modified/current supported values(max bandwidth, smps mode, max nss) to userspace application. This will be useful for the application like steering, which closely monitoring station's capability changes. Since the application has taken these values during station association. Signed-off-by: Tamizh chelvam Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 43 ++++++++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 12 ++++++++++ net/wireless/nl80211.c | 55 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 110 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 68def3e5b013..7d49cd0cf92d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3553,6 +3553,35 @@ enum wiphy_vendor_command_flags { WIPHY_VENDOR_CMD_NEED_RUNNING = BIT(2), }; +/** + * enum wiphy_opmode_flag - Station's ht/vht operation mode information flags + * + * @STA_OPMODE_MAX_BW_CHANGED: Max Bandwidth changed + * @STA_OPMODE_SMPS_MODE_CHANGED: SMPS mode changed + * @STA_OPMODE_N_SS_CHANGED: max N_SS (number of spatial streams) changed + * + */ +enum wiphy_opmode_flag { + STA_OPMODE_MAX_BW_CHANGED = BIT(0), + STA_OPMODE_SMPS_MODE_CHANGED = BIT(1), + STA_OPMODE_N_SS_CHANGED = BIT(2), +}; + +/** + * struct sta_opmode_info - Station's ht/vht operation mode information + * @changed: contains value from &enum wiphy_opmode_flag + * @smps_mode: New SMPS mode of a station + * @bw: new max bandwidth value of a station + * @rx_nss: new rx_nss value of a station + */ + +struct sta_opmode_info { + u32 changed; + u8 smps_mode; + u8 bw; + u8 rx_nss; +}; + /** * struct wiphy_vendor_command - vendor command definition * @info: vendor command identifying information, as used in nl80211 @@ -5721,6 +5750,20 @@ void cfg80211_cqm_beacon_loss_notify(struct net_device *dev, gfp_t gfp); void cfg80211_radar_event(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, gfp_t gfp); +/** + * cfg80211_sta_opmode_change_notify - STA's ht/vht operation mode change event + * @dev: network device + * @mac: MAC address of a station which opmode got modified + * @sta_opmode: station's current opmode value + * @gfp: context flags + * + * Driver should call this function when station's opmode modified via action + * frame. + */ +void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac, + struct sta_opmode_info *sta_opmode, + gfp_t gfp); + /** * cfg80211_cac_event - Channel availability check (CAC) event * @netdev: network device diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c2342456cf16..ca3d5a613fc0 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1013,6 +1013,11 @@ * user space through the connect result as the user space would have * initiated the connection through the connect request. * + * @NL80211_CMD_STA_OPMODE_CHANGED: An event that notify station's + * ht opmode or vht opmode changes using any of &NL80211_ATTR_SMPS_MODE, + * &NL80211_ATTR_CHANNEL_WIDTH,&NL80211_ATTR_NSS attributes with its + * address(specified in &NL80211_ATTR_MAC). + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1221,6 +1226,8 @@ enum nl80211_commands { NL80211_CMD_EXTERNAL_AUTH, + NL80211_CMD_STA_OPMODE_CHANGED, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2186,6 +2193,9 @@ enum nl80211_commands { * authentication processing to user space if this capability is indicated * in NL80211_CMD_CONNECT requests from the user space. * + * @NL80211_ATTR_NSS: Station's New/updated RX_NSS value notified using this + * u8 attribute. This is used with %NL80211_CMD_STA_OPMODE_CHANGED. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2615,6 +2625,8 @@ enum nl80211_attrs { NL80211_ATTR_EXTERNAL_AUTH_ACTION, NL80211_ATTR_EXTERNAL_AUTH_SUPPORT, + NL80211_ATTR_NSS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bdb70fe74e3c..cc6ec5bab676 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -14950,6 +14950,61 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev, nlmsg_free(msg); } +void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac, + struct sta_opmode_info *sta_opmode, + gfp_t gfp) +{ + struct sk_buff *msg; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + void *hdr; + + if (WARN_ON(!mac)) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_STA_OPMODE_CHANGED); + if (!hdr) { + nlmsg_free(msg); + return; + } + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx)) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac)) + goto nla_put_failure; + + if ((sta_opmode->changed & STA_OPMODE_SMPS_MODE_CHANGED) && + nla_put_u8(msg, NL80211_ATTR_SMPS_MODE, sta_opmode->smps_mode)) + goto nla_put_failure; + + if ((sta_opmode->changed & STA_OPMODE_MAX_BW_CHANGED) && + nla_put_u8(msg, NL80211_ATTR_CHANNEL_WIDTH, sta_opmode->bw)) + goto nla_put_failure; + + if ((sta_opmode->changed & STA_OPMODE_N_SS_CHANGED) && + nla_put_u8(msg, NL80211_ATTR_NSS, sta_opmode->rx_nss)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, + NL80211_MCGRP_MLME, gfp); + + return; + +nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_sta_opmode_change_notify); + void cfg80211_probe_status(struct net_device *dev, const u8 *addr, u64 cookie, bool acked, gfp_t gfp) { -- cgit v1.2.3-70-g09d2 From ff84e7bfe1763982e1ada390386dbe4739cee1e2 Mon Sep 17 00:00:00 2001 From: "tamizhr@codeaurora.org" Date: Wed, 31 Jan 2018 16:24:50 +0530 Subject: mac80211: Add support to notify ht/vht opmode modification. This will add support to send an event to a userspace application whenever station advertise its ht/vht opmode modification through an action frame. Signed-off-by: Tamizh chelvam Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 14 ++++++++++++++ net/mac80211/vht.c | 9 +++++++++ 2 files changed, 23 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index fd580614085b..e755f93ad735 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2848,6 +2848,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) case WLAN_HT_ACTION_SMPS: { struct ieee80211_supported_band *sband; enum ieee80211_smps_mode smps_mode; + struct sta_opmode_info sta_opmode = {}; /* convert to HT capability */ switch (mgmt->u.action.u.ht_smps.smps_control) { @@ -2868,17 +2869,24 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) if (rx->sta->sta.smps_mode == smps_mode) goto handled; rx->sta->sta.smps_mode = smps_mode; + sta_opmode.smps_mode = smps_mode; + sta_opmode.changed = STA_OPMODE_SMPS_MODE_CHANGED; sband = rx->local->hw.wiphy->bands[status->band]; rate_control_rate_update(local, sband, rx->sta, IEEE80211_RC_SMPS_CHANGED); + cfg80211_sta_opmode_change_notify(sdata->dev, + rx->sta->addr, + &sta_opmode, + GFP_KERNEL); goto handled; } case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: { struct ieee80211_supported_band *sband; u8 chanwidth = mgmt->u.action.u.ht_notify_cw.chanwidth; enum ieee80211_sta_rx_bandwidth max_bw, new_bw; + struct sta_opmode_info sta_opmode = {}; /* If it doesn't support 40 MHz it can't change ... */ if (!(rx->sta->sta.ht_cap.cap & @@ -2899,9 +2907,15 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) rx->sta->sta.bandwidth = new_bw; sband = rx->local->hw.wiphy->bands[status->band]; + sta_opmode.bw = new_bw; + sta_opmode.changed = STA_OPMODE_MAX_BW_CHANGED; rate_control_rate_update(local, sband, rx->sta, IEEE80211_RC_BW_CHANGED); + cfg80211_sta_opmode_change_notify(sdata->dev, + rx->sta->addr, + &sta_opmode, + GFP_KERNEL); goto handled; } default: diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index b9276ac849fa..5714dee76b12 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -447,6 +447,7 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, enum nl80211_band band) { enum ieee80211_sta_rx_bandwidth new_bw; + struct sta_opmode_info sta_opmode = {}; u32 changed = 0; u8 nss; @@ -460,7 +461,9 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, if (sta->sta.rx_nss != nss) { sta->sta.rx_nss = nss; + sta_opmode.rx_nss = nss; changed |= IEEE80211_RC_NSS_CHANGED; + sta_opmode.changed |= STA_OPMODE_N_SS_CHANGED; } switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { @@ -481,9 +484,15 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, new_bw = ieee80211_sta_cur_vht_bw(sta); if (new_bw != sta->sta.bandwidth) { sta->sta.bandwidth = new_bw; + sta_opmode.bw = new_bw; changed |= IEEE80211_RC_BW_CHANGED; + sta_opmode.changed |= STA_OPMODE_MAX_BW_CHANGED; } + if (sta_opmode.changed) + cfg80211_sta_opmode_change_notify(sdata->dev, sta->addr, + &sta_opmode, GFP_KERNEL); + return changed; } -- cgit v1.2.3-70-g09d2 From 62ebdc25c4002e5fc104ab536ce7d99ba76be03f Mon Sep 17 00:00:00 2001 From: Łukasz Rymanowski Date: Fri, 9 Feb 2018 18:26:02 +0100 Subject: Bluetooth: Fix incorrect bits for LE states MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes incorrect checks for LE states. Issues found when doing mgmt tests for scenario when Linux Kernel should do connectable advertising while connected. Signed-off-by: Łukasz Rymanowski Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_request.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 3394e6791673..66c0781773df 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -934,8 +934,8 @@ static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) /* Slave connection state and connectable mode bit 38 * and scannable bit 21. */ - if (connectable && (!(hdev->le_states[4] & 0x01) || - !(hdev->le_states[2] & 0x40))) + if (connectable && (!(hdev->le_states[4] & 0x40) || + !(hdev->le_states[2] & 0x20))) return false; } @@ -948,7 +948,7 @@ static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) /* Master connection state and connectable mode bit 35 and * scannable 19. */ - if (connectable && (!(hdev->le_states[4] & 0x10) || + if (connectable && (!(hdev->le_states[4] & 0x08) || !(hdev->le_states[2] & 0x08))) return false; } -- cgit v1.2.3-70-g09d2 From 9b2c45d479d0fb8647c9e83359df69162b5fbe5f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 12 Feb 2018 20:00:20 +0100 Subject: net: make getname() functions return length rather than use int* parameter Changes since v1: Added changes in these files: drivers/infiniband/hw/usnic/usnic_transport.c drivers/staging/lustre/lnet/lnet/lib-socket.c drivers/target/iscsi/iscsi_target_login.c drivers/vhost/net.c fs/dlm/lowcomms.c fs/ocfs2/cluster/tcp.c security/tomoyo/network.c Before: All these functions either return a negative error indicator, or store length of sockaddr into "int *socklen" parameter and return zero on success. "int *socklen" parameter is awkward. For example, if caller does not care, it still needs to provide on-stack storage for the value it does not need. None of the many FOO_getname() functions of various protocols ever used old value of *socklen. They always just overwrite it. This change drops this parameter, and makes all these functions, on success, return length of sockaddr. It's always >= 0 and can be differentiated from an error. Tests in callers are changed from "if (err)" to "if (err < 0)", where needed. rpc_sockname() lost "int buflen" parameter, since its only use was to be passed to kernel_getsockname() as &buflen and subsequently not used in any way. Userspace API is not changed. text data bss dec hex filename 30108430 2633624 873672 33615726 200ef6e vmlinux.before.o 30108109 2633612 873672 33615393 200ee21 vmlinux.o Signed-off-by: Denys Vlasenko CC: David S. Miller CC: linux-kernel@vger.kernel.org CC: netdev@vger.kernel.org CC: linux-bluetooth@vger.kernel.org CC: linux-decnet-user@lists.sourceforge.net CC: linux-wireless@vger.kernel.org CC: linux-rdma@vger.kernel.org CC: linux-sctp@vger.kernel.org CC: linux-nfs@vger.kernel.org CC: linux-x25@vger.kernel.org Signed-off-by: David S. Miller --- drivers/infiniband/hw/usnic/usnic_transport.c | 5 ++-- drivers/isdn/mISDN/socket.c | 5 ++-- drivers/net/ppp/pppoe.c | 6 ++--- drivers/net/ppp/pptp.c | 6 ++--- drivers/scsi/iscsi_tcp.c | 14 +++++------ drivers/soc/qcom/qmi_interface.c | 3 +-- drivers/staging/ipx/af_ipx.c | 6 ++--- drivers/staging/irda/net/af_irda.c | 8 +++--- drivers/staging/lustre/lnet/lnet/lib-socket.c | 7 +++--- drivers/target/iscsi/iscsi_target_login.c | 18 +++++++------- drivers/vhost/net.c | 7 +++--- fs/dlm/lowcomms.c | 7 +++--- fs/ocfs2/cluster/tcp.c | 6 ++--- include/linux/net.h | 8 +++--- include/net/inet_common.h | 2 +- include/net/ipv6.h | 2 +- include/net/sock.h | 2 +- net/appletalk/ddp.c | 5 ++-- net/atm/pvc.c | 5 ++-- net/atm/svc.c | 5 ++-- net/ax25/af_ax25.c | 4 +-- net/bluetooth/hci_sock.c | 4 +-- net/bluetooth/l2cap_sock.c | 5 ++-- net/bluetooth/rfcomm/sock.c | 5 ++-- net/bluetooth/sco.c | 5 ++-- net/can/raw.c | 6 ++--- net/core/sock.c | 5 ++-- net/decnet/af_decnet.c | 6 ++--- net/ipv4/af_inet.c | 5 ++-- net/ipv6/af_inet6.c | 5 ++-- net/iucv/af_iucv.c | 5 ++-- net/l2tp/l2tp_ip.c | 5 ++-- net/l2tp/l2tp_ip6.c | 5 ++-- net/l2tp/l2tp_ppp.c | 5 ++-- net/llc/af_llc.c | 5 ++-- net/netlink/af_netlink.c | 5 ++-- net/netrom/af_netrom.c | 9 ++++--- net/nfc/llcp_sock.c | 5 ++-- net/packet/af_packet.c | 10 +++----- net/phonet/socket.c | 5 ++-- net/qrtr/qrtr.c | 5 ++-- net/rds/af_rds.c | 5 ++-- net/rds/tcp.c | 7 ++---- net/rose/af_rose.c | 5 ++-- net/sctp/ipv6.c | 8 +++--- net/smc/af_smc.c | 11 ++++----- net/socket.c | 35 +++++++++++++-------------- net/sunrpc/clnt.c | 6 ++--- net/sunrpc/svcsock.c | 13 ++++++---- net/sunrpc/xprtsock.c | 3 +-- net/tipc/socket.c | 5 ++-- net/unix/af_unix.c | 10 ++++---- net/vmw_vsock/af_vsock.c | 4 +-- net/x25/af_x25.c | 4 +-- security/tomoyo/network.c | 5 ++-- 55 files changed, 159 insertions(+), 203 deletions(-) (limited to 'net') diff --git a/drivers/infiniband/hw/usnic/usnic_transport.c b/drivers/infiniband/hw/usnic/usnic_transport.c index de318389a301..67de94343cb4 100644 --- a/drivers/infiniband/hw/usnic/usnic_transport.c +++ b/drivers/infiniband/hw/usnic/usnic_transport.c @@ -174,14 +174,13 @@ void usnic_transport_put_socket(struct socket *sock) int usnic_transport_sock_get_addr(struct socket *sock, int *proto, uint32_t *addr, uint16_t *port) { - int len; int err; struct sockaddr_in sock_addr; err = sock->ops->getname(sock, (struct sockaddr *)&sock_addr, - &len, 0); - if (err) + 0); + if (err < 0) return err; if (sock_addr.sin_family != AF_INET) diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index c5603d1a07d6..1f8f489b4167 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -560,7 +560,7 @@ done: static int data_sock_getname(struct socket *sock, struct sockaddr *addr, - int *addr_len, int peer) + int peer) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; @@ -570,14 +570,13 @@ data_sock_getname(struct socket *sock, struct sockaddr *addr, lock_sock(sk); - *addr_len = sizeof(*maddr); maddr->family = AF_ISDN; maddr->dev = _pms(sk)->dev->id; maddr->channel = _pms(sk)->ch.nr; maddr->sapi = _pms(sk)->ch.addr & 0xff; maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xff; release_sock(sk); - return 0; + return sizeof(*maddr); } static const struct proto_ops data_sock_ops = { diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 5aa59f41bf8c..bd89d1c559ce 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -714,7 +714,7 @@ err_put: } static int pppoe_getname(struct socket *sock, struct sockaddr *uaddr, - int *usockaddr_len, int peer) + int peer) { int len = sizeof(struct sockaddr_pppox); struct sockaddr_pppox sp; @@ -726,9 +726,7 @@ static int pppoe_getname(struct socket *sock, struct sockaddr *uaddr, memcpy(uaddr, &sp, len); - *usockaddr_len = len; - - return 0; + return len; } static int pppoe_ioctl(struct socket *sock, unsigned int cmd, diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index 6dde9a0cfe76..8249d46a7844 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -483,7 +483,7 @@ static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr, } static int pptp_getname(struct socket *sock, struct sockaddr *uaddr, - int *usockaddr_len, int peer) + int peer) { int len = sizeof(struct sockaddr_pppox); struct sockaddr_pppox sp; @@ -496,9 +496,7 @@ static int pptp_getname(struct socket *sock, struct sockaddr *uaddr, memcpy(uaddr, &sp, len); - *usockaddr_len = len; - - return 0; + return len; } static int pptp_release(struct socket *sock) diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 6198559abbd8..0ad00dbf912d 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -732,7 +732,7 @@ static int iscsi_sw_tcp_conn_get_param(struct iscsi_cls_conn *cls_conn, struct iscsi_tcp_conn *tcp_conn = conn->dd_data; struct iscsi_sw_tcp_conn *tcp_sw_conn = tcp_conn->dd_data; struct sockaddr_in6 addr; - int rc, len; + int rc; switch(param) { case ISCSI_PARAM_CONN_PORT: @@ -745,12 +745,12 @@ static int iscsi_sw_tcp_conn_get_param(struct iscsi_cls_conn *cls_conn, } if (param == ISCSI_PARAM_LOCAL_PORT) rc = kernel_getsockname(tcp_sw_conn->sock, - (struct sockaddr *)&addr, &len); + (struct sockaddr *)&addr); else rc = kernel_getpeername(tcp_sw_conn->sock, - (struct sockaddr *)&addr, &len); + (struct sockaddr *)&addr); spin_unlock_bh(&conn->session->frwd_lock); - if (rc) + if (rc < 0) return rc; return iscsi_conn_get_addr_param((struct sockaddr_storage *) @@ -771,7 +771,7 @@ static int iscsi_sw_tcp_host_get_param(struct Scsi_Host *shost, struct iscsi_tcp_conn *tcp_conn; struct iscsi_sw_tcp_conn *tcp_sw_conn; struct sockaddr_in6 addr; - int rc, len; + int rc; switch (param) { case ISCSI_HOST_PARAM_IPADDRESS: @@ -793,9 +793,9 @@ static int iscsi_sw_tcp_host_get_param(struct Scsi_Host *shost, } rc = kernel_getsockname(tcp_sw_conn->sock, - (struct sockaddr *)&addr, &len); + (struct sockaddr *)&addr); spin_unlock_bh(&session->frwd_lock); - if (rc) + if (rc < 0) return rc; return iscsi_conn_get_addr_param((struct sockaddr_storage *) diff --git a/drivers/soc/qcom/qmi_interface.c b/drivers/soc/qcom/qmi_interface.c index 877611d5c42b..321982277697 100644 --- a/drivers/soc/qcom/qmi_interface.c +++ b/drivers/soc/qcom/qmi_interface.c @@ -586,7 +586,6 @@ static struct socket *qmi_sock_create(struct qmi_handle *qmi, struct sockaddr_qrtr *sq) { struct socket *sock; - int sl = sizeof(*sq); int ret; ret = sock_create_kern(&init_net, AF_QIPCRTR, SOCK_DGRAM, @@ -594,7 +593,7 @@ static struct socket *qmi_sock_create(struct qmi_handle *qmi, if (ret < 0) return ERR_PTR(ret); - ret = kernel_getsockname(sock, (struct sockaddr *)sq, &sl); + ret = kernel_getsockname(sock, (struct sockaddr *)sq); if (ret < 0) { sock_release(sock); return ERR_PTR(ret); diff --git a/drivers/staging/ipx/af_ipx.c b/drivers/staging/ipx/af_ipx.c index d21a9d128d3e..5703dd176787 100644 --- a/drivers/staging/ipx/af_ipx.c +++ b/drivers/staging/ipx/af_ipx.c @@ -1577,7 +1577,7 @@ out: static int ipx_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct ipx_address *addr; struct sockaddr_ipx sipx; @@ -1585,8 +1585,6 @@ static int ipx_getname(struct socket *sock, struct sockaddr *uaddr, struct ipx_sock *ipxs = ipx_sk(sk); int rc; - *uaddr_len = sizeof(struct sockaddr_ipx); - lock_sock(sk); if (peer) { rc = -ENOTCONN; @@ -1620,7 +1618,7 @@ static int ipx_getname(struct socket *sock, struct sockaddr *uaddr, sipx.sipx_zero = 0; memcpy(uaddr, &sipx, sizeof(sipx)); - rc = 0; + rc = sizeof(struct sockaddr_ipx); out: release_sock(sk); return rc; diff --git a/drivers/staging/irda/net/af_irda.c b/drivers/staging/irda/net/af_irda.c index 2f1e9ab3d6d0..c13553a9ee11 100644 --- a/drivers/staging/irda/net/af_irda.c +++ b/drivers/staging/irda/net/af_irda.c @@ -697,7 +697,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name) * */ static int irda_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sockaddr_irda saddr; struct sock *sk = sock->sk; @@ -720,11 +720,9 @@ static int irda_getname(struct socket *sock, struct sockaddr *uaddr, pr_debug("%s(), tsap_sel = %#x\n", __func__, saddr.sir_lsap_sel); pr_debug("%s(), addr = %08x\n", __func__, saddr.sir_addr); - /* uaddr_len come to us uninitialised */ - *uaddr_len = sizeof (struct sockaddr_irda); - memcpy(uaddr, &saddr, *uaddr_len); + memcpy(uaddr, &saddr, sizeof (struct sockaddr_irda)); - return 0; + return sizeof (struct sockaddr_irda); } /* diff --git a/drivers/staging/lustre/lnet/lnet/lib-socket.c b/drivers/staging/lustre/lnet/lnet/lib-socket.c index ce93806eefca..1bee667802b0 100644 --- a/drivers/staging/lustre/lnet/lnet/lib-socket.c +++ b/drivers/staging/lustre/lnet/lnet/lib-socket.c @@ -448,14 +448,13 @@ int lnet_sock_getaddr(struct socket *sock, bool remote, __u32 *ip, int *port) { struct sockaddr_in sin; - int len = sizeof(sin); int rc; if (remote) - rc = kernel_getpeername(sock, (struct sockaddr *)&sin, &len); + rc = kernel_getpeername(sock, (struct sockaddr *)&sin); else - rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &len); - if (rc) { + rc = kernel_getsockname(sock, (struct sockaddr *)&sin); + if (rc < 0) { CERROR("Error %d getting sock %s IP/port\n", rc, remote ? "peer" : "local"); return rc; diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 64c5a57b92e4..99501785cdc1 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -1020,7 +1020,7 @@ int iscsit_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) struct socket *new_sock, *sock = np->np_socket; struct sockaddr_in sock_in; struct sockaddr_in6 sock_in6; - int rc, err; + int rc; rc = kernel_accept(sock, &new_sock, 0); if (rc < 0) @@ -1033,8 +1033,8 @@ int iscsit_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) memset(&sock_in6, 0, sizeof(struct sockaddr_in6)); rc = conn->sock->ops->getname(conn->sock, - (struct sockaddr *)&sock_in6, &err, 1); - if (!rc) { + (struct sockaddr *)&sock_in6, 1); + if (rc >= 0) { if (!ipv6_addr_v4mapped(&sock_in6.sin6_addr)) { memcpy(&conn->login_sockaddr, &sock_in6, sizeof(sock_in6)); } else { @@ -1047,8 +1047,8 @@ int iscsit_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) } rc = conn->sock->ops->getname(conn->sock, - (struct sockaddr *)&sock_in6, &err, 0); - if (!rc) { + (struct sockaddr *)&sock_in6, 0); + if (rc >= 0) { if (!ipv6_addr_v4mapped(&sock_in6.sin6_addr)) { memcpy(&conn->local_sockaddr, &sock_in6, sizeof(sock_in6)); } else { @@ -1063,13 +1063,13 @@ int iscsit_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) memset(&sock_in, 0, sizeof(struct sockaddr_in)); rc = conn->sock->ops->getname(conn->sock, - (struct sockaddr *)&sock_in, &err, 1); - if (!rc) + (struct sockaddr *)&sock_in, 1); + if (rc >= 0) memcpy(&conn->login_sockaddr, &sock_in, sizeof(sock_in)); rc = conn->sock->ops->getname(conn->sock, - (struct sockaddr *)&sock_in, &err, 0); - if (!rc) + (struct sockaddr *)&sock_in, 0); + if (rc >= 0) memcpy(&conn->local_sockaddr, &sock_in, sizeof(sock_in)); } diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 610cba276d47..b5fb56b822fd 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1038,7 +1038,7 @@ static struct socket *get_raw_socket(int fd) struct sockaddr_ll sa; char buf[MAX_ADDR_LEN]; } uaddr; - int uaddr_len = sizeof uaddr, r; + int r; struct socket *sock = sockfd_lookup(fd, &r); if (!sock) @@ -1050,9 +1050,8 @@ static struct socket *get_raw_socket(int fd) goto err; } - r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, - &uaddr_len, 0); - if (r) + r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, 0); + if (r < 0) goto err; if (uaddr.sa.sll_family != AF_PACKET) { diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index cff79ea0c01d..5243989a60cc 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -482,7 +482,6 @@ static void lowcomms_error_report(struct sock *sk) { struct connection *con; struct sockaddr_storage saddr; - int buflen; void (*orig_report)(struct sock *) = NULL; read_lock_bh(&sk->sk_callback_lock); @@ -492,7 +491,7 @@ static void lowcomms_error_report(struct sock *sk) orig_report = listen_sock.sk_error_report; if (con->sock == NULL || - kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) { + kernel_getpeername(con->sock, (struct sockaddr *)&saddr) < 0) { printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "sending to node %d, port %d, " "sk_err=%d/%d\n", dlm_our_nodeid(), @@ -757,8 +756,8 @@ static int tcp_accept_from_sock(struct connection *con) /* Get the connected socket's peer */ memset(&peeraddr, 0, sizeof(peeraddr)); - if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, - &len, 2)) { + len = newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, 2); + if (len < 0) { result = -ECONNABORTED; goto accept_err; } diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index eac5140aac47..e5076185cc1e 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1819,7 +1819,7 @@ int o2net_register_hb_callbacks(void) static int o2net_accept_one(struct socket *sock, int *more) { - int ret, slen; + int ret; struct sockaddr_in sin; struct socket *new_sock = NULL; struct o2nm_node *node = NULL; @@ -1864,9 +1864,7 @@ static int o2net_accept_one(struct socket *sock, int *more) goto out; } - slen = sizeof(sin); - ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, - &slen, 1); + ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1); if (ret < 0) goto out; diff --git a/include/linux/net.h b/include/linux/net.h index 91216b16feb7..000d1aada74f 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -146,7 +146,7 @@ struct proto_ops { struct socket *newsock, int flags, bool kern); int (*getname) (struct socket *sock, struct sockaddr *addr, - int *sockaddr_len, int peer); + int peer); __poll_t (*poll) (struct file *file, struct socket *sock, struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, @@ -294,10 +294,8 @@ int kernel_listen(struct socket *sock, int backlog); int kernel_accept(struct socket *sock, struct socket **newsock, int flags); int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); -int kernel_getsockname(struct socket *sock, struct sockaddr *addr, - int *addrlen); -int kernel_getpeername(struct socket *sock, struct sockaddr *addr, - int *addrlen); +int kernel_getsockname(struct socket *sock, struct sockaddr *addr); +int kernel_getpeername(struct socket *sock, struct sockaddr *addr); int kernel_getsockopt(struct socket *sock, int level, int optname, char *optval, int *optlen); int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 5a54c9570977..500f81375200 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -32,7 +32,7 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); -int inet_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, +int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet_ctl_sock_create(struct sock **sk, unsigned short family, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 8606c9113d3f..7a98cd583c73 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1056,7 +1056,7 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); int inet6_release(struct socket *sock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); -int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, +int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); diff --git a/include/net/sock.h b/include/net/sock.h index 169c92afcafa..3aa7b7d6e6c7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1584,7 +1584,7 @@ int sock_no_bind(struct socket *, struct sockaddr *, int); int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, int, bool); -int sock_no_getname(struct socket *, struct sockaddr *, int *, int); +int sock_no_getname(struct socket *, struct sockaddr *, int); __poll_t sock_no_poll(struct file *, struct socket *, struct poll_table_struct *); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 03a9fc0771c0..9b6bc5abe946 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1238,7 +1238,7 @@ out: * fields into the sockaddr. */ static int atalk_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sockaddr_at sat; struct sock *sk = sock->sk; @@ -1251,7 +1251,6 @@ static int atalk_getname(struct socket *sock, struct sockaddr *uaddr, if (atalk_autobind(sk) < 0) goto out; - *uaddr_len = sizeof(struct sockaddr_at); memset(&sat, 0, sizeof(sat)); if (peer) { @@ -1268,9 +1267,9 @@ static int atalk_getname(struct socket *sock, struct sockaddr *uaddr, sat.sat_port = at->src_port; } - err = 0; sat.sat_family = AF_APPLETALK; memcpy(uaddr, &sat, sizeof(sat)); + err = sizeof(struct sockaddr_at); out: release_sock(sk); diff --git a/net/atm/pvc.c b/net/atm/pvc.c index e1140b3bdcaa..2cb10af16afc 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -87,21 +87,20 @@ static int pvc_getsockopt(struct socket *sock, int level, int optname, } static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr, - int *sockaddr_len, int peer) + int peer) { struct sockaddr_atmpvc *addr; struct atm_vcc *vcc = ATM_SD(sock); if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags)) return -ENOTCONN; - *sockaddr_len = sizeof(struct sockaddr_atmpvc); addr = (struct sockaddr_atmpvc *)sockaddr; memset(addr, 0, sizeof(*addr)); addr->sap_family = AF_ATMPVC; addr->sap_addr.itf = vcc->dev->number; addr->sap_addr.vpi = vcc->vpi; addr->sap_addr.vci = vcc->vci; - return 0; + return sizeof(struct sockaddr_atmpvc); } static const struct proto_ops pvc_proto_ops = { diff --git a/net/atm/svc.c b/net/atm/svc.c index c458adcbc177..2f91b766ac42 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -419,15 +419,14 @@ out: } static int svc_getname(struct socket *sock, struct sockaddr *sockaddr, - int *sockaddr_len, int peer) + int peer) { struct sockaddr_atmsvc *addr; - *sockaddr_len = sizeof(struct sockaddr_atmsvc); addr = (struct sockaddr_atmsvc *) sockaddr; memcpy(addr, peer ? &ATM_SD(sock)->remote : &ATM_SD(sock)->local, sizeof(struct sockaddr_atmsvc)); - return 0; + return sizeof(struct sockaddr_atmsvc); } int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos) diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 47fdd399626b..c8319ed48485 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1388,7 +1388,7 @@ out: } static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr; struct sock *sk = sock->sk; @@ -1427,7 +1427,7 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, fsa->fsa_digipeater[0] = null_ax25_address; } } - *uaddr_len = sizeof (struct full_sockaddr_ax25); + err = sizeof (struct full_sockaddr_ax25); out: release_sock(sk); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 923e9a271872..1506e1632394 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1340,7 +1340,7 @@ done: } static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, - int *addr_len, int peer) + int peer) { struct sockaddr_hci *haddr = (struct sockaddr_hci *)addr; struct sock *sk = sock->sk; @@ -1360,10 +1360,10 @@ static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, goto done; } - *addr_len = sizeof(*haddr); haddr->hci_family = AF_BLUETOOTH; haddr->hci_dev = hdev->id; haddr->hci_channel= hci_pi(sk)->channel; + err = sizeof(*haddr); done: release_sock(sk); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 67a8642f57ea..686bdc6b35b0 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -358,7 +358,7 @@ done: } static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, - int *len, int peer) + int peer) { struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr; struct sock *sk = sock->sk; @@ -373,7 +373,6 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, memset(la, 0, sizeof(struct sockaddr_l2)); addr->sa_family = AF_BLUETOOTH; - *len = sizeof(struct sockaddr_l2); la->l2_psm = chan->psm; @@ -387,7 +386,7 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, la->l2_bdaddr_type = chan->src_type; } - return 0; + return sizeof(struct sockaddr_l2); } static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 1aaccf637479..93a3b219db09 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -533,7 +533,7 @@ done: return err; } -static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer) +static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; struct sock *sk = sock->sk; @@ -552,8 +552,7 @@ static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int * else bacpy(&sa->rc_bdaddr, &rfcomm_pi(sk)->src); - *len = sizeof(struct sockaddr_rc); - return 0; + return sizeof(struct sockaddr_rc); } static int rfcomm_sock_sendmsg(struct socket *sock, struct msghdr *msg, diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 08df57665e1f..413b8ee49fec 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -680,7 +680,7 @@ done: } static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, - int *len, int peer) + int peer) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; @@ -688,14 +688,13 @@ static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, BT_DBG("sock %p, sk %p", sock, sk); addr->sa_family = AF_BLUETOOTH; - *len = sizeof(struct sockaddr_sco); if (peer) bacpy(&sa->sco_bdaddr, &sco_pi(sk)->dst); else bacpy(&sa->sco_bdaddr, &sco_pi(sk)->src); - return 0; + return sizeof(struct sockaddr_sco); } static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, diff --git a/net/can/raw.c b/net/can/raw.c index f2ecc43376a1..1051eee82581 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -470,7 +470,7 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) } static int raw_getname(struct socket *sock, struct sockaddr *uaddr, - int *len, int peer) + int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; @@ -483,9 +483,7 @@ static int raw_getname(struct socket *sock, struct sockaddr *uaddr, addr->can_family = AF_CAN; addr->can_ifindex = ro->ifindex; - *len = sizeof(*addr); - - return 0; + return sizeof(*addr); } static int raw_setsockopt(struct socket *sock, int level, int optname, diff --git a/net/core/sock.c b/net/core/sock.c index c501499a04fe..04e5e27c9b81 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1274,7 +1274,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname, { char address[128]; - if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2)) + lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); + if (lv < 0) return -ENOTCONN; if (lv < len) return -EINVAL; @@ -2497,7 +2498,7 @@ int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, EXPORT_SYMBOL(sock_no_accept); int sock_no_getname(struct socket *sock, struct sockaddr *saddr, - int *len, int peer) + int peer) { return -EOPNOTSUPP; } diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 91dd09f79808..45cb5bea884b 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1180,14 +1180,12 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags, } -static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int *uaddr_len,int peer) +static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer) { struct sockaddr_dn *sa = (struct sockaddr_dn *)uaddr; struct sock *sk = sock->sk; struct dn_scp *scp = DN_SK(sk); - *uaddr_len = sizeof(struct sockaddr_dn); - lock_sock(sk); if (peer) { @@ -1205,7 +1203,7 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int *uaddr_len release_sock(sk); - return 0; + return sizeof(struct sockaddr_dn); } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e4329e161943..f98e2f0db841 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -723,7 +723,7 @@ EXPORT_SYMBOL(inet_accept); * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -745,8 +745,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_addr.s_addr = addr; } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); - *uaddr_len = sizeof(*sin); - return 0; + return sizeof(*sin); } EXPORT_SYMBOL(inet_getname); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 416917719a6f..c1e292db04db 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -470,7 +470,7 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock); */ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; @@ -500,8 +500,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); - *uaddr_len = sizeof(*sin); - return 0; + return sizeof(*sin); } EXPORT_SYMBOL(inet6_getname); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 1e8cc7bcbca3..81ce15ffb878 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -989,14 +989,13 @@ done: } static int iucv_sock_getname(struct socket *sock, struct sockaddr *addr, - int *len, int peer) + int peer) { struct sockaddr_iucv *siucv = (struct sockaddr_iucv *) addr; struct sock *sk = sock->sk; struct iucv_sock *iucv = iucv_sk(sk); addr->sa_family = AF_IUCV; - *len = sizeof(struct sockaddr_iucv); if (peer) { memcpy(siucv->siucv_user_id, iucv->dst_user_id, 8); @@ -1009,7 +1008,7 @@ static int iucv_sock_getname(struct socket *sock, struct sockaddr *addr, memset(&siucv->siucv_addr, 0, sizeof(siucv->siucv_addr)); memset(&siucv->siucv_nodeid, 0, sizeof(siucv->siucv_nodeid)); - return 0; + return sizeof(struct sockaddr_iucv); } /** diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index ff61124fdf59..4614585e1720 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -349,7 +349,7 @@ static int l2tp_ip_disconnect(struct sock *sk, int flags) } static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -370,8 +370,7 @@ static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr, lsa->l2tp_conn_id = lsk->conn_id; lsa->l2tp_addr.s_addr = addr; } - *uaddr_len = sizeof(*lsa); - return 0; + return sizeof(*lsa); } static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb) diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 192344688c06..efea58b66295 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -421,7 +421,7 @@ static int l2tp_ip6_disconnect(struct sock *sk, int flags) } static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; struct sock *sk = sock->sk; @@ -449,8 +449,7 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr, } if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL) lsa->l2tp_scope_id = sk->sk_bound_dev_if; - *uaddr_len = sizeof(*lsa); - return 0; + return sizeof(*lsa); } static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 59f246d7b290..99a03c72db4f 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -870,7 +870,7 @@ err: /* getname() support. */ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, - int *usockaddr_len, int peer) + int peer) { int len = 0; int error = 0; @@ -969,8 +969,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, memcpy(uaddr, &sp, len); } - *usockaddr_len = len; - error = 0; + error = len; sock_put(sk); end: diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index c38d16f22d2a..01dcc0823d1f 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -971,7 +971,7 @@ release: * Return the address information of a socket. */ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddrlen, int peer) + int peer) { struct sockaddr_llc sllc; struct sock *sk = sock->sk; @@ -982,7 +982,6 @@ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) goto out; - *uaddrlen = sizeof(sllc); if (peer) { rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) @@ -1003,9 +1002,9 @@ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, IFHWADDRLEN); } } - rc = 0; sllc.sllc_family = AF_LLC; memcpy(uaddr, &sllc, sizeof(sllc)); + rc = sizeof(sllc); out: release_sock(sk); return rc; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 2ad445c1d27c..3c8af14330b5 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1105,7 +1105,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, } static int netlink_getname(struct socket *sock, struct sockaddr *addr, - int *addr_len, int peer) + int peer) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); @@ -1113,7 +1113,6 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, nladdr->nl_family = AF_NETLINK; nladdr->nl_pad = 0; - *addr_len = sizeof(*nladdr); if (peer) { nladdr->nl_pid = nlk->dst_portid; @@ -1124,7 +1123,7 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; netlink_unlock_table(); } - return 0; + return sizeof(*nladdr); } static int netlink_ioctl(struct socket *sock, unsigned int cmd, diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 9ba30c63be3d..35bb6807927f 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -829,11 +829,12 @@ out_release: } static int nr_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct full_sockaddr_ax25 *sax = (struct full_sockaddr_ax25 *)uaddr; struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); + int uaddr_len; memset(&sax->fsa_ax25, 0, sizeof(struct sockaddr_ax25)); @@ -848,16 +849,16 @@ static int nr_getname(struct socket *sock, struct sockaddr *uaddr, sax->fsa_ax25.sax25_call = nr->user_addr; memset(sax->fsa_digipeater, 0, sizeof(sax->fsa_digipeater)); sax->fsa_digipeater[0] = nr->dest_addr; - *uaddr_len = sizeof(struct full_sockaddr_ax25); + uaddr_len = sizeof(struct full_sockaddr_ax25); } else { sax->fsa_ax25.sax25_family = AF_NETROM; sax->fsa_ax25.sax25_ndigis = 0; sax->fsa_ax25.sax25_call = nr->source_addr; - *uaddr_len = sizeof(struct sockaddr_ax25); + uaddr_len = sizeof(struct sockaddr_ax25); } release_sock(sk); - return 0; + return uaddr_len; } int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 376040092142..ea0c0c6f1874 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -497,7 +497,7 @@ error: } static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr, - int *len, int peer) + int peer) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); @@ -510,7 +510,6 @@ static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr, llcp_sock->dsap, llcp_sock->ssap); memset(llcp_addr, 0, sizeof(*llcp_addr)); - *len = sizeof(struct sockaddr_nfc_llcp); lock_sock(sk); if (!llcp_sock->dev) { @@ -528,7 +527,7 @@ static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr, llcp_addr->service_name_len); release_sock(sk); - return 0; + return sizeof(struct sockaddr_nfc_llcp); } static inline __poll_t llcp_accept_poll(struct sock *parent) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e0f3f4aeeb4f..616cb9c18f88 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3409,7 +3409,7 @@ out: } static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct net_device *dev; struct sock *sk = sock->sk; @@ -3424,13 +3424,12 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, if (dev) strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); rcu_read_unlock(); - *uaddr_len = sizeof(*uaddr); - return 0; + return sizeof(*uaddr); } static int packet_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct net_device *dev; struct sock *sk = sock->sk; @@ -3455,9 +3454,8 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr, sll->sll_halen = 0; } rcu_read_unlock(); - *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; - return 0; + return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; } static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, diff --git a/net/phonet/socket.c b/net/phonet/socket.c index fffcd69f63ff..f9b40e6a18a5 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -326,7 +326,7 @@ static int pn_socket_accept(struct socket *sock, struct socket *newsock, } static int pn_socket_getname(struct socket *sock, struct sockaddr *addr, - int *sockaddr_len, int peer) + int peer) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); @@ -337,8 +337,7 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr, pn_sockaddr_set_object((struct sockaddr_pn *)addr, pn->sobject); - *sockaddr_len = sizeof(struct sockaddr_pn); - return 0; + return sizeof(struct sockaddr_pn); } static __poll_t pn_socket_poll(struct file *file, struct socket *sock, diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 5fb3929e3d7d..b33e5aeb4c06 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -893,7 +893,7 @@ static int qrtr_connect(struct socket *sock, struct sockaddr *saddr, } static int qrtr_getname(struct socket *sock, struct sockaddr *saddr, - int *len, int peer) + int peer) { struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sockaddr_qrtr qaddr; @@ -912,12 +912,11 @@ static int qrtr_getname(struct socket *sock, struct sockaddr *saddr, } release_sock(sk); - *len = sizeof(qaddr); qaddr.sq_family = AF_QIPCRTR; memcpy(saddr, &qaddr, sizeof(qaddr)); - return 0; + return sizeof(qaddr); } static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 744c637c86b0..0a8eefd256b3 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -110,7 +110,7 @@ void rds_wake_sk_sleep(struct rds_sock *rs) } static int rds_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; struct rds_sock *rs = rds_sk_to_rs(sock->sk); @@ -131,8 +131,7 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_family = AF_INET; - *uaddr_len = sizeof(*sin); - return 0; + return sizeof(*sin); } /* diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 44c4652721af..08230a145042 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -227,7 +227,6 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, struct rds_tcp_connection *tc; unsigned long flags; struct sockaddr_in sin; - int sinlen; struct socket *sock; spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); @@ -239,12 +238,10 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, sock = tc->t_sock; if (sock) { - sock->ops->getname(sock, (struct sockaddr *)&sin, - &sinlen, 0); + sock->ops->getname(sock, (struct sockaddr *)&sin, 0); tsinfo.local_addr = sin.sin_addr.s_addr; tsinfo.local_port = sin.sin_port; - sock->ops->getname(sock, (struct sockaddr *)&sin, - &sinlen, 1); + sock->ops->getname(sock, (struct sockaddr *)&sin, 1); tsinfo.peer_addr = sin.sin_addr.s_addr; tsinfo.peer_port = sin.sin_port; } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 083bd251406f..5170373b797c 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -938,7 +938,7 @@ out_release: } static int rose_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct full_sockaddr_rose *srose = (struct full_sockaddr_rose *)uaddr; struct sock *sk = sock->sk; @@ -964,8 +964,7 @@ static int rose_getname(struct socket *sock, struct sockaddr *uaddr, srose->srose_digis[n] = rose->source_digis[n]; } - *uaddr_len = sizeof(struct full_sockaddr_rose); - return 0; + return sizeof(struct full_sockaddr_rose); } int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct rose_neigh *neigh, unsigned int lci) diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e35d4f73d2df..0d873c58e516 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -952,16 +952,16 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt, /* Handle SCTP_I_WANT_MAPPED_V4_ADDR for getpeername() and getsockname() */ static int sctp_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { int rc; - rc = inet6_getname(sock, uaddr, uaddr_len, peer); + rc = inet6_getname(sock, uaddr, peer); - if (rc != 0) + if (rc < 0) return rc; - *uaddr_len = sctp_v6_addr_to_user(sctp_sk(sock->sk), + rc = sctp_v6_addr_to_user(sctp_sk(sock->sk), (union sctp_addr *)uaddr); return rc; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index da1a5cdefd13..38ae22b65e77 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -281,7 +281,6 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock, struct in_device *in_dev; struct sockaddr_in addr; int rc = -ENOENT; - int len; if (!dst) { rc = -ENOTCONN; @@ -293,7 +292,7 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock, } /* get address to which the internal TCP socket is bound */ - kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len); + kernel_getsockname(clcsock, (struct sockaddr *)&addr); /* analyze IPv4 specific data of net_device belonging to TCP socket */ rcu_read_lock(); in_dev = __in_dev_get_rcu(dst->dev); @@ -771,7 +770,7 @@ static void smc_listen_work(struct work_struct *work) u8 buf[SMC_CLC_MAX_LEN]; struct smc_link *link; int reason_code = 0; - int rc = 0, len; + int rc = 0; __be32 subnet; u8 prefix_len; u8 ibport; @@ -824,7 +823,7 @@ static void smc_listen_work(struct work_struct *work) } /* get address of the peer connected to the internal TCP socket */ - kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len); + kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr); /* allocate connection / link group */ mutex_lock(&smc_create_lgr_pending); @@ -1075,7 +1074,7 @@ out: } static int smc_getname(struct socket *sock, struct sockaddr *addr, - int *len, int peer) + int peer) { struct smc_sock *smc; @@ -1085,7 +1084,7 @@ static int smc_getname(struct socket *sock, struct sockaddr *addr, smc = smc_sk(sock->sk); - return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer); + return smc->clcsock->ops->getname(smc->clcsock, addr, peer); } static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) diff --git a/net/socket.c b/net/socket.c index a93c99b518ca..fac8246a8ae8 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1573,8 +1573,9 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, goto out_fd; if (upeer_sockaddr) { - if (newsock->ops->getname(newsock, (struct sockaddr *)&address, - &len, 2) < 0) { + len = newsock->ops->getname(newsock, + (struct sockaddr *)&address, 2); + if (len < 0) { err = -ECONNABORTED; goto out_fd; } @@ -1654,7 +1655,7 @@ SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, { struct socket *sock; struct sockaddr_storage address; - int len, err, fput_needed; + int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) @@ -1664,10 +1665,11 @@ SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, if (err) goto out_put; - err = sock->ops->getname(sock, (struct sockaddr *)&address, &len, 0); - if (err) + err = sock->ops->getname(sock, (struct sockaddr *)&address, 0); + if (err < 0) goto out_put; - err = move_addr_to_user(&address, len, usockaddr, usockaddr_len); + /* "err" is actually length in this case */ + err = move_addr_to_user(&address, err, usockaddr, usockaddr_len); out_put: fput_light(sock->file, fput_needed); @@ -1685,7 +1687,7 @@ SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, { struct socket *sock; struct sockaddr_storage address; - int len, err, fput_needed; + int err, fput_needed; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (sock != NULL) { @@ -1695,11 +1697,10 @@ SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, return err; } - err = - sock->ops->getname(sock, (struct sockaddr *)&address, &len, - 1); - if (!err) - err = move_addr_to_user(&address, len, usockaddr, + err = sock->ops->getname(sock, (struct sockaddr *)&address, 1); + if (err >= 0) + /* "err" is actually length in this case */ + err = move_addr_to_user(&address, err, usockaddr, usockaddr_len); fput_light(sock->file, fput_needed); } @@ -3166,17 +3167,15 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, } EXPORT_SYMBOL(kernel_connect); -int kernel_getsockname(struct socket *sock, struct sockaddr *addr, - int *addrlen) +int kernel_getsockname(struct socket *sock, struct sockaddr *addr) { - return sock->ops->getname(sock, addr, addrlen, 0); + return sock->ops->getname(sock, addr, 0); } EXPORT_SYMBOL(kernel_getsockname); -int kernel_getpeername(struct socket *sock, struct sockaddr *addr, - int *addrlen) +int kernel_getpeername(struct socket *sock, struct sockaddr *addr) { - return sock->ops->getname(sock, addr, addrlen, 1); + return sock->ops->getname(sock, addr, 1); } EXPORT_SYMBOL(kernel_getpeername); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 6e432ecd7f99..806395687bb6 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1231,7 +1231,7 @@ static const struct sockaddr_in6 rpc_in6addr_loopback = { * negative errno is returned. */ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, - struct sockaddr *buf, int buflen) + struct sockaddr *buf) { struct socket *sock; int err; @@ -1269,7 +1269,7 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, goto out_release; } - err = kernel_getsockname(sock, buf, &buflen); + err = kernel_getsockname(sock, buf); if (err < 0) { dprintk("RPC: getsockname failed (%d)\n", err); goto out_release; @@ -1353,7 +1353,7 @@ int rpc_localaddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t buflen) rcu_read_unlock(); rpc_set_port(sap, 0); - err = rpc_sockname(net, sap, salen, buf, buflen); + err = rpc_sockname(net, sap, salen, buf); put_net(net); if (err != 0) /* Couldn't discover local address, return ANYADDR */ diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 943f2a745cd5..08cd951aaeea 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -832,12 +832,13 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) } set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); - err = kernel_getpeername(newsock, sin, &slen); + err = kernel_getpeername(newsock, sin); if (err < 0) { net_warn_ratelimited("%s: peername failed (err %d)!\n", serv->sv_name, -err); goto failed; /* aborted connection or whatever */ } + slen = err; /* Ideally, we would want to reject connections from unauthorized * hosts here, but when we get encryption, the IP of the host won't @@ -866,7 +867,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) if (IS_ERR(newsvsk)) goto failed; svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen); - err = kernel_getsockname(newsock, sin, &slen); + err = kernel_getsockname(newsock, sin); + slen = err; if (unlikely(err < 0)) { dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); slen = offsetof(struct sockaddr, sa_data); @@ -1465,7 +1467,8 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, err = PTR_ERR(svsk); goto out; } - if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0) + salen = kernel_getsockname(svsk->sk_sock, sin); + if (salen >= 0) svc_xprt_set_local(&svsk->sk_xprt, sin, salen); svc_add_new_perm_xprt(serv, &svsk->sk_xprt); return svc_one_sock_name(svsk, name_return, len); @@ -1539,10 +1542,10 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, if (error < 0) goto bummer; - newlen = len; - error = kernel_getsockname(sock, newsin, &newlen); + error = kernel_getsockname(sock, newsin); if (error < 0) goto bummer; + newlen = error; if (protocol == IPPROTO_TCP) { if ((error = kernel_listen(sock, 64)) < 0) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index a6b8c1f8f92a..956e29c1438d 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1794,10 +1794,9 @@ static void xs_sock_set_reuseport(struct socket *sock) static unsigned short xs_sock_getport(struct socket *sock) { struct sockaddr_storage buf; - int buflen; unsigned short port = 0; - if (kernel_getsockname(sock, (struct sockaddr *)&buf, &buflen) < 0) + if (kernel_getsockname(sock, (struct sockaddr *)&buf) < 0) goto out; switch (buf.ss_family) { case AF_INET6: diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b0323ec7971e..f93477187a90 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -665,7 +665,7 @@ exit: * a completely predictable manner). */ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; struct sock *sk = sock->sk; @@ -684,13 +684,12 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, addr->addr.id.node = tn->own_addr; } - *uaddr_len = sizeof(*addr); addr->addrtype = TIPC_ADDR_ID; addr->family = AF_TIPC; addr->scope = 0; addr->addr.name.domain = 0; - return 0; + return sizeof(*addr); } /** diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d545e1d0dea2..723698416242 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -637,7 +637,7 @@ static int unix_stream_connect(struct socket *, struct sockaddr *, int addr_len, int flags); static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, int, bool); -static int unix_getname(struct socket *, struct sockaddr *, int *, int); +static int unix_getname(struct socket *, struct sockaddr *, int); static __poll_t unix_poll(struct file *, struct socket *, poll_table *); static __poll_t unix_dgram_poll(struct file *, struct socket *, poll_table *); @@ -1453,7 +1453,7 @@ out: } -static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer) +static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct unix_sock *u; @@ -1476,12 +1476,12 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_ if (!u->addr) { sunaddr->sun_family = AF_UNIX; sunaddr->sun_path[0] = 0; - *uaddr_len = sizeof(short); + err = sizeof(short); } else { struct unix_address *addr = u->addr; - *uaddr_len = addr->len; - memcpy(sunaddr, addr->name, *uaddr_len); + err = addr->len; + memcpy(sunaddr, addr->name, addr->len); } unix_state_unlock(sk); sock_put(sk); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index e0fc84daed94..aac9b8f6552e 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -759,7 +759,7 @@ vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } static int vsock_getname(struct socket *sock, - struct sockaddr *addr, int *addr_len, int peer) + struct sockaddr *addr, int peer) { int err; struct sock *sk; @@ -794,7 +794,7 @@ static int vsock_getname(struct socket *sock, */ BUILD_BUG_ON(sizeof(*vm_addr) > 128); memcpy(addr, vm_addr, sizeof(*vm_addr)); - *addr_len = sizeof(*vm_addr); + err = sizeof(*vm_addr); out: release_sock(sk); diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 562cc11131f6..d49aa79b7997 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -896,7 +896,7 @@ out: } static int x25_getname(struct socket *sock, struct sockaddr *uaddr, - int *uaddr_len, int peer) + int peer) { struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)uaddr; struct sock *sk = sock->sk; @@ -913,7 +913,7 @@ static int x25_getname(struct socket *sock, struct sockaddr *uaddr, sx25->sx25_addr = x25->source_addr; sx25->sx25_family = AF_X25; - *uaddr_len = sizeof(*sx25); + rc = sizeof(*sx25); out: return rc; diff --git a/security/tomoyo/network.c b/security/tomoyo/network.c index cd6932e5225c..9094f4b3b367 100644 --- a/security/tomoyo/network.c +++ b/security/tomoyo/network.c @@ -655,10 +655,11 @@ int tomoyo_socket_listen_permission(struct socket *sock) return 0; { const int error = sock->ops->getname(sock, (struct sockaddr *) - &addr, &addr_len, 0); + &addr, 0); - if (error) + if (error < 0) return error; + addr_len = error; } address.protocol = type; address.operation = TOMOYO_NETWORK_LISTEN; -- cgit v1.2.3-70-g09d2 From 60aa80460da115216070082b4ab686b9e37c86ef Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 30 Jan 2018 14:53:48 +0000 Subject: esp4: remove redundant initialization of pointer esph Pointer esph is being assigned a value that is never read, esph is re-assigned and only read inside an if statement, hence the initialization is redundant and can be removed. Cleans up clang warning: net/ipv4/esp4.c:657:21: warning: Value stored to 'esph' during its initialization is never read Signed-off-by: Colin Ian King Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 296d0b956bfe..97689012b357 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -654,7 +654,7 @@ static void esp_input_restore_header(struct sk_buff *skb) static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi) { struct xfrm_state *x = xfrm_input_state(skb); - struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data; + struct ip_esp_hdr *esph; /* For ESN we move the header forward by 4 bytes to * accomodate the high bits. We will move it back after -- cgit v1.2.3-70-g09d2 From 98f6c533a3e98f21305575f0cf87cdb6c2210c43 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:26:02 +0300 Subject: net: Assign net to net_namespace_list in setup_net() This patch merges two repeating pieces of code in one, and they will live in setup_net() now. The only change is that assignment: init_net_initialized = true; becomes reordered with: list_add_tail_rcu(&net->list, &net_namespace_list); The order does not have visible effect, and it is a simple cleanup because of: init_net_initialized is used in !CONFIG_NET_NS case to order proc_net_ns_ops registration occuring at boot time: start_kernel()->proc_root_init()->proc_net_init(), with net_ns_init()->setup_net(&init_net, &init_user_ns) also occuring in boot time from the same init_task. When there are no another tasks to race with them, for the single task it does not matter, which order two sequential independent loads should be made. So we make them reordered. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/net_namespace.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 3cad5f51afd3..1180c217895a 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -303,6 +303,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) if (error < 0) goto out_undo; } + rtnl_lock(); + list_add_tail_rcu(&net->list, &net_namespace_list); + rtnl_unlock(); out: return error; @@ -424,11 +427,6 @@ struct net *copy_net_ns(unsigned long flags, net->ucounts = ucounts; rv = setup_net(net, user_ns); - if (rv == 0) { - rtnl_lock(); - list_add_tail_rcu(&net->list, &net_namespace_list); - rtnl_unlock(); - } mutex_unlock(&net_mutex); if (rv < 0) { dec_net_namespaces(ucounts); @@ -880,11 +878,6 @@ static int __init net_ns_init(void) panic("Could not setup the initial network namespace"); init_net_initialized = true; - - rtnl_lock(); - list_add_tail_rcu(&init_net.list, &net_namespace_list); - rtnl_unlock(); - mutex_unlock(&net_mutex); register_pernet_subsys(&net_ns_ops); -- cgit v1.2.3-70-g09d2 From 5ba049a5cc8e24a1643df75bbf65b4efa070fa74 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:26:13 +0300 Subject: net: Cleanup in copy_net_ns() Line up destructors actions in the revers order to constructors. Next patches will add more actions, and this will be comfortable, if there is the such order. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/net_namespace.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1180c217895a..81384386f91b 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -411,27 +411,25 @@ struct net *copy_net_ns(unsigned long flags, net = net_alloc(); if (!net) { - dec_net_namespaces(ucounts); - return ERR_PTR(-ENOMEM); + rv = -ENOMEM; + goto dec_ucounts; } - + refcount_set(&net->passive, 1); + net->ucounts = ucounts; get_user_ns(user_ns); rv = mutex_lock_killable(&net_mutex); - if (rv < 0) { - net_free(net); - dec_net_namespaces(ucounts); - put_user_ns(user_ns); - return ERR_PTR(rv); - } + if (rv < 0) + goto put_userns; - net->ucounts = ucounts; rv = setup_net(net, user_ns); mutex_unlock(&net_mutex); if (rv < 0) { - dec_net_namespaces(ucounts); +put_userns: put_user_ns(user_ns); net_drop_ns(net); +dec_ucounts: + dec_net_namespaces(ucounts); return ERR_PTR(rv); } return net; -- cgit v1.2.3-70-g09d2 From 1a57feb847c56d6193f67d0e892c24e71f9e3ab1 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:26:23 +0300 Subject: net: Introduce net_sem for protection of pernet_list Currently, the mutex is mostly used to protect pernet operations list. It orders setup_net() and cleanup_net() with parallel {un,}register_pernet_operations() calls, so ->exit{,batch} methods of the same pernet operations are executed for a dying net, as were used to call ->init methods, even after the net namespace is unlinked from net_namespace_list in cleanup_net(). But there are several problems with scalability. The first one is that more than one net can't be created or destroyed at the same moment on the node. For big machines with many cpus running many containers it's very sensitive. The second one is that it's need to synchronize_rcu() after net is removed from net_namespace_list(): Destroy net_ns: cleanup_net() mutex_lock(&net_mutex) list_del_rcu(&net->list) synchronize_rcu() <--- Sleep there for ages list_for_each_entry_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list) list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list) mutex_unlock(&net_mutex) This primitive is not fast, especially on the systems with many processors and/or when preemptible RCU is enabled in config. So, all the time, while cleanup_net() is waiting for RCU grace period, creation of new net namespaces is not possible, the tasks, who makes it, are sleeping on the same mutex: Create net_ns: copy_net_ns() mutex_lock_killable(&net_mutex) <--- Sleep there for ages I observed 20-30 seconds hangs of "unshare -n" on ordinary 8-cpu laptop with preemptible RCU enabled after CRIU tests round is finished. The solution is to convert net_mutex to the rw_semaphore and add fine grain locks to really small number of pernet_operations, what really need them. Then, pernet_operations::init/::exit methods, modifying the net-related data, will require down_read() locking only, while down_write() will be used for changing pernet_list (i.e., when modules are being loaded and unloaded). This gives signify performance increase, after all patch set is applied, like you may see here: %for i in {1..10000}; do unshare -n bash -c exit; done *before* real 1m40,377s user 0m9,672s sys 0m19,928s *after* real 0m17,007s user 0m5,311s sys 0m11,779 (5.8 times faster) This patch starts replacing net_mutex to net_sem. It adds rw_semaphore, describes the variables it protects, and makes to use, where appropriate. net_mutex is still present, and next patches will kick it out step-by-step. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 + net/core/net_namespace.c | 39 ++++++++++++++++++++++++++------------- net/core/rtnetlink.c | 4 ++-- 3 files changed, 29 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 1fdcde96eb65..e9ee9ad0a681 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -36,6 +36,7 @@ extern int rtnl_is_locked(void); extern wait_queue_head_t netdev_unregistering_wq; extern struct mutex net_mutex; +extern struct rw_semaphore net_sem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 81384386f91b..e89b2b7abd36 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -41,6 +41,11 @@ struct net init_net = { EXPORT_SYMBOL(init_net); static bool init_net_initialized; +/* + * net_sem: protects: pernet_list, net_generic_ids, + * init_net_initialized and first_device pointer. + */ +DECLARE_RWSEM(net_sem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) @@ -286,7 +291,7 @@ struct net *get_net_ns_by_id(struct net *net, int id) */ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) { - /* Must be called with net_mutex held */ + /* Must be called with net_sem held */ const struct pernet_operations *ops, *saved_ops; int error = 0; LIST_HEAD(net_exit_list); @@ -418,12 +423,16 @@ struct net *copy_net_ns(unsigned long flags, net->ucounts = ucounts; get_user_ns(user_ns); - rv = mutex_lock_killable(&net_mutex); + rv = down_read_killable(&net_sem); if (rv < 0) goto put_userns; - + rv = mutex_lock_killable(&net_mutex); + if (rv < 0) + goto up_read; rv = setup_net(net, user_ns); mutex_unlock(&net_mutex); +up_read: + up_read(&net_sem); if (rv < 0) { put_userns: put_user_ns(user_ns); @@ -477,6 +486,7 @@ static void cleanup_net(struct work_struct *work) list_replace_init(&cleanup_list, &net_kill_list); spin_unlock_irq(&cleanup_list_lock); + down_read(&net_sem); mutex_lock(&net_mutex); /* Don't let anyone else find us. */ @@ -517,6 +527,7 @@ static void cleanup_net(struct work_struct *work) ops_free_list(ops, &net_exit_list); mutex_unlock(&net_mutex); + up_read(&net_sem); /* Ensure there are no outstanding rcu callbacks using this * network namespace. @@ -543,8 +554,10 @@ static void cleanup_net(struct work_struct *work) */ void net_ns_barrier(void) { + down_write(&net_sem); mutex_lock(&net_mutex); mutex_unlock(&net_mutex); + up_write(&net_sem); } EXPORT_SYMBOL(net_ns_barrier); @@ -871,12 +884,12 @@ static int __init net_ns_init(void) rcu_assign_pointer(init_net.gen, ng); - mutex_lock(&net_mutex); + down_write(&net_sem); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); init_net_initialized = true; - mutex_unlock(&net_mutex); + up_write(&net_sem); register_pernet_subsys(&net_ns_ops); @@ -1016,9 +1029,9 @@ static void unregister_pernet_operations(struct pernet_operations *ops) int register_pernet_subsys(struct pernet_operations *ops) { int error; - mutex_lock(&net_mutex); + down_write(&net_sem); error = register_pernet_operations(first_device, ops); - mutex_unlock(&net_mutex); + up_write(&net_sem); return error; } EXPORT_SYMBOL_GPL(register_pernet_subsys); @@ -1034,9 +1047,9 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys); */ void unregister_pernet_subsys(struct pernet_operations *ops) { - mutex_lock(&net_mutex); + down_write(&net_sem); unregister_pernet_operations(ops); - mutex_unlock(&net_mutex); + up_write(&net_sem); } EXPORT_SYMBOL_GPL(unregister_pernet_subsys); @@ -1062,11 +1075,11 @@ EXPORT_SYMBOL_GPL(unregister_pernet_subsys); int register_pernet_device(struct pernet_operations *ops) { int error; - mutex_lock(&net_mutex); + down_write(&net_sem); error = register_pernet_operations(&pernet_list, ops); if (!error && (first_device == &pernet_list)) first_device = &ops->list; - mutex_unlock(&net_mutex); + up_write(&net_sem); return error; } EXPORT_SYMBOL_GPL(register_pernet_device); @@ -1082,11 +1095,11 @@ EXPORT_SYMBOL_GPL(register_pernet_device); */ void unregister_pernet_device(struct pernet_operations *ops) { - mutex_lock(&net_mutex); + down_write(&net_sem); if (&ops->list == first_device) first_device = first_device->next; unregister_pernet_operations(ops); - mutex_unlock(&net_mutex); + up_write(&net_sem); } EXPORT_SYMBOL_GPL(unregister_pernet_device); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bc290413a49d..257e7bbaffba 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -454,11 +454,11 @@ static void rtnl_lock_unregistering_all(void) void rtnl_link_unregister(struct rtnl_link_ops *ops) { /* Close the race with cleanup_net() */ - mutex_lock(&net_mutex); + down_write(&net_sem); rtnl_lock_unregistering_all(); __rtnl_link_unregister(ops); rtnl_unlock(); - mutex_unlock(&net_mutex); + up_write(&net_sem); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); -- cgit v1.2.3-70-g09d2 From bcab1ddd9b2b105390712a9c1605bdb20a7f9a03 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:26:33 +0300 Subject: net: Move mutex_unlock() in cleanup_net() up net_sem protects from pernet_list changing, while ops_free_list() makes simple kfree(), and it can't race with other pernet_operations callbacks. So we may release net_mutex earlier then it was. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/net_namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e89b2b7abd36..f8453c438798 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -522,11 +522,12 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list); + mutex_unlock(&net_mutex); + /* Free the net generic variables */ list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list); - mutex_unlock(&net_mutex); up_read(&net_sem); /* Ensure there are no outstanding rcu callbacks using this -- cgit v1.2.3-70-g09d2 From 447cd7a0d7d1e5b4486e99cce289654fec9951e3 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:26:44 +0300 Subject: net: Allow pernet_operations to be executed in parallel This adds new pernet_operations::async flag to indicate operations, which ->init(), ->exit() and ->exit_batch() methods are allowed to be executed in parallel with the methods of any other pernet_operations. When there are only asynchronous pernet_operations in the system, net_mutex won't be taken for a net construction and destruction. Also, remove BUG_ON(mutex_is_locked()) from net_assign_generic() without replacing with the equivalent net_sem check, as there is one more lockdep assert below. v3: Add comment near net_mutex. Suggested-by: Eric W. Biederman Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- include/net/net_namespace.h | 6 ++++++ net/core/net_namespace.c | 30 ++++++++++++++++++++---------- 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index f306b2aa15a4..9158ec1ad06f 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -313,6 +313,12 @@ struct pernet_operations { void (*exit_batch)(struct list_head *net_exit_list); unsigned int *id; size_t size; + /* + * Indicates above methods are allowed to be executed in parallel + * with methods of any other pernet_operations, i.e. they are not + * need synchronization via net_mutex. + */ + bool async; }; /* diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index f8453c438798..2a01ff32d9c7 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -29,6 +29,7 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; +/* Used only if there are !async pernet_operations registered */ DEFINE_MUTEX(net_mutex); LIST_HEAD(net_namespace_list); @@ -41,8 +42,9 @@ struct net init_net = { EXPORT_SYMBOL(init_net); static bool init_net_initialized; +static unsigned nr_sync_pernet_ops; /* - * net_sem: protects: pernet_list, net_generic_ids, + * net_sem: protects: pernet_list, net_generic_ids, nr_sync_pernet_ops, * init_net_initialized and first_device pointer. */ DECLARE_RWSEM(net_sem); @@ -70,11 +72,10 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) { struct net_generic *ng, *old_ng; - BUG_ON(!mutex_is_locked(&net_mutex)); BUG_ON(id < MIN_PERNET_OPS_ID); old_ng = rcu_dereference_protected(net->gen, - lockdep_is_held(&net_mutex)); + lockdep_is_held(&net_sem)); if (old_ng->s.len > id) { old_ng->ptr[id] = data; return 0; @@ -426,11 +427,14 @@ struct net *copy_net_ns(unsigned long flags, rv = down_read_killable(&net_sem); if (rv < 0) goto put_userns; - rv = mutex_lock_killable(&net_mutex); - if (rv < 0) - goto up_read; + if (nr_sync_pernet_ops) { + rv = mutex_lock_killable(&net_mutex); + if (rv < 0) + goto up_read; + } rv = setup_net(net, user_ns); - mutex_unlock(&net_mutex); + if (nr_sync_pernet_ops) + mutex_unlock(&net_mutex); up_read: up_read(&net_sem); if (rv < 0) { @@ -487,7 +491,8 @@ static void cleanup_net(struct work_struct *work) spin_unlock_irq(&cleanup_list_lock); down_read(&net_sem); - mutex_lock(&net_mutex); + if (nr_sync_pernet_ops) + mutex_lock(&net_mutex); /* Don't let anyone else find us. */ rtnl_lock(); @@ -522,7 +527,8 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list); - mutex_unlock(&net_mutex); + if (nr_sync_pernet_ops) + mutex_unlock(&net_mutex); /* Free the net generic variables */ list_for_each_entry_reverse(ops, &pernet_list, list) @@ -994,6 +1000,9 @@ again: rcu_barrier(); if (ops->id) ida_remove(&net_generic_ids, *ops->id); + } else if (!ops->async) { + pr_info_once("Pernet operations %ps are sync.\n", ops); + nr_sync_pernet_ops++; } return error; @@ -1001,7 +1010,8 @@ again: static void unregister_pernet_operations(struct pernet_operations *ops) { - + if (!ops->async) + BUG_ON(nr_sync_pernet_ops-- == 0); __unregister_pernet_operations(ops); rcu_barrier(); if (ops->id) -- cgit v1.2.3-70-g09d2 From 3fc3b827f0c4397c74d4b8a8a06d71b903a4982f Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:27:03 +0300 Subject: net: Convert net_ns_ops methods This patch starts to convert pernet_subsys, registered from pure initcalls. net_ns_ops::net_ns_net_init/net_ns_net_init, methods use only ida_simple_* functions, which are not need a synchronization. They are synchronized by idr subsystem. So, net_ns_ops methods are able to be executed in parallel with methods of other pernet operations. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/net_namespace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 2a01ff32d9c7..e21c564c8c00 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -649,6 +649,7 @@ static __net_exit void net_ns_net_exit(struct net *net) static struct pernet_operations __net_initdata net_ns_ops = { .init = net_ns_net_init, .exit = net_ns_net_exit, + .async = true, }; static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { -- cgit v1.2.3-70-g09d2 From 93d230fe0762bf129fb4debc944af3e1c1e8f40e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:27:13 +0300 Subject: net: Convert sysctl_pernet_ops This patch starts to convert pernet_subsys, registered from core initcalls. Methods sysctl_net_init() and sysctl_net_exit() initialize net::sysctls table of a namespace. pernet_operations::init()/exit() methods from the rest of the list do not touch net::sysctls of strangers, so it's safe to execute sysctl_pernet_ops's methods in parallel with any other pernet_operations. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/sysctl_net.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 9aed6fe1bf1a..f424539829b7 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -89,6 +89,7 @@ static void __net_exit sysctl_net_exit(struct net *net) static struct pernet_operations sysctl_pernet_ops = { .init = sysctl_net_init, .exit = sysctl_net_exit, + .async = true, }; static struct ctl_table_header *net_header; -- cgit v1.2.3-70-g09d2 From 9549929923738d14dc450f0427a4b98ac3e8aff2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:27:23 +0300 Subject: net: Convert netfilter_net_ops Methods netfilter_net_init() and netfilter_net_exit() initialize net::nf::hooks and change net-related proc directory of net. Another pernet_operations are not interested in forein net::nf::hooks or proc entries, so it's safe to make them executed in parallel with methods of other pernet operations. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/netfilter/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 0f6b8172fb9a..d72cc786c7b7 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -629,6 +629,7 @@ static void __net_exit netfilter_net_exit(struct net *net) static struct pernet_operations netfilter_net_ops = { .init = netfilter_net_init, .exit = netfilter_net_exit, + .async = true, }; int __init netfilter_init(void) -- cgit v1.2.3-70-g09d2 From c9d8fb91351fe1514731b7f6d24d4decc0028978 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:27:31 +0300 Subject: net: Convert nf_log_net_ops The pernet_operations would have had a problem in parallel execution with others, if init_net had been able to released. But it's not, and the rest is safe for that. There is memory allocation, which nobody else interested in, and sysctl registration. So, we make them async. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/netfilter/nf_log.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index c2c1b16b7538..1ba3da51050d 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -577,6 +577,7 @@ static void __net_exit nf_log_net_exit(struct net *net) static struct pernet_operations nf_log_net_ops = { .init = nf_log_net_init, .exit = nf_log_net_exit, + .async = true, }; int __init netfilter_log_init(void) -- cgit v1.2.3-70-g09d2 From 604da74e4fc1c2afc50dc7a1850acfd9aff2b58d Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:27:41 +0300 Subject: net: Convert net_inuse_ops net_inuse_ops methods expose statistics in /proc. No one from the rest of pernet_subsys or pernet_device lists touch net::core::inuse. So, it's safe to make net_inuse_ops async. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 04e5e27c9b81..f2bf69b86c58 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3112,6 +3112,7 @@ static void __net_exit sock_inuse_exit_net(struct net *net) static struct pernet_operations net_inuse_ops = { .init = sock_inuse_init_net, .exit = sock_inuse_exit_net, + .async = true, }; static __init int net_inuse_init(void) -- cgit v1.2.3-70-g09d2 From ff291d005a988aaa7d73daf44c3d04585e9f0637 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:27:51 +0300 Subject: net: Convert net_defaults_ops net_defaults_ops introduce only net_defaults_init_net method, and it acts on net::core::sysctl_somaxconn, which is not interesting for the rest of pernet_subsys and pernet_device lists. Then, make them async. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/net_namespace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e21c564c8c00..bcab9a938d6f 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -340,6 +340,7 @@ static int __net_init net_defaults_init_net(struct net *net) static struct pernet_operations net_defaults_ops = { .init = net_defaults_init_net, + .async = true, }; static __init int net_defaults_init(void) -- cgit v1.2.3-70-g09d2 From 194b95d2166661f890dca5ef3b952c73622eb4e5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:28:00 +0300 Subject: net: Convert netlink_net_ops The methods of netlink_net_ops create and destroy "netlink" file, which are not interesting for foreigh pernet_operations. So, netlink_net_ops may safely be made async. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 3c8af14330b5..b3065908e146 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2723,6 +2723,7 @@ static void __init netlink_add_usersock_entry(void) static struct pernet_operations __net_initdata netlink_net_ops = { .init = netlink_net_init, .exit = netlink_net_exit, + .async = true, }; static inline u32 netlink_hash(const void *data, u32 len, u32 seed) -- cgit v1.2.3-70-g09d2 From 46456675ec1b7f93dadd2b1b4b58d763c3ae9266 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:28:15 +0300 Subject: net: Convert rtnetlink_net_ops rtnetlink_net_init() and rtnetlink_net_exit() create and destroy netlink socket net::rtnl. The socket is used to send rtnl notification via rtnl_net_notifyid(). There is no a problem to create and destroy it in parallel with other pernet operations, as we link net in setup_net() after the socket is created, and destroy in cleanup_net() after net is unhashed from all the lists and there is no RCU references on it. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 257e7bbaffba..67f375cfb982 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4724,6 +4724,7 @@ static void __net_exit rtnetlink_net_exit(struct net *net) static struct pernet_operations rtnetlink_net_ops = { .init = rtnetlink_net_init, .exit = rtnetlink_net_exit, + .async = true, }; void __init rtnetlink_init(void) -- cgit v1.2.3-70-g09d2 From 36b0068e6c9892aa4757d4fa08fd14fbba72b3b3 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:28:44 +0300 Subject: net: Convert proto_net_ops This patch starts to convert pernet_subsys, registered from subsys initcalls. It seems safe to be executed in parallel with others, as it's only creates/destoyes proc entry, which nobody else is not interested in. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index f2bf69b86c58..e90d461748f0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3386,6 +3386,7 @@ static __net_exit void proto_exit_net(struct net *net) static __net_initdata struct pernet_operations proto_net_ops = { .init = proto_init_net, .exit = proto_exit_net, + .async = true, }; static int __init proto_init(void) -- cgit v1.2.3-70-g09d2 From 88b8ffebdb4d0f4da4e9a8383c8478c32372b42b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:28:54 +0300 Subject: net: Convert pernet_subsys ops, registered via net_dev_init() There are: 1)dev_proc_ops and dev_mc_net_ops, which create and destroy pernet proc file and not interesting for another net namespaces; 2)netdev_net_ops, which creates pernet hashes, which are not touched by another pernet_operations. So, make them async. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/dev.c | 1 + net/core/net-procfs.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index dda9d7b9a840..dc7506f00a66 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8833,6 +8833,7 @@ static void __net_exit netdev_exit(struct net *net) static struct pernet_operations __net_initdata netdev_net_ops = { .init = netdev_init, .exit = netdev_exit, + .async = true, }; static void __net_exit default_device_exit(struct net *net) diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index e010bb800d7b..65b51e778782 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -349,6 +349,7 @@ static void __net_exit dev_proc_net_exit(struct net *net) static struct pernet_operations __net_initdata dev_proc_ops = { .init = dev_proc_net_init, .exit = dev_proc_net_exit, + .async = true, }; static int dev_mc_seq_show(struct seq_file *seq, void *v) @@ -405,6 +406,7 @@ static void __net_exit dev_mc_net_exit(struct net *net) static struct pernet_operations __net_initdata dev_mc_net_ops = { .init = dev_mc_net_init, .exit = dev_mc_net_exit, + .async = true, }; int __init dev_proc_init(void) -- cgit v1.2.3-70-g09d2 From 86b63418fd382b095fdac4408fd565aa5da4b036 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:29:03 +0300 Subject: net: Convert fib_* pernet_operations, registered via subsys_initcall Both of them create and initialize lists, which are not touched by another foreing pernet_operations. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/fib_notifier.c | 1 + net/core/fib_rules.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 0c048bdeb016..5ace0705a3f9 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -171,6 +171,7 @@ static void __net_exit fib_notifier_net_exit(struct net *net) static struct pernet_operations fib_notifier_net_ops = { .init = fib_notifier_net_init, .exit = fib_notifier_net_exit, + .async = true, }; static int __init fib_notifier_init(void) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 98e1066c3d55..cb071b8e8d17 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1030,6 +1030,7 @@ static void __net_exit fib_rules_net_exit(struct net *net) static struct pernet_operations fib_rules_net_ops = { .init = fib_rules_net_init, .exit = fib_rules_net_exit, + .async = true, }; static int __init fib_rules_init(void) -- cgit v1.2.3-70-g09d2 From 13da199c38ee7f33a1c42db62647118f9f9f527c Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:29:13 +0300 Subject: net: Convert subsys_initcall() registered pernet_operations from net/sched psched_net_ops only creates and destroyes /proc entry, and safe to be executed in parallel with any foreigh pernet_operations. tcf_action_net_ops initializes and destructs tcf_action_net::egdev_ht, which is not touched by foreign pernet_operations. So, make them async. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/sched/act_api.c | 1 + net/sched/sch_api.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index eba6682727dd..4886ea4a7d6e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1454,6 +1454,7 @@ static struct pernet_operations tcf_action_net_ops = { .exit = tcf_action_net_exit, .id = &tcf_action_net_id, .size = sizeof(struct tcf_action_net), + .async = true, }; static int __init tc_action_init(void) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d512f49ee83c..27e672c12492 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -2128,6 +2128,7 @@ static void __net_exit psched_net_exit(struct net *net) static struct pernet_operations psched_net_ops = { .init = psched_net_init, .exit = psched_net_exit, + .async = true, }; static int __init pktsched_init(void) -- cgit v1.2.3-70-g09d2 From 83caf62c867bed2637d4f3713839b0c414d6d966 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:29:23 +0300 Subject: net: Convert genl_pernet_ops This pernet_operations create and destroy net::genl_sock. Foreign pernet_operations don't touch it. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 6f02499ef007..a6f63a5faee7 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1035,6 +1035,7 @@ static void __net_exit genl_pernet_exit(struct net *net) static struct pernet_operations genl_pernet_ops = { .init = genl_pernet_init, .exit = genl_pernet_exit, + .async = true, }; static int __init genl_init(void) -- cgit v1.2.3-70-g09d2 From 6c0075d0f6ccf5be4527408830852106808ab2bb Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:29:33 +0300 Subject: net: Convert wext_pernet_ops These pernet_operations initialize and purge net::wext_nlevents queue, and are not touched by foreign pernet_operations. Mark them async. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/wireless/wext-core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 9efbfc753347..bc7064486b15 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -390,6 +390,7 @@ static void __net_exit wext_pernet_exit(struct net *net) static struct pernet_operations wext_pernet_ops = { .init = wext_pernet_init, .exit = wext_pernet_exit, + .async = true, }; static int __init wireless_nlevent_init(void) -- cgit v1.2.3-70-g09d2 From 232cf06c611f57ccb8e321de3fd850f21215ede8 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:29:42 +0300 Subject: net: Convert sysctl_core_ops These pernet_operations register and destroy sysctl directory, and it's not interesting for foreign pernet_operations. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/sysctl_net_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index f2d0462611c3..d714f65782b7 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -572,6 +572,7 @@ static __net_exit void sysctl_core_net_exit(struct net *net) static __net_initdata struct pernet_operations sysctl_core_ops = { .init = sysctl_core_net_init, .exit = sysctl_core_net_exit, + .async = true, }; static __init int sysctl_core_init(void) -- cgit v1.2.3-70-g09d2 From f84c6821aa540342360067604ad156e3d53a67ed Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:29:52 +0300 Subject: net: Convert pernet_subsys, registered from inet_init() arp_net_ops just addr/removes /proc entry. devinet_ops allocates and frees duplicate of init_net tables and (un)registers sysctl entries. fib_net_ops allocates and frees pernet tables, creates/destroys netlink socket and (un)initializes /proc entries. Foreign pernet_operations do not touch them. ip_rt_proc_ops only modifies pernet /proc entries. xfrm_net_ops creates/destroys /proc entries, allocates/frees pernet statistics, hashes and tables, and (un)initializes sysctl files. These are not touched by foreigh pernet_operations xfrm4_net_ops allocates/frees private pernet memory, and configures sysctls. sysctl_route_ops creates/destroys sysctls. rt_genid_ops only initializes fields of just allocated net. ipv4_inetpeer_ops allocated/frees net private memory. igmp_net_ops just creates/destroys /proc files and socket, noone else interested in. tcp_sk_ops seems to be safe, because tcp_sk_init() does not depend on any other pernet_operations modifications. Iteration over hash table in inet_twsk_purge() is made under RCU lock, and it's safe to iterate the table this way. Removing from the table happen from inet_twsk_deschedule_put(), but this function is safe without any extern locks, as it's synchronized inside itself. There are many examples, it's used in different context. So, it's safe to leave tcp_sk_exit_batch() unlocked. tcp_net_metrics_ops is synchronized on tcp_metrics_lock and safe. udplite4_net_ops only creates/destroys pernet /proc file. icmp_sk_ops creates percpu sockets, not touched by foreign pernet_operations. ipmr_net_ops creates/destroys pernet fib tables, (un)registers fib rules and /proc files. This seem to be safe to execute in parallel with foreign pernet_operations. af_inet_ops just sets up default parameters of newly created net. ipv4_mib_ops creates and destroys pernet percpu statistics. raw_net_ops, tcp4_net_ops, udp4_net_ops, ping_v4_net_ops and ip_proc_ops only create/destroy pernet /proc files. ip4_frags_ops creates and destroys sysctl file. So, it's safe to make the pernet_operations async. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 ++ net/ipv4/arp.c | 1 + net/ipv4/devinet.c | 1 + net/ipv4/fib_frontend.c | 1 + net/ipv4/icmp.c | 1 + net/ipv4/igmp.c | 1 + net/ipv4/ip_fragment.c | 1 + net/ipv4/ipmr.c | 1 + net/ipv4/ping.c | 1 + net/ipv4/proc.c | 1 + net/ipv4/raw.c | 1 + net/ipv4/route.c | 4 ++++ net/ipv4/tcp_ipv4.c | 2 ++ net/ipv4/tcp_metrics.c | 1 + net/ipv4/udp.c | 1 + net/ipv4/udplite.c | 1 + net/ipv4/xfrm4_policy.c | 1 + net/xfrm/xfrm_policy.c | 1 + 18 files changed, 23 insertions(+) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f98e2f0db841..e8c7fad8c329 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1735,6 +1735,7 @@ static __net_exit void ipv4_mib_exit_net(struct net *net) static __net_initdata struct pernet_operations ipv4_mib_ops = { .init = ipv4_mib_init_net, .exit = ipv4_mib_exit_net, + .async = true, }; static int __init init_ipv4_mibs(void) @@ -1788,6 +1789,7 @@ static __net_exit void inet_exit_net(struct net *net) static __net_initdata struct pernet_operations af_inet_ops = { .init = inet_init_net, .exit = inet_exit_net, + .async = true, }; static int __init init_inet_pernet_ops(void) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f28f06c91ead..7dc9de8444a9 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1447,6 +1447,7 @@ static void __net_exit arp_net_exit(struct net *net) static struct pernet_operations arp_net_ops = { .init = arp_net_init, .exit = arp_net_exit, + .async = true, }; static int __init arp_proc_init(void) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 40f001782c1b..5ae0d1f097ca 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2469,6 +2469,7 @@ static __net_exit void devinet_exit_net(struct net *net) static __net_initdata struct pernet_operations devinet_ops = { .init = devinet_init_net, .exit = devinet_exit_net, + .async = true, }; static struct rtnl_af_ops inet_af_ops __read_mostly = { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index f05afaf3235c..ac71c3d496c0 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1362,6 +1362,7 @@ static void __net_exit fib_net_exit(struct net *net) static struct pernet_operations fib_net_ops = { .init = fib_net_init, .exit = fib_net_exit, + .async = true, }; void __init ip_fib_init(void) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1617604c9284..cc56efa64d5c 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1257,6 +1257,7 @@ fail: static struct pernet_operations __net_initdata icmp_sk_ops = { .init = icmp_sk_init, .exit = icmp_sk_exit, + .async = true, }; int __init icmp_init(void) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index f2402581fef1..c2743763777e 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -3028,6 +3028,7 @@ static void __net_exit igmp_net_exit(struct net *net) static struct pernet_operations igmp_net_ops = { .init = igmp_net_init, .exit = igmp_net_exit, + .async = true, }; #endif diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index bbf1b94942c0..5e843ae5e468 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -885,6 +885,7 @@ static void __net_exit ipv4_frags_exit_net(struct net *net) static struct pernet_operations ip4_frags_ops = { .init = ipv4_frags_init_net, .exit = ipv4_frags_exit_net, + .async = true, }; void __init ipfrag_init(void) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index b05689bbba31..7c7ac9d32e77 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3327,6 +3327,7 @@ static void __net_exit ipmr_net_exit(struct net *net) static struct pernet_operations ipmr_net_ops = { .init = ipmr_net_init, .exit = ipmr_net_exit, + .async = true, }; int __init ip_mr_init(void) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index b8f0db54b197..0164def9c808 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1204,6 +1204,7 @@ static void __net_exit ping_v4_proc_exit_net(struct net *net) static struct pernet_operations ping_v4_net_ops = { .init = ping_v4_proc_init_net, .exit = ping_v4_proc_exit_net, + .async = true, }; int __init ping_proc_init(void) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index dc5edc8f7564..fdabc70283b6 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -549,6 +549,7 @@ static __net_exit void ip_proc_exit_net(struct net *net) static __net_initdata struct pernet_operations ip_proc_ops = { .init = ip_proc_init_net, .exit = ip_proc_exit_net, + .async = true, }; int __init ip_misc_proc_init(void) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 9b367fc48d7d..54648d20bf0f 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -1156,6 +1156,7 @@ static __net_exit void raw_exit_net(struct net *net) static __net_initdata struct pernet_operations raw_net_ops = { .init = raw_init_net, .exit = raw_exit_net, + .async = true, }; int __init raw_proc_init(void) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 49cc1c1df1ba..9376ed69ffeb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -417,6 +417,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net) static struct pernet_operations ip_rt_proc_ops __net_initdata = { .init = ip_rt_do_proc_init, .exit = ip_rt_do_proc_exit, + .async = true, }; static int __init ip_rt_proc_init(void) @@ -2994,6 +2995,7 @@ static __net_exit void sysctl_route_net_exit(struct net *net) static __net_initdata struct pernet_operations sysctl_route_ops = { .init = sysctl_route_net_init, .exit = sysctl_route_net_exit, + .async = true, }; #endif @@ -3007,6 +3009,7 @@ static __net_init int rt_genid_init(struct net *net) static __net_initdata struct pernet_operations rt_genid_ops = { .init = rt_genid_init, + .async = true, }; static int __net_init ipv4_inetpeer_init(struct net *net) @@ -3032,6 +3035,7 @@ static void __net_exit ipv4_inetpeer_exit(struct net *net) static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { .init = ipv4_inetpeer_init, .exit = ipv4_inetpeer_exit, + .async = true, }; #ifdef CONFIG_IP_ROUTE_CLASSID diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f8ad397e285e..ac16795486ea 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2387,6 +2387,7 @@ static void __net_exit tcp4_proc_exit_net(struct net *net) static struct pernet_operations tcp4_net_ops = { .init = tcp4_proc_init_net, .exit = tcp4_proc_exit_net, + .async = true, }; int __init tcp4_proc_init(void) @@ -2573,6 +2574,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { .init = tcp_sk_init, .exit = tcp_sk_exit, .exit_batch = tcp_sk_exit_batch, + .async = true, }; void __init tcp_v4_init(void) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 03b51cdcc731..aa6fea9f3328 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1024,6 +1024,7 @@ static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_lis static __net_initdata struct pernet_operations tcp_net_metrics_ops = { .init = tcp_net_metrics_init, .exit_batch = tcp_net_metrics_exit_batch, + .async = true, }; void __init tcp_metrics_init(void) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index bfaefe560b5c..ac5fac0e59b1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2757,6 +2757,7 @@ static void __net_exit udp4_proc_exit_net(struct net *net) static struct pernet_operations udp4_net_ops = { .init = udp4_proc_init_net, .exit = udp4_proc_exit_net, + .async = true, }; int __init udp4_proc_init(void) diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index f96614e9b9a5..72f2c3806408 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -104,6 +104,7 @@ static void __net_exit udplite4_proc_exit_net(struct net *net) static struct pernet_operations udplite4_net_ops = { .init = udplite4_proc_init_net, .exit = udplite4_proc_exit_net, + .async = true, }; static __init int udplite4_proc_init(void) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 05017e2c849c..753f526cf9db 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -365,6 +365,7 @@ static void __net_exit xfrm4_net_exit(struct net *net) static struct pernet_operations __net_initdata xfrm4_net_ops = { .init = xfrm4_net_init, .exit = xfrm4_net_exit, + .async = true, }; static void __init xfrm4_policy_init(void) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7a23078132cf..77d9d1ab05ce 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2982,6 +2982,7 @@ static void __net_exit xfrm_net_exit(struct net *net) static struct pernet_operations __net_initdata xfrm_net_ops = { .init = xfrm_net_init, .exit = xfrm_net_exit, + .async = true, }; void __init xfrm_init(void) -- cgit v1.2.3-70-g09d2 From 167f7ac723e5b4ea22c44a0bd8e357bb76a68cd2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:30:00 +0300 Subject: net: Convert unix_net_ops These pernet_operations are just create and destroy /proc and sysctl entries, and are not touched by foreign pernet_operations. So, we are able to make them async. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/unix/af_unix.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 723698416242..e3eb8806b3e4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2913,6 +2913,7 @@ static void __net_exit unix_net_exit(struct net *net) static struct pernet_operations unix_net_ops = { .init = unix_net_init, .exit = unix_net_exit, + .async = true, }; static int __init af_unix_init(void) -- cgit v1.2.3-70-g09d2 From cb5e3400e78598e1eb872954516a02ba85926d84 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:30:08 +0300 Subject: net: Convert packet_net_ops These pernet_operations just create and destroy /proc entry, and another operations do not touch it. Also, nobody else are interested in foreign net::packet::sklist. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/packet/af_packet.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 616cb9c18f88..2c5a6fe5d749 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4557,6 +4557,7 @@ static void __net_exit packet_net_exit(struct net *net) static struct pernet_operations packet_net_ops = { .init = packet_net_init, .exit = packet_net_exit, + .async = true, }; -- cgit v1.2.3-70-g09d2 From 22769a2a6e93a17d08e1cd7a2de2749088887b87 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:30:18 +0300 Subject: net: Convert ipv4_sysctl_ops These pernet_operations create and destroy sysctl, which are not touched by anybody else. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 93e172118a94..89683d868b37 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1219,6 +1219,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net) static __net_initdata struct pernet_operations ipv4_sysctl_ops = { .init = ipv4_sysctl_init_net, .exit = ipv4_sysctl_exit_net, + .async = true, }; static __init int sysctl_ipv4_init(void) -- cgit v1.2.3-70-g09d2 From 0bc9be67185ec90feedc971af6e7b84ab932703a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:30:27 +0300 Subject: net: Convert addrconf_ops These pernet_operations (un)register sysctl, which are not touched by anybody else. So, it's safe to make them async. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e1846b97ee69..8c17f8d8d5d9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6550,6 +6550,7 @@ static void __net_exit addrconf_exit_net(struct net *net) static struct pernet_operations addrconf_ops = { .init = addrconf_init_net, .exit = addrconf_exit_net, + .async = true, }; static struct rtnl_af_ops inet6_ops __read_mostly = { -- cgit v1.2.3-70-g09d2 From 2608e6b7adc8b07194b855e2102d6f1a277e3f03 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:30:42 +0300 Subject: net: Convert default_device_ops These pernet operations consist of exit() and exit_batch() methods. default_device_exit() moves not-local and virtual devices to init_net. There is nothing exciting, because this may happen in any time on a working system, and rtnl_lock() and synchronize_net() protect us from all cases of external dereference. The same for default_device_exit_batch(). Similar unregisteration may happen in any time on a system. Here several lists (like todo_list), which are accessed under rtnl_lock(). After rtnl_unlock() and netdev_run_todo() all the devices are flushed. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index dc7506f00a66..df5241c8eda1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8934,6 +8934,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) static struct pernet_operations __net_initdata default_device_ops = { .exit = default_device_exit, .exit_batch = default_device_exit_batch, + .async = true, }; /* -- cgit v1.2.3-70-g09d2 From 59a513587ac09359875d6074778f21a3c01754e0 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:30:52 +0300 Subject: net: Convert diag_net_ops These pernet operations just create and destroy netlink socket. The socket is pernet and else operations don't touch it. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/core/sock_diag.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 146b50e30659..aee5642affd9 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -328,6 +328,7 @@ static void __net_exit diag_net_exit(struct net *net) static struct pernet_operations diag_net_ops = { .init = diag_net_init, .exit = diag_net_exit, + .async = true, }; static int __init sock_diag_init(void) -- cgit v1.2.3-70-g09d2 From b86b47a39598cdf17e0e826ebe1be21c798112cf Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:31:01 +0300 Subject: net: Convert netlink_tap_net_ops These pernet_operations init just allocated net memory, and they obviously can be executed in parallel in any others. v3: New Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b3065908e146..63cb55d3c2fd 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -253,6 +253,7 @@ static struct pernet_operations netlink_tap_net_ops = { .exit = netlink_tap_exit_net, .id = &netlink_tap_net_id, .size = sizeof(struct netlink_tap_net), + .async = true, }; static bool netlink_filter_tap(const struct sk_buff *skb) -- cgit v1.2.3-70-g09d2 From ff22b5bf78fe2a9f2a26950cc2ea180180bef47f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 13 Feb 2018 19:33:20 +0800 Subject: sctp: rename sctp_diag.c as diag.c Remove 'sctp_' prefix for diag file, to keep consistent with other files' names. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/Makefile | 2 + net/sctp/diag.c | 526 +++++++++++++++++++++++++++++++++++++++++++++++++++ net/sctp/sctp_diag.c | 526 --------------------------------------------------- 3 files changed, 528 insertions(+), 526 deletions(-) create mode 100644 net/sctp/diag.c delete mode 100644 net/sctp/sctp_diag.c (limited to 'net') diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 6776582ec449..e845e4588535 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -15,6 +15,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ offload.o stream_sched.o stream_sched_prio.o \ stream_sched_rr.o stream_interleave.o +sctp_diag-y := diag.o + sctp-$(CONFIG_SCTP_DBG_OBJCNT) += objcnt.o sctp-$(CONFIG_PROC_FS) += proc.o sctp-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/sctp/diag.c b/net/sctp/diag.c new file mode 100644 index 000000000000..a72a7d925d46 --- /dev/null +++ b/net/sctp/diag.c @@ -0,0 +1,526 @@ +#include +#include +#include +#include + +static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *info); + +/* define some functions to make asoc/ep fill look clean */ +static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r, + struct sock *sk, + struct sctp_association *asoc) +{ + union sctp_addr laddr, paddr; + struct dst_entry *dst; + struct timer_list *t3_rtx = &asoc->peer.primary_path->T3_rtx_timer; + + laddr = list_entry(asoc->base.bind_addr.address_list.next, + struct sctp_sockaddr_entry, list)->a; + paddr = asoc->peer.primary_path->ipaddr; + dst = asoc->peer.primary_path->dst; + + r->idiag_family = sk->sk_family; + r->id.idiag_sport = htons(asoc->base.bind_addr.port); + r->id.idiag_dport = htons(asoc->peer.port); + r->id.idiag_if = dst ? dst->dev->ifindex : 0; + sock_diag_save_cookie(sk, r->id.idiag_cookie); + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + *(struct in6_addr *)r->id.idiag_src = laddr.v6.sin6_addr; + *(struct in6_addr *)r->id.idiag_dst = paddr.v6.sin6_addr; + } else +#endif + { + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + + r->id.idiag_src[0] = laddr.v4.sin_addr.s_addr; + r->id.idiag_dst[0] = paddr.v4.sin_addr.s_addr; + } + + r->idiag_state = asoc->state; + if (timer_pending(t3_rtx)) { + r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX; + r->idiag_retrans = asoc->rtx_data_chunks; + r->idiag_expires = jiffies_to_msecs(t3_rtx->expires - jiffies); + } else { + r->idiag_timer = 0; + r->idiag_retrans = 0; + r->idiag_expires = 0; + } +} + +static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, + struct list_head *address_list) +{ + struct sctp_sockaddr_entry *laddr; + int addrlen = sizeof(struct sockaddr_storage); + int addrcnt = 0; + struct nlattr *attr; + void *info = NULL; + + list_for_each_entry_rcu(laddr, address_list, list) + addrcnt++; + + attr = nla_reserve(skb, INET_DIAG_LOCALS, addrlen * addrcnt); + if (!attr) + return -EMSGSIZE; + + info = nla_data(attr); + list_for_each_entry_rcu(laddr, address_list, list) { + memcpy(info, &laddr->a, sizeof(laddr->a)); + memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a)); + info += addrlen; + } + + return 0; +} + +static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb, + struct sctp_association *asoc) +{ + int addrlen = sizeof(struct sockaddr_storage); + struct sctp_transport *from; + struct nlattr *attr; + void *info = NULL; + + attr = nla_reserve(skb, INET_DIAG_PEERS, + addrlen * asoc->peer.transport_count); + if (!attr) + return -EMSGSIZE; + + info = nla_data(attr); + list_for_each_entry(from, &asoc->peer.transport_addr_list, + transports) { + memcpy(info, &from->ipaddr, sizeof(from->ipaddr)); + memset(info + sizeof(from->ipaddr), 0, + addrlen - sizeof(from->ipaddr)); + info += addrlen; + } + + return 0; +} + +/* sctp asoc/ep fill*/ +static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, + struct sk_buff *skb, + const struct inet_diag_req_v2 *req, + struct user_namespace *user_ns, + int portid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh, + bool net_admin) +{ + struct sctp_endpoint *ep = sctp_sk(sk)->ep; + struct list_head *addr_list; + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + int ext = req->idiag_ext; + struct sctp_infox infox; + void *info = NULL; + + nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), + nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + BUG_ON(!sk_fullsock(sk)); + + if (asoc) { + inet_diag_msg_sctpasoc_fill(r, sk, asoc); + } else { + inet_diag_msg_common_fill(r, sk); + r->idiag_state = sk->sk_state; + r->idiag_timer = 0; + r->idiag_retrans = 0; + } + + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) + goto errout; + + if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) { + u32 mem[SK_MEMINFO_VARS]; + int amt; + + if (asoc && asoc->ep->sndbuf_policy) + amt = asoc->sndbuf_used; + else + amt = sk_wmem_alloc_get(sk); + mem[SK_MEMINFO_WMEM_ALLOC] = amt; + if (asoc && asoc->ep->rcvbuf_policy) + amt = atomic_read(&asoc->rmem_alloc); + else + amt = sk_rmem_alloc_get(sk); + mem[SK_MEMINFO_RMEM_ALLOC] = amt; + mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; + mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; + mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; + mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; + mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); + mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + + if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) + goto errout; + } + + if (ext & (1 << (INET_DIAG_INFO - 1))) { + struct nlattr *attr; + + attr = nla_reserve_64bit(skb, INET_DIAG_INFO, + sizeof(struct sctp_info), + INET_DIAG_PAD); + if (!attr) + goto errout; + + info = nla_data(attr); + } + infox.sctpinfo = (struct sctp_info *)info; + infox.asoc = asoc; + sctp_diag_get_info(sk, r, &infox); + + addr_list = asoc ? &asoc->base.bind_addr.address_list + : &ep->base.bind_addr.address_list; + if (inet_diag_msg_sctpladdrs_fill(skb, addr_list)) + goto errout; + + if (asoc && (ext & (1 << (INET_DIAG_CONG - 1)))) + if (nla_put_string(skb, INET_DIAG_CONG, "reno") < 0) + goto errout; + + if (asoc && inet_diag_msg_sctpaddrs_fill(skb, asoc)) + goto errout; + + nlmsg_end(skb, nlh); + return 0; + +errout: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +/* callback and param */ +struct sctp_comm_param { + struct sk_buff *skb; + struct netlink_callback *cb; + const struct inet_diag_req_v2 *r; + const struct nlmsghdr *nlh; + bool net_admin; +}; + +static size_t inet_assoc_attr_size(struct sctp_association *asoc) +{ + int addrlen = sizeof(struct sockaddr_storage); + int addrcnt = 0; + struct sctp_sockaddr_entry *laddr; + + list_for_each_entry_rcu(laddr, &asoc->base.bind_addr.address_list, + list) + addrcnt++; + + return nla_total_size(sizeof(struct sctp_info)) + + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + + nla_total_size(1) /* INET_DIAG_TOS */ + + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(4) /* INET_DIAG_MARK */ + + nla_total_size(addrlen * asoc->peer.transport_count) + + nla_total_size(addrlen * addrcnt) + + nla_total_size(sizeof(struct inet_diag_meminfo)) + + nla_total_size(sizeof(struct inet_diag_msg)) + + 64; +} + +static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p) +{ + struct sctp_association *assoc = tsp->asoc; + struct sock *sk = tsp->asoc->base.sk; + struct sctp_comm_param *commp = p; + struct sk_buff *in_skb = commp->skb; + const struct inet_diag_req_v2 *req = commp->r; + const struct nlmsghdr *nlh = commp->nlh; + struct net *net = sock_net(in_skb->sk); + struct sk_buff *rep; + int err; + + err = sock_diag_check_cookie(sk, req->id.idiag_cookie); + if (err) + goto out; + + err = -ENOMEM; + rep = nlmsg_new(inet_assoc_attr_size(assoc), GFP_KERNEL); + if (!rep) + goto out; + + lock_sock(sk); + if (sk != assoc->base.sk) { + release_sock(sk); + sk = assoc->base.sk; + lock_sock(sk); + } + err = inet_sctp_diag_fill(sk, assoc, rep, req, + sk_user_ns(NETLINK_CB(in_skb).sk), + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0, nlh, + commp->net_admin); + release_sock(sk); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(rep); + goto out; + } + + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, + MSG_DONTWAIT); + if (err > 0) + err = 0; +out: + return err; +} + +static int sctp_sock_dump(struct sctp_transport *tsp, void *p) +{ + struct sctp_endpoint *ep = tsp->asoc->ep; + struct sctp_comm_param *commp = p; + struct sock *sk = ep->base.sk; + struct sk_buff *skb = commp->skb; + struct netlink_callback *cb = commp->cb; + const struct inet_diag_req_v2 *r = commp->r; + struct sctp_association *assoc; + int err = 0; + + lock_sock(sk); + list_for_each_entry(assoc, &ep->asocs, asocs) { + if (cb->args[4] < cb->args[1]) + goto next; + + if (r->id.idiag_sport != htons(assoc->base.bind_addr.port) && + r->id.idiag_sport) + goto next; + if (r->id.idiag_dport != htons(assoc->peer.port) && + r->id.idiag_dport) + goto next; + + if (!cb->args[3] && + inet_sctp_diag_fill(sk, NULL, skb, r, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, cb->nlh, + commp->net_admin) < 0) { + err = 1; + goto release; + } + cb->args[3] = 1; + + if (inet_sctp_diag_fill(sk, assoc, skb, r, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, cb->nlh, + commp->net_admin) < 0) { + err = 1; + goto release; + } +next: + cb->args[4]++; + } + cb->args[1] = 0; + cb->args[3] = 0; + cb->args[4] = 0; +release: + release_sock(sk); + return err; +} + +static int sctp_sock_filter(struct sctp_transport *tsp, void *p) +{ + struct sctp_endpoint *ep = tsp->asoc->ep; + struct sctp_comm_param *commp = p; + struct sock *sk = ep->base.sk; + const struct inet_diag_req_v2 *r = commp->r; + struct sctp_association *assoc = + list_entry(ep->asocs.next, struct sctp_association, asocs); + + /* find the ep only once through the transports by this condition */ + if (tsp->asoc != assoc) + return 0; + + if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) + return 0; + + return 1; +} + +static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) +{ + struct sctp_comm_param *commp = p; + struct sock *sk = ep->base.sk; + struct sk_buff *skb = commp->skb; + struct netlink_callback *cb = commp->cb; + const struct inet_diag_req_v2 *r = commp->r; + struct net *net = sock_net(skb->sk); + struct inet_sock *inet = inet_sk(sk); + int err = 0; + + if (!net_eq(sock_net(sk), net)) + goto out; + + if (cb->args[4] < cb->args[1]) + goto next; + + if (!(r->idiag_states & TCPF_LISTEN) && !list_empty(&ep->asocs)) + goto next; + + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next; + + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next; + + if (r->id.idiag_dport != inet->inet_dport && + r->id.idiag_dport) + goto next; + + if (inet_sctp_diag_fill(sk, NULL, skb, r, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->nlh, commp->net_admin) < 0) { + err = 2; + goto out; + } +next: + cb->args[4]++; +out: + return err; +} + +/* define the functions for sctp_diag_handler*/ +static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *info) +{ + struct sctp_infox *infox = (struct sctp_infox *)info; + + if (infox->asoc) { + r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc); + r->idiag_wqueue = infox->asoc->sndbuf_used; + } else { + r->idiag_rqueue = sk->sk_ack_backlog; + r->idiag_wqueue = sk->sk_max_ack_backlog; + } + if (infox->sctpinfo) + sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo); +} + +static int sctp_diag_dump_one(struct sk_buff *in_skb, + const struct nlmsghdr *nlh, + const struct inet_diag_req_v2 *req) +{ + struct net *net = sock_net(in_skb->sk); + union sctp_addr laddr, paddr; + struct sctp_comm_param commp = { + .skb = in_skb, + .r = req, + .nlh = nlh, + .net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN), + }; + + if (req->sdiag_family == AF_INET) { + laddr.v4.sin_port = req->id.idiag_sport; + laddr.v4.sin_addr.s_addr = req->id.idiag_src[0]; + laddr.v4.sin_family = AF_INET; + + paddr.v4.sin_port = req->id.idiag_dport; + paddr.v4.sin_addr.s_addr = req->id.idiag_dst[0]; + paddr.v4.sin_family = AF_INET; + } else { + laddr.v6.sin6_port = req->id.idiag_sport; + memcpy(&laddr.v6.sin6_addr, req->id.idiag_src, + sizeof(laddr.v6.sin6_addr)); + laddr.v6.sin6_family = AF_INET6; + + paddr.v6.sin6_port = req->id.idiag_dport; + memcpy(&paddr.v6.sin6_addr, req->id.idiag_dst, + sizeof(paddr.v6.sin6_addr)); + paddr.v6.sin6_family = AF_INET6; + } + + return sctp_transport_lookup_process(sctp_tsp_dump_one, + net, &laddr, &paddr, &commp); +} + +static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, struct nlattr *bc) +{ + u32 idiag_states = r->idiag_states; + struct net *net = sock_net(skb->sk); + struct sctp_comm_param commp = { + .skb = skb, + .cb = cb, + .r = r, + .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), + }; + int pos = cb->args[2]; + + /* eps hashtable dumps + * args: + * 0 : if it will traversal listen sock + * 1 : to record the sock pos of this time's traversal + * 4 : to work as a temporary variable to traversal list + */ + if (cb->args[0] == 0) { + if (!(idiag_states & TCPF_LISTEN)) + goto skip; + if (sctp_for_each_endpoint(sctp_ep_dump, &commp)) + goto done; +skip: + cb->args[0] = 1; + cb->args[1] = 0; + cb->args[4] = 0; + } + + /* asocs by transport hashtable dump + * args: + * 1 : to record the assoc pos of this time's traversal + * 2 : to record the transport pos of this time's traversal + * 3 : to mark if we have dumped the ep info of the current asoc + * 4 : to work as a temporary variable to traversal list + * 5 : to save the sk we get from travelsing the tsp list. + */ + if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE))) + goto done; + + sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump, + net, &pos, &commp); + cb->args[2] = pos; + +done: + cb->args[1] = cb->args[4]; + cb->args[4] = 0; +} + +static const struct inet_diag_handler sctp_diag_handler = { + .dump = sctp_diag_dump, + .dump_one = sctp_diag_dump_one, + .idiag_get_info = sctp_diag_get_info, + .idiag_type = IPPROTO_SCTP, + .idiag_info_size = sizeof(struct sctp_info), +}; + +static int __init sctp_diag_init(void) +{ + return inet_diag_register(&sctp_diag_handler); +} + +static void __exit sctp_diag_exit(void) +{ + inet_diag_unregister(&sctp_diag_handler); +} + +module_init(sctp_diag_init); +module_exit(sctp_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-132); diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c deleted file mode 100644 index a72a7d925d46..000000000000 --- a/net/sctp/sctp_diag.c +++ /dev/null @@ -1,526 +0,0 @@ -#include -#include -#include -#include - -static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, - void *info); - -/* define some functions to make asoc/ep fill look clean */ -static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r, - struct sock *sk, - struct sctp_association *asoc) -{ - union sctp_addr laddr, paddr; - struct dst_entry *dst; - struct timer_list *t3_rtx = &asoc->peer.primary_path->T3_rtx_timer; - - laddr = list_entry(asoc->base.bind_addr.address_list.next, - struct sctp_sockaddr_entry, list)->a; - paddr = asoc->peer.primary_path->ipaddr; - dst = asoc->peer.primary_path->dst; - - r->idiag_family = sk->sk_family; - r->id.idiag_sport = htons(asoc->base.bind_addr.port); - r->id.idiag_dport = htons(asoc->peer.port); - r->id.idiag_if = dst ? dst->dev->ifindex : 0; - sock_diag_save_cookie(sk, r->id.idiag_cookie); - -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { - *(struct in6_addr *)r->id.idiag_src = laddr.v6.sin6_addr; - *(struct in6_addr *)r->id.idiag_dst = paddr.v6.sin6_addr; - } else -#endif - { - memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); - memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); - - r->id.idiag_src[0] = laddr.v4.sin_addr.s_addr; - r->id.idiag_dst[0] = paddr.v4.sin_addr.s_addr; - } - - r->idiag_state = asoc->state; - if (timer_pending(t3_rtx)) { - r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX; - r->idiag_retrans = asoc->rtx_data_chunks; - r->idiag_expires = jiffies_to_msecs(t3_rtx->expires - jiffies); - } else { - r->idiag_timer = 0; - r->idiag_retrans = 0; - r->idiag_expires = 0; - } -} - -static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, - struct list_head *address_list) -{ - struct sctp_sockaddr_entry *laddr; - int addrlen = sizeof(struct sockaddr_storage); - int addrcnt = 0; - struct nlattr *attr; - void *info = NULL; - - list_for_each_entry_rcu(laddr, address_list, list) - addrcnt++; - - attr = nla_reserve(skb, INET_DIAG_LOCALS, addrlen * addrcnt); - if (!attr) - return -EMSGSIZE; - - info = nla_data(attr); - list_for_each_entry_rcu(laddr, address_list, list) { - memcpy(info, &laddr->a, sizeof(laddr->a)); - memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a)); - info += addrlen; - } - - return 0; -} - -static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb, - struct sctp_association *asoc) -{ - int addrlen = sizeof(struct sockaddr_storage); - struct sctp_transport *from; - struct nlattr *attr; - void *info = NULL; - - attr = nla_reserve(skb, INET_DIAG_PEERS, - addrlen * asoc->peer.transport_count); - if (!attr) - return -EMSGSIZE; - - info = nla_data(attr); - list_for_each_entry(from, &asoc->peer.transport_addr_list, - transports) { - memcpy(info, &from->ipaddr, sizeof(from->ipaddr)); - memset(info + sizeof(from->ipaddr), 0, - addrlen - sizeof(from->ipaddr)); - info += addrlen; - } - - return 0; -} - -/* sctp asoc/ep fill*/ -static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, - struct sk_buff *skb, - const struct inet_diag_req_v2 *req, - struct user_namespace *user_ns, - int portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh, - bool net_admin) -{ - struct sctp_endpoint *ep = sctp_sk(sk)->ep; - struct list_head *addr_list; - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - int ext = req->idiag_ext; - struct sctp_infox infox; - void *info = NULL; - - nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), - nlmsg_flags); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - BUG_ON(!sk_fullsock(sk)); - - if (asoc) { - inet_diag_msg_sctpasoc_fill(r, sk, asoc); - } else { - inet_diag_msg_common_fill(r, sk); - r->idiag_state = sk->sk_state; - r->idiag_timer = 0; - r->idiag_retrans = 0; - } - - if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin)) - goto errout; - - if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) { - u32 mem[SK_MEMINFO_VARS]; - int amt; - - if (asoc && asoc->ep->sndbuf_policy) - amt = asoc->sndbuf_used; - else - amt = sk_wmem_alloc_get(sk); - mem[SK_MEMINFO_WMEM_ALLOC] = amt; - if (asoc && asoc->ep->rcvbuf_policy) - amt = atomic_read(&asoc->rmem_alloc); - else - amt = sk_rmem_alloc_get(sk); - mem[SK_MEMINFO_RMEM_ALLOC] = amt; - mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; - mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; - mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; - mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; - mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); - - if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) - goto errout; - } - - if (ext & (1 << (INET_DIAG_INFO - 1))) { - struct nlattr *attr; - - attr = nla_reserve_64bit(skb, INET_DIAG_INFO, - sizeof(struct sctp_info), - INET_DIAG_PAD); - if (!attr) - goto errout; - - info = nla_data(attr); - } - infox.sctpinfo = (struct sctp_info *)info; - infox.asoc = asoc; - sctp_diag_get_info(sk, r, &infox); - - addr_list = asoc ? &asoc->base.bind_addr.address_list - : &ep->base.bind_addr.address_list; - if (inet_diag_msg_sctpladdrs_fill(skb, addr_list)) - goto errout; - - if (asoc && (ext & (1 << (INET_DIAG_CONG - 1)))) - if (nla_put_string(skb, INET_DIAG_CONG, "reno") < 0) - goto errout; - - if (asoc && inet_diag_msg_sctpaddrs_fill(skb, asoc)) - goto errout; - - nlmsg_end(skb, nlh); - return 0; - -errout: - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; -} - -/* callback and param */ -struct sctp_comm_param { - struct sk_buff *skb; - struct netlink_callback *cb; - const struct inet_diag_req_v2 *r; - const struct nlmsghdr *nlh; - bool net_admin; -}; - -static size_t inet_assoc_attr_size(struct sctp_association *asoc) -{ - int addrlen = sizeof(struct sockaddr_storage); - int addrcnt = 0; - struct sctp_sockaddr_entry *laddr; - - list_for_each_entry_rcu(laddr, &asoc->base.bind_addr.address_list, - list) - addrcnt++; - - return nla_total_size(sizeof(struct sctp_info)) - + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ - + nla_total_size(1) /* INET_DIAG_TOS */ - + nla_total_size(1) /* INET_DIAG_TCLASS */ - + nla_total_size(4) /* INET_DIAG_MARK */ - + nla_total_size(addrlen * asoc->peer.transport_count) - + nla_total_size(addrlen * addrcnt) - + nla_total_size(sizeof(struct inet_diag_meminfo)) - + nla_total_size(sizeof(struct inet_diag_msg)) - + 64; -} - -static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p) -{ - struct sctp_association *assoc = tsp->asoc; - struct sock *sk = tsp->asoc->base.sk; - struct sctp_comm_param *commp = p; - struct sk_buff *in_skb = commp->skb; - const struct inet_diag_req_v2 *req = commp->r; - const struct nlmsghdr *nlh = commp->nlh; - struct net *net = sock_net(in_skb->sk); - struct sk_buff *rep; - int err; - - err = sock_diag_check_cookie(sk, req->id.idiag_cookie); - if (err) - goto out; - - err = -ENOMEM; - rep = nlmsg_new(inet_assoc_attr_size(assoc), GFP_KERNEL); - if (!rep) - goto out; - - lock_sock(sk); - if (sk != assoc->base.sk) { - release_sock(sk); - sk = assoc->base.sk; - lock_sock(sk); - } - err = inet_sctp_diag_fill(sk, assoc, rep, req, - sk_user_ns(NETLINK_CB(in_skb).sk), - NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, 0, nlh, - commp->net_admin); - release_sock(sk); - if (err < 0) { - WARN_ON(err == -EMSGSIZE); - kfree_skb(rep); - goto out; - } - - err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, - MSG_DONTWAIT); - if (err > 0) - err = 0; -out: - return err; -} - -static int sctp_sock_dump(struct sctp_transport *tsp, void *p) -{ - struct sctp_endpoint *ep = tsp->asoc->ep; - struct sctp_comm_param *commp = p; - struct sock *sk = ep->base.sk; - struct sk_buff *skb = commp->skb; - struct netlink_callback *cb = commp->cb; - const struct inet_diag_req_v2 *r = commp->r; - struct sctp_association *assoc; - int err = 0; - - lock_sock(sk); - list_for_each_entry(assoc, &ep->asocs, asocs) { - if (cb->args[4] < cb->args[1]) - goto next; - - if (r->id.idiag_sport != htons(assoc->base.bind_addr.port) && - r->id.idiag_sport) - goto next; - if (r->id.idiag_dport != htons(assoc->peer.port) && - r->id.idiag_dport) - goto next; - - if (!cb->args[3] && - inet_sctp_diag_fill(sk, NULL, skb, r, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->nlh, - commp->net_admin) < 0) { - err = 1; - goto release; - } - cb->args[3] = 1; - - if (inet_sctp_diag_fill(sk, assoc, skb, r, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, 0, cb->nlh, - commp->net_admin) < 0) { - err = 1; - goto release; - } -next: - cb->args[4]++; - } - cb->args[1] = 0; - cb->args[3] = 0; - cb->args[4] = 0; -release: - release_sock(sk); - return err; -} - -static int sctp_sock_filter(struct sctp_transport *tsp, void *p) -{ - struct sctp_endpoint *ep = tsp->asoc->ep; - struct sctp_comm_param *commp = p; - struct sock *sk = ep->base.sk; - const struct inet_diag_req_v2 *r = commp->r; - struct sctp_association *assoc = - list_entry(ep->asocs.next, struct sctp_association, asocs); - - /* find the ep only once through the transports by this condition */ - if (tsp->asoc != assoc) - return 0; - - if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) - return 0; - - return 1; -} - -static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) -{ - struct sctp_comm_param *commp = p; - struct sock *sk = ep->base.sk; - struct sk_buff *skb = commp->skb; - struct netlink_callback *cb = commp->cb; - const struct inet_diag_req_v2 *r = commp->r; - struct net *net = sock_net(skb->sk); - struct inet_sock *inet = inet_sk(sk); - int err = 0; - - if (!net_eq(sock_net(sk), net)) - goto out; - - if (cb->args[4] < cb->args[1]) - goto next; - - if (!(r->idiag_states & TCPF_LISTEN) && !list_empty(&ep->asocs)) - goto next; - - if (r->sdiag_family != AF_UNSPEC && - sk->sk_family != r->sdiag_family) - goto next; - - if (r->id.idiag_sport != inet->inet_sport && - r->id.idiag_sport) - goto next; - - if (r->id.idiag_dport != inet->inet_dport && - r->id.idiag_dport) - goto next; - - if (inet_sctp_diag_fill(sk, NULL, skb, r, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - cb->nlh, commp->net_admin) < 0) { - err = 2; - goto out; - } -next: - cb->args[4]++; -out: - return err; -} - -/* define the functions for sctp_diag_handler*/ -static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, - void *info) -{ - struct sctp_infox *infox = (struct sctp_infox *)info; - - if (infox->asoc) { - r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc); - r->idiag_wqueue = infox->asoc->sndbuf_used; - } else { - r->idiag_rqueue = sk->sk_ack_backlog; - r->idiag_wqueue = sk->sk_max_ack_backlog; - } - if (infox->sctpinfo) - sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo); -} - -static int sctp_diag_dump_one(struct sk_buff *in_skb, - const struct nlmsghdr *nlh, - const struct inet_diag_req_v2 *req) -{ - struct net *net = sock_net(in_skb->sk); - union sctp_addr laddr, paddr; - struct sctp_comm_param commp = { - .skb = in_skb, - .r = req, - .nlh = nlh, - .net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN), - }; - - if (req->sdiag_family == AF_INET) { - laddr.v4.sin_port = req->id.idiag_sport; - laddr.v4.sin_addr.s_addr = req->id.idiag_src[0]; - laddr.v4.sin_family = AF_INET; - - paddr.v4.sin_port = req->id.idiag_dport; - paddr.v4.sin_addr.s_addr = req->id.idiag_dst[0]; - paddr.v4.sin_family = AF_INET; - } else { - laddr.v6.sin6_port = req->id.idiag_sport; - memcpy(&laddr.v6.sin6_addr, req->id.idiag_src, - sizeof(laddr.v6.sin6_addr)); - laddr.v6.sin6_family = AF_INET6; - - paddr.v6.sin6_port = req->id.idiag_dport; - memcpy(&paddr.v6.sin6_addr, req->id.idiag_dst, - sizeof(paddr.v6.sin6_addr)); - paddr.v6.sin6_family = AF_INET6; - } - - return sctp_transport_lookup_process(sctp_tsp_dump_one, - net, &laddr, &paddr, &commp); -} - -static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, struct nlattr *bc) -{ - u32 idiag_states = r->idiag_states; - struct net *net = sock_net(skb->sk); - struct sctp_comm_param commp = { - .skb = skb, - .cb = cb, - .r = r, - .net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN), - }; - int pos = cb->args[2]; - - /* eps hashtable dumps - * args: - * 0 : if it will traversal listen sock - * 1 : to record the sock pos of this time's traversal - * 4 : to work as a temporary variable to traversal list - */ - if (cb->args[0] == 0) { - if (!(idiag_states & TCPF_LISTEN)) - goto skip; - if (sctp_for_each_endpoint(sctp_ep_dump, &commp)) - goto done; -skip: - cb->args[0] = 1; - cb->args[1] = 0; - cb->args[4] = 0; - } - - /* asocs by transport hashtable dump - * args: - * 1 : to record the assoc pos of this time's traversal - * 2 : to record the transport pos of this time's traversal - * 3 : to mark if we have dumped the ep info of the current asoc - * 4 : to work as a temporary variable to traversal list - * 5 : to save the sk we get from travelsing the tsp list. - */ - if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE))) - goto done; - - sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump, - net, &pos, &commp); - cb->args[2] = pos; - -done: - cb->args[1] = cb->args[4]; - cb->args[4] = 0; -} - -static const struct inet_diag_handler sctp_diag_handler = { - .dump = sctp_diag_dump, - .dump_one = sctp_diag_dump_one, - .idiag_get_info = sctp_diag_get_info, - .idiag_type = IPPROTO_SCTP, - .idiag_info_size = sizeof(struct sctp_info), -}; - -static int __init sctp_diag_init(void) -{ - return inet_diag_register(&sctp_diag_handler); -} - -static void __exit sctp_diag_exit(void) -{ - inet_diag_unregister(&sctp_diag_handler); -} - -module_init(sctp_diag_init); -module_exit(sctp_diag_exit); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-132); -- cgit v1.2.3-70-g09d2 From 6f68dc993afce0b580b9b1a9edadcd4b178cbb06 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 13 Feb 2018 19:33:21 +0800 Subject: sctp: add file comments in diag.c This patch is to add the missing file comments for sctp diag file. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/diag.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'net') diff --git a/net/sctp/diag.c b/net/sctp/diag.c index a72a7d925d46..078f01a8d582 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -1,3 +1,34 @@ +/* SCTP kernel implementation + * (C) Copyright Red Hat Inc. 2017 + * + * This file is part of the SCTP kernel implementation + * + * These functions implement sctp diag support. + * + * This SCTP implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This SCTP implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, see + * . + * + * Please send any bug reports or fixes you make to the + * email addresched(es): + * lksctp developers + * + * Written or modified by: + * Xin Long + */ + #include #include #include -- cgit v1.2.3-70-g09d2 From 0d876f2c6de5b1b00c2fd38bb4f4692e720207b1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 13 Feb 2018 08:11:34 -0800 Subject: net/ipv4: Simplify fib_select_path If flow oif is set and it is not an l3mdev, then fib_select_path can jump to the source address check. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c586597da20d..0b8c0a719f3e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1760,13 +1760,11 @@ void fib_select_multipath(struct fib_result *res, int hash) void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb) { - bool oif_check; - - oif_check = (fl4->flowi4_oif == 0 || - fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF); + if (fl4->flowi4_oif && !(fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) + goto check_saddr; #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi->fib_nhs > 1 && oif_check) { + if (res->fi->fib_nhs > 1) { int h = fib_multipath_hash(res->fi, fl4, skb); fib_select_multipath(res, h); @@ -1775,9 +1773,10 @@ void fib_select_path(struct net *net, struct fib_result *res, #endif if (!res->prefixlen && res->table->tb_num_default > 1 && - res->type == RTN_UNICAST && oif_check) + res->type == RTN_UNICAST) fib_select_default(fl4, res); +check_saddr: if (!fl4->saddr) fl4->saddr = FIB_RES_PREFSRC(net, *res); } -- cgit v1.2.3-70-g09d2 From 8c2ceabe99e04005cadba739856eed6953a8a3af Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 13 Feb 2018 08:12:12 -0800 Subject: net/ipv4: Unexport fib_multipath_hash and fib_select_path Do not export fib_multipath_hash or fib_select_path; both are only used by core ipv4 code. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 1 - net/ipv4/route.c | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 0b8c0a719f3e..2b883b58354f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1780,4 +1780,3 @@ check_saddr: if (!fl4->saddr) fl4->saddr = FIB_RES_PREFSRC(net, *res); } -EXPORT_SYMBOL_GPL(fib_select_path); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9376ed69ffeb..b0ef4cc3e875 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1847,7 +1847,6 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, return mhash >> 1; } -EXPORT_SYMBOL_GPL(fib_multipath_hash); #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static int ip_mkroute_input(struct sk_buff *skb, -- cgit v1.2.3-70-g09d2 From ee6f4a5ad806a13b6b26b6699bb5562c08148280 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 11 Feb 2018 22:50:46 -0500 Subject: ieee802154: 6lowpan: set IFF_NO_QUEUE This patch sets the IFF_NO_QUEUE for IEEE 802.15.4 6lowpan interfaces. As commit 24dcbf662205 ("6lowpan: Don't set IFF_NO_QUEUE") removes it for "reasons" from the bluetooth 6lowpan subsystem. In IEEE 802.15.4 the lower interface deals with one qdisc for the real hardware, 6LoWPAN does the protocol adaption only and no second queuing on top. Signed-off-by: Alexander Aring Signed-off-by: Stefan Schmidt --- net/ieee802154/6lowpan/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 974765b7d92a..e4f305320519 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -104,6 +104,7 @@ static void lowpan_setup(struct net_device *ldev) /* We need an ipv6hdr as minimum len when calling xmit */ ldev->hard_header_len = sizeof(struct ipv6hdr); ldev->flags = IFF_BROADCAST | IFF_MULTICAST; + ldev->priv_flags |= IFF_NO_QUEUE; ldev->netdev_ops = &lowpan_netdev_ops; ldev->header_ops = &lowpan_header_ops; -- cgit v1.2.3-70-g09d2 From 330c7272c40e965b8ab510d1022acd6e6a32e9c8 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 13 Feb 2018 08:52:00 -0800 Subject: net: Make dn_ptr depend on CONFIG_DECNET Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5eef6c8e2741..d2ef35e00626 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1800,7 +1800,9 @@ struct net_device { #endif void *atalk_ptr; struct in_device __rcu *ip_ptr; +#if IS_ENABLED(CONFIG_DECNET) struct dn_dev __rcu *dn_ptr; +#endif struct inet6_dev __rcu *ip6_ptr; void *ax25_ptr; struct wireless_dev *ieee80211_ptr; diff --git a/net/core/dev.c b/net/core/dev.c index df5241c8eda1..4bd4ad7ffda4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8134,8 +8134,9 @@ void netdev_run_todo(void) BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); +#if IS_ENABLED(CONFIG_DECNET) WARN_ON(dev->dn_ptr); - +#endif if (dev->priv_destructor) dev->priv_destructor(dev); if (dev->needs_free_netdev) -- cgit v1.2.3-70-g09d2 From e92bad5034922776bd5e64e3db739d7607eb7e29 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 13 Feb 2018 08:52:03 -0800 Subject: net: Remove atalk header from socket.c Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/socket.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index fac8246a8ae8..d83e804d5e65 100644 --- a/net/socket.c +++ b/net/socket.c @@ -104,7 +104,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3-70-g09d2 From e0f9759f530bf789e984961dce79f525b151ecf3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Feb 2018 06:14:12 -0800 Subject: tcp: try to keep packet if SYN_RCV race is lost MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 배석진 reported that in some situations, packets for a given 5-tuple end up being processed by different CPUS. This involves RPS, and fragmentation. 배석진 is seeing packet drops when a SYN_RECV request socket is moved into ESTABLISH state. Other states are protected by socket lock. This is caused by a CPU losing the race, and simply not caring enough. Since this seems to occur frequently, we can do better and perform a second lookup. Note that all needed memory barriers are already in the existing code, thanks to the spin_lock()/spin_unlock() pair in inet_ehash_insert() and reqsk_put(). The second lookup must find the new socket, unless it has already been accepted and closed by another cpu. Note that the fragmentation could be avoided in the first place by use of a correct TCP MSS option in the SYN{ACK} packet, but this does not mean we can not be more robust. Many thanks to 배석진 for a very detailed analysis. Reported-by: 배석진 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 3 ++- net/ipv4/tcp_input.c | 4 +++- net/ipv4/tcp_ipv4.c | 13 ++++++++++++- net/ipv4/tcp_minisocks.c | 3 ++- net/ipv6/tcp_ipv6.c | 13 ++++++++++++- 5 files changed, 31 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index e3fc667f9ac2..92b06c6e7732 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -374,7 +374,8 @@ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, bool fastopen); + struct request_sock *req, bool fastopen, + bool *lost_race); int tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 575d3c1fb6e8..a6b48f6253e3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5870,10 +5870,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->rx_opt.saw_tstamp = 0; req = tp->fastopen_rsk; if (req) { + bool req_stolen; + WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - if (!tcp_check_req(sk, skb, req, true)) + if (!tcp_check_req(sk, skb, req, true, &req_stolen)) goto discard; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ac16795486ea..f3e52bc98980 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1672,6 +1672,7 @@ process: if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); + bool req_stolen = false; struct sock *nsk; sk = req->rsk_listener; @@ -1694,10 +1695,20 @@ process: th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); - nsk = tcp_check_req(sk, skb, req, false); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen); } if (!nsk) { reqsk_put(req); + if (req_stolen) { + /* Another cpu got exclusive access to req + * and created a full blown socket. + * Try to feed this packet to this socket + * instead of discarding it. + */ + tcp_v4_restore_cb(skb); + sock_put(sk); + goto lookup; + } goto discard_and_relse; } if (nsk == sk) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a8384b0c11f8..e7e36433cdb5 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -578,7 +578,7 @@ EXPORT_SYMBOL(tcp_create_openreq_child); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - bool fastopen) + bool fastopen, bool *req_stolen) { struct tcp_options_received tmp_opt; struct sock *child; @@ -785,6 +785,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); + *req_stolen = !own_req; return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 412139f4eccd..883df0ad5bfe 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1451,6 +1451,7 @@ process: if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); + bool req_stolen = false; struct sock *nsk; sk = req->rsk_listener; @@ -1470,10 +1471,20 @@ process: th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); - nsk = tcp_check_req(sk, skb, req, false); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen); } if (!nsk) { reqsk_put(req); + if (req_stolen) { + /* Another cpu got exclusive access to req + * and created a full blown socket. + * Try to feed this packet to this socket + * instead of discarding it. + */ + tcp_v6_restore_cb(skb); + sock_put(sk); + goto lookup; + } goto discard_and_relse; } if (nsk == sk) { -- cgit v1.2.3-70-g09d2 From 0336369d3a4d65c9332476b618ff3bb9b41045e1 Mon Sep 17 00:00:00 2001 From: Brandon Streiff Date: Wed, 14 Feb 2018 01:07:48 +0100 Subject: net: dsa: forward hardware timestamping ioctls to switch driver This patch adds support to the dsa slave network device so that switch drivers can implement the SIOC[GS]HWTSTAMP ioctls and the ethtool timestamp-info interface. Signed-off-by: Brandon Streiff Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 15 +++++++++++++++ net/dsa/slave.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6cb602dd970c..4c0df83dddaf 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -367,6 +368,12 @@ struct dsa_switch_ops { int (*set_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); + /* + * ethtool timestamp info + */ + int (*get_ts_info)(struct dsa_switch *ds, int port, + struct ethtool_ts_info *ts); + /* * Suspend and resume */ @@ -469,6 +476,14 @@ struct dsa_switch_ops { int port, struct net_device *br); void (*crosschip_bridge_leave)(struct dsa_switch *ds, int sw_index, int port, struct net_device *br); + + /* + * PTP functionality + */ + int (*port_hwtstamp_get)(struct dsa_switch *ds, int port, + struct ifreq *ifr); + int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, + struct ifreq *ifr); }; struct dsa_switch_driver { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index f52307296de4..935d93f0d36c 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -255,6 +255,22 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->dp->ds; + int port = p->dp->index; + + /* Pass through to switch driver if it supports timestamping */ + switch (cmd) { + case SIOCGHWTSTAMP: + if (ds->ops->port_hwtstamp_get) + return ds->ops->port_hwtstamp_get(ds, port, ifr); + break; + case SIOCSHWTSTAMP: + if (ds->ops->port_hwtstamp_set) + return ds->ops->port_hwtstamp_set(ds, port, ifr); + break; + } + if (!dev->phydev) return -ENODEV; @@ -918,6 +934,18 @@ static int dsa_slave_set_rxnfc(struct net_device *dev, return ds->ops->set_rxnfc(ds, dp->index, nfc); } +static int dsa_slave_get_ts_info(struct net_device *dev, + struct ethtool_ts_info *ts) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->dp->ds; + + if (!ds->ops->get_ts_info) + return -EOPNOTSUPP; + + return ds->ops->get_ts_info(ds, p->dp->index, ts); +} + static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_drvinfo = dsa_slave_get_drvinfo, .get_regs_len = dsa_slave_get_regs_len, @@ -938,6 +966,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .set_link_ksettings = phy_ethtool_set_link_ksettings, .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, + .get_ts_info = dsa_slave_get_ts_info, }; /* legacy way, bypassing the bridge *****************************************/ -- cgit v1.2.3-70-g09d2 From 90af1059c52c0031f3bfd8279c9ede153ca83275 Mon Sep 17 00:00:00 2001 From: Brandon Streiff Date: Wed, 14 Feb 2018 01:07:49 +0100 Subject: net: dsa: forward timestamping callbacks to switch drivers Forward the rx/tx timestamp machinery from the dsa infrastructure to the switch driver. On the rx side, defer delivery of skbs until we have an rx timestamp. This mimicks the behavior of skb_defer_rx_timestamp. On the tx side, identify PTP packets, clone them, and pass them to the underlying switch driver before we transmit. This mimicks the behavior of skb_tx_timestamp. Adjusted txstamp API to keep the allocation and freeing of the clone in the same central function by Richard Cochran Signed-off-by: Brandon Streiff Signed-off-by: Richard Cochran Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ net/dsa/dsa.c | 36 ++++++++++++++++++++++++++++++++++++ net/dsa/slave.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 4c0df83dddaf..0ad17b63684d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -102,6 +102,7 @@ struct dsa_platform_data { }; struct packet_type; +struct dsa_switch; struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); @@ -484,6 +485,10 @@ struct dsa_switch_ops { struct ifreq *ifr); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, struct ifreq *ifr); + bool (*port_txtstamp)(struct dsa_switch *ds, int port, + struct sk_buff *clone, unsigned int type); + bool (*port_rxtstamp)(struct dsa_switch *ds, int port, + struct sk_buff *skb, unsigned int type); }; struct dsa_switch_driver { diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 6a9d0f50fbee..e63c554e0623 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -122,6 +123,38 @@ struct net_device *dsa_dev_to_net_device(struct device *dev) } EXPORT_SYMBOL_GPL(dsa_dev_to_net_device); +/* Determine if we should defer delivery of skb until we have a rx timestamp. + * + * Called from dsa_switch_rcv. For now, this will only work if tagging is + * enabled on the switch. Normally the MAC driver would retrieve the hardware + * timestamp when it reads the packet out of the hardware. However in a DSA + * switch, the DSA driver owning the interface to which the packet is + * delivered is never notified unless we do so here. + */ +static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p, + struct sk_buff *skb) +{ + struct dsa_switch *ds = p->dp->ds; + unsigned int type; + + if (skb_headroom(skb) < ETH_HLEN) + return false; + + __skb_push(skb, ETH_HLEN); + + type = ptp_classify_raw(skb); + + __skb_pull(skb, ETH_HLEN); + + if (type == PTP_CLASS_NONE) + return false; + + if (likely(ds->ops->port_rxtstamp)) + return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type); + + return false; +} + static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *unused) { @@ -157,6 +190,9 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, s->rx_bytes += skb->len; u64_stats_update_end(&s->syncp); + if (dsa_skb_defer_rx_timestamp(p, skb)) + return 0; + netif_receive_skb(skb); return 0; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 935d93f0d36c..3376dad6dcfd 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "dsa_priv.h" @@ -401,6 +402,30 @@ static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev, return NETDEV_TX_OK; } +static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, + struct sk_buff *skb) +{ + struct dsa_switch *ds = p->dp->ds; + struct sk_buff *clone; + unsigned int type; + + type = ptp_classify_raw(skb); + if (type == PTP_CLASS_NONE) + return; + + if (!ds->ops->port_txtstamp) + return; + + clone = skb_clone_sk(skb); + if (!clone) + return; + + if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type)) + return; + + kfree_skb(clone); +} + static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -413,6 +438,11 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) s->tx_bytes += skb->len; u64_stats_update_end(&s->syncp); + /* Identify PTP protocol packets, clone them, and pass them to the + * switch driver + */ + dsa_skb_tx_timestamp(p, skb); + /* Transmit function may have to reallocate the original SKB, * in which case it must have freed it. Only free it here on error. */ -- cgit v1.2.3-70-g09d2 From 9942895b5ee4b0db53f32fbcb4a51360607aac1b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 13 Feb 2018 20:32:04 -0800 Subject: net: Move ipv4 set_lwt_redirect helper to lwtunnel IPv4 uses set_lwt_redirect to set the lwtunnel redirect functions as needed. Move it to lwtunnel.h as lwtunnel_set_redirect and change IPv6 to also use it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 15 +++++++++++++++ net/ipv4/route.c | 17 ++--------------- net/ipv6/route.c | 9 +-------- 3 files changed, 18 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index d747ef975cd8..33fd9ba7e0e5 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -127,6 +127,17 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); int lwtunnel_xmit(struct sk_buff *skb); +static inline void lwtunnel_set_redirect(struct dst_entry *dst) +{ + if (lwtunnel_output_redirect(dst->lwtstate)) { + dst->lwtstate->orig_output = dst->output; + dst->output = lwtunnel_output; + } + if (lwtunnel_input_redirect(dst->lwtstate)) { + dst->lwtstate->orig_input = dst->input; + dst->input = lwtunnel_input; + } +} #else static inline void lwtstate_free(struct lwtunnel_state *lws) @@ -158,6 +169,10 @@ static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) return false; } +static inline void lwtunnel_set_redirect(struct dst_entry *dst) +{ +} + static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, unsigned int mtu) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b0ef4cc3e875..6ce623e3e2ab 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1645,19 +1645,6 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) spin_unlock_bh(&fnhe_lock); } -static void set_lwt_redirect(struct rtable *rth) -{ - if (lwtunnel_output_redirect(rth->dst.lwtstate)) { - rth->dst.lwtstate->orig_output = rth->dst.output; - rth->dst.output = lwtunnel_output; - } - - if (lwtunnel_input_redirect(rth->dst.lwtstate)) { - rth->dst.lwtstate->orig_input = rth->dst.input; - rth->dst.input = lwtunnel_input; - } -} - /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, @@ -1748,7 +1735,7 @@ rt_cache: rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, do_cache); - set_lwt_redirect(rth); + lwtunnel_set_redirect(&rth->dst); skb_dst_set(skb, &rth->dst); out: err = 0; @@ -2267,7 +2254,7 @@ add: } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); - set_lwt_redirect(rth); + lwtunnel_set_redirect(&rth->dst); return rth; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9dcfadddd800..f0baae26db8f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2671,14 +2671,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (err) goto out; rt->dst.lwtstate = lwtstate_get(lwtstate); - if (lwtunnel_output_redirect(rt->dst.lwtstate)) { - rt->dst.lwtstate->orig_output = rt->dst.output; - rt->dst.output = lwtunnel_output; - } - if (lwtunnel_input_redirect(rt->dst.lwtstate)) { - rt->dst.lwtstate->orig_input = rt->dst.input; - rt->dst.input = lwtunnel_input; - } + lwtunnel_set_redirect(&rt->dst); } ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); -- cgit v1.2.3-70-g09d2 From 37c64cf63ba1f9c071b37a2129ae9860fd423d6c Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 14 Feb 2018 13:34:39 +0100 Subject: tipc: apply bearer link tolerance on running links Currently, the default link tolerance set in struct tipc_bearer only has effect on links going up after that moment. I.e., a user has to reset all the node's links across that bearer to have the new value applied. This is too limiting and disturbing on a running cluster to be useful. We now change this so that also already existing links are updated dynamically, without any need for a reset, when the bearer value is changed. We leverage the already existing per-link functionality for this to achieve the wanted effect. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bearer.c | 8 +++++--- net/tipc/link.c | 3 ++- net/tipc/node.c | 24 ++++++++++++++++++++++++ net/tipc/node.h | 1 + 4 files changed, 32 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index c8001471da6c..83d284feab1a 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -946,11 +946,11 @@ int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) { - int err; - char *name; struct tipc_bearer *b; struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; struct net *net = sock_net(skb->sk); + char *name; + int err; if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; @@ -982,8 +982,10 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) return err; } - if (props[TIPC_NLA_PROP_TOL]) + if (props[TIPC_NLA_PROP_TOL]) { b->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]); + tipc_node_apply_tolerance(net, b); + } if (props[TIPC_NLA_PROP_PRIO]) b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) diff --git a/net/tipc/link.c b/net/tipc/link.c index 2d6b2aed30e0..3c230466804d 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2126,7 +2126,8 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, struct sk_buff_head *xmitq) { l->tolerance = tol; - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq); + if (link_is_up(l)) + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq); } void tipc_link_set_prio(struct tipc_link *l, u32 prio, diff --git a/net/tipc/node.c b/net/tipc/node.c index 9036d8756e73..389193d7cf67 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1618,6 +1618,30 @@ discard: kfree_skb(skb); } +void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b) +{ + struct tipc_net *tn = tipc_net(net); + int bearer_id = b->identity; + struct sk_buff_head xmitq; + struct tipc_link_entry *e; + struct tipc_node *n; + + __skb_queue_head_init(&xmitq); + + rcu_read_lock(); + + list_for_each_entry_rcu(n, &tn->node_list, list) { + tipc_node_write_lock(n); + e = &n->links[bearer_id]; + if (e->link) + tipc_link_set_tolerance(e->link, b->tolerance, &xmitq); + tipc_node_write_unlock(n); + tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr); + } + + rcu_read_unlock(); +} + int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); diff --git a/net/tipc/node.h b/net/tipc/node.h index acd58d23a70e..4ce5e3a185c0 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -65,6 +65,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr); void tipc_node_delete_links(struct net *net, int bearer_id); +void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b); int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node, char *linkname, size_t len); int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, -- cgit v1.2.3-70-g09d2 From dff8baa261174de689a44572d0ea182d7aa70598 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 14 Feb 2018 09:22:42 -0800 Subject: kcm: Call strp_stop before strp_done in kcm_attach In kcm_attach strp_done is called when sk_user_data is already set to fail the attach. strp_done needs the strp to be stopped and warns if it isn't. Call strp_stop in this case to eliminate the warning message. Reported-by: syzbot+88dfb55e4c8b770d86e3@syzkaller.appspotmail.com Fixes: e5571240236c5652f ("kcm: Check if sk_user_data already set in kcm_attach" Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/kcm/kcmsock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index f297d53a11aa..435594648dac 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1417,6 +1417,7 @@ static int kcm_attach(struct socket *sock, struct socket *csock, */ if (csk->sk_user_data) { write_unlock_bh(&csk->sk_callback_lock); + strp_stop(&psock->strp); strp_done(&psock->strp); kmem_cache_free(kcm_psockp, psock); return -EALREADY; -- cgit v1.2.3-70-g09d2 From d8d211a2a0c37755a8660dc69f97b7c70bf210b1 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 14 Feb 2018 16:39:56 +0300 Subject: net: Make extern and export get_net_ns() This function will be used to obtain net of tun device. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/socket.h | 2 ++ net/socket.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/socket.h b/include/linux/socket.h index 9286a5a8c60c..1ce1f768a58c 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -353,4 +353,6 @@ extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen unsigned int flags, struct timespec *timeout); extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags); + +extern struct ns_common *get_net_ns(struct ns_common *ns); #endif /* _LINUX_SOCKET_H */ diff --git a/net/socket.c b/net/socket.c index d83e804d5e65..ab58e57c09ca 100644 --- a/net/socket.c +++ b/net/socket.c @@ -990,10 +990,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, * what to do with it - that's up to the protocol still. */ -static struct ns_common *get_net_ns(struct ns_common *ns) +struct ns_common *get_net_ns(struct ns_common *ns) { return &get_net(container_of(ns, struct net, ns))->ns; } +EXPORT_SYMBOL_GPL(get_net_ns); static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) { -- cgit v1.2.3-70-g09d2 From 68e813aa43071377b698c662bc0214f2a833bcbb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 14 Feb 2018 14:24:28 -0800 Subject: net/ipv4: Remove fib table id from rtable Remove rt_table_id from rtable. It was added for getroute to return the table id that was hit in the lookup. With the changes for fibmatch the table id can be extracted from the fib_info returned in the fib_result so it no longer needs to be in rtable directly. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 1 - include/net/route.h | 2 -- net/ipv4/route.c | 9 +-------- net/ipv4/xfrm4_policy.c | 1 - 4 files changed, 1 insertion(+), 12 deletions(-) (limited to 'net') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 139c61c8244a..239c78c53e58 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -736,7 +736,6 @@ static int vrf_rtable_create(struct net_device *dev) return -ENOMEM; rth->dst.output = vrf_output; - rth->rt_table_id = vrf->tb_id; rcu_assign_pointer(vrf->rth, rth); diff --git a/include/net/route.h b/include/net/route.h index 1eb9ce470e25..158833ea7988 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -65,8 +65,6 @@ struct rtable { /* Miscellaneous cached information */ u32 rt_pmtu; - u32 rt_table_id; - struct list_head rt_uncached; struct uncached_list *rt_uncached_list; }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6ce623e3e2ab..5ca7415cd48c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1509,7 +1509,6 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_pmtu = 0; rt->rt_gateway = 0; rt->rt_uses_gateway = 0; - rt->rt_table_id = 0; INIT_LIST_HEAD(&rt->rt_uncached); rt->dst.output = ip_output; @@ -1727,8 +1726,6 @@ rt_cache: } rth->rt_is_input = 1; - if (res->table) - rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; @@ -2001,8 +1998,6 @@ local_input: rth->dst.tclassid = itag; #endif rth->rt_is_input = 1; - if (res->table) - rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(in_slow_tot); if (res->type == RTN_UNREACHABLE) { @@ -2231,8 +2226,6 @@ add: return ERR_PTR(-ENOBUFS); rth->rt_iif = orig_oif; - if (res->table) - rth->rt_table_id = res->table->tb_id; RT_CACHE_STAT_INC(out_slow_tot); @@ -2762,7 +2755,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, rt->rt_flags |= RTCF_NOTIFY; if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) - table_id = rt->rt_table_id; + table_id = res.table ? res.table->tb_id : 0; if (rtm->rtm_flags & RTM_F_FIB_MATCH) { if (!res.fi) { diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 753f526cf9db..796ac4115485 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -100,7 +100,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_gateway = rt->rt_gateway; xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; - xdst->u.rt.rt_table_id = rt->rt_table_id; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); return 0; -- cgit v1.2.3-70-g09d2 From 27469b7352b5197cffa0e3dadb5f1127f055da27 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Feb 2018 10:40:42 +0100 Subject: tipc: remove redundant code in topology server The socket handling in the topology server is unnecessarily generic. It is prepared to handle both SOCK_RDM, SOCK_DGRAM and SOCK_STREAM type sockets, as well as the only socket type which is really used, SOCK_SEQPACKET. We now remove this redundant code to make the code more readable. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/server.c | 36 +++++++----------------------------- net/tipc/server.h | 4 +--- net/tipc/subscr.c | 4 +--- 3 files changed, 9 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index df0c563c90cd..04a6dd99dd65 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -82,7 +82,6 @@ struct tipc_conn { struct outqueue_entry { struct list_head list; struct kvec iov; - struct sockaddr_tipc dest; }; static void tipc_recv_work(struct work_struct *work); @@ -93,7 +92,6 @@ static void tipc_conn_kref_release(struct kref *kref) { struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); struct tipc_server *s = con->server; - struct sockaddr_tipc *saddr = s->saddr; struct socket *sock = con->sock; struct sock *sk; @@ -103,8 +101,6 @@ static void tipc_conn_kref_release(struct kref *kref) __module_get(sock->ops->owner); __module_get(sk->sk_prot_creator->owner); } - saddr->scope = -TIPC_NODE_SCOPE; - kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr)); sock_release(sock); con->sock = NULL; } @@ -325,36 +321,24 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con) { struct tipc_server *s = con->server; struct socket *sock = NULL; + int imp = TIPC_CRITICAL_IMPORTANCE; int ret; ret = sock_create_kern(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock); if (ret < 0) return NULL; ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE, - (char *)&s->imp, sizeof(s->imp)); + (char *)&imp, sizeof(imp)); if (ret < 0) goto create_err; ret = kernel_bind(sock, (struct sockaddr *)s->saddr, sizeof(*s->saddr)); if (ret < 0) goto create_err; - switch (s->type) { - case SOCK_STREAM: - case SOCK_SEQPACKET: - con->rx_action = tipc_accept_from_sock; - - ret = kernel_listen(sock, 0); - if (ret < 0) - goto create_err; - break; - case SOCK_DGRAM: - case SOCK_RDM: - con->rx_action = tipc_receive_from_sock; - break; - default: - pr_err("Unknown socket type %d\n", s->type); + con->rx_action = tipc_accept_from_sock; + ret = kernel_listen(sock, 0); + if (ret < 0) goto create_err; - } /* As server's listening socket owner and creator is the same module, * we have to decrease TIPC module reference count to guarantee that @@ -444,7 +428,7 @@ static void tipc_clean_outqueues(struct tipc_conn *con) } int tipc_conn_sendmsg(struct tipc_server *s, int conid, - struct sockaddr_tipc *addr, void *data, size_t len) + void *data, size_t len) { struct outqueue_entry *e; struct tipc_conn *con; @@ -464,9 +448,6 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid, return -ENOMEM; } - if (addr) - memcpy(&e->dest, addr, sizeof(struct sockaddr_tipc)); - spin_lock_bh(&con->outqueue_lock); list_add_tail(&e->list, &con->outqueue); spin_unlock_bh(&con->outqueue_lock); @@ -575,10 +556,6 @@ static void tipc_send_to_sock(struct tipc_conn *con) if (con->sock) { memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT; - if (s->type == SOCK_DGRAM || s->type == SOCK_RDM) { - msg.msg_name = &e->dest; - msg.msg_namelen = sizeof(struct sockaddr_tipc); - } ret = kernel_sendmsg(con->sock, &msg, &e->iov, 1, e->iov.iov_len); if (ret == -EWOULDBLOCK || ret == 0) { @@ -591,6 +568,7 @@ static void tipc_send_to_sock(struct tipc_conn *con) evt = e->iov.iov_base; tipc_send_kern_top_evt(s->net, evt); } + /* Don't starve users filling buffers */ if (++count >= MAX_SEND_MSG_COUNT) { cond_resched(); diff --git a/net/tipc/server.h b/net/tipc/server.h index 64df7513cd70..434736d545c2 100644 --- a/net/tipc/server.h +++ b/net/tipc/server.h @@ -79,12 +79,10 @@ struct tipc_server { void *buf, size_t len); struct sockaddr_tipc *saddr; char name[TIPC_SERVER_NAME_LEN]; - int imp; - int type; }; int tipc_conn_sendmsg(struct tipc_server *s, int conid, - struct sockaddr_tipc *addr, void *data, size_t len); + void *data, size_t len); bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, u32 upper, u32 filter, int *conid); diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 68e26470c516..eaef826fc06d 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -81,7 +81,7 @@ static void tipc_subscrp_send_event(struct tipc_subscription *sub, sub->evt.found_upper = htohl(found_upper, sub->swap); sub->evt.port.ref = htohl(port_ref, sub->swap); sub->evt.port.node = htohl(node, sub->swap); - tipc_conn_sendmsg(tn->topsrv, subscriber->conid, NULL, + tipc_conn_sendmsg(tn->topsrv, subscriber->conid, msg_sect.iov_base, msg_sect.iov_len); } @@ -375,8 +375,6 @@ int tipc_topsrv_start(struct net *net) } topsrv->net = net; topsrv->saddr = saddr; - topsrv->imp = TIPC_CRITICAL_IMPORTANCE; - topsrv->type = SOCK_SEQPACKET; topsrv->max_rcvbuf_size = sizeof(struct tipc_subscr); topsrv->tipc_conn_recvmsg = tipc_subscrb_rcv_cb; topsrv->tipc_conn_new = tipc_subscrb_connect_cb; -- cgit v1.2.3-70-g09d2 From c901d26d4a8137f3ad0e5865d331f7c63c42d9f9 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Feb 2018 10:40:43 +0100 Subject: tipc: remove unnecessary function pointers Interaction between the functionality in server.c and subscr.c is done via function pointers installed in struct server. This makes the code harder to follow, and doesn't serve any obvious purpose. Here, we replace the function pointers with direct function calls. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/server.c | 21 ++++++++++----------- net/tipc/server.h | 5 ----- net/tipc/subscr.c | 27 ++++++--------------------- net/tipc/subscr.h | 4 ++++ 4 files changed, 20 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index 04a6dd99dd65..8aa2a33b1e48 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -33,6 +33,7 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include "subscr.h" #include "server.h" #include "core.h" #include "socket.h" @@ -182,7 +183,6 @@ static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con) static void tipc_close_conn(struct tipc_conn *con) { - struct tipc_server *s = con->server; struct sock *sk = con->sock->sk; bool disconnect = false; @@ -191,7 +191,7 @@ static void tipc_close_conn(struct tipc_conn *con) if (disconnect) { sk->sk_user_data = NULL; if (con->conid) - s->tipc_conn_release(con->conid, con->usr_data); + tipc_subscrb_delete(con->usr_data); } write_unlock_bh(&sk->sk_callback_lock); @@ -240,7 +240,6 @@ static int tipc_receive_from_sock(struct tipc_conn *con) { struct tipc_server *s = con->server; struct sock *sk = con->sock->sk; - struct sockaddr_tipc addr; struct msghdr msg = {}; struct kvec iov; void *buf; @@ -254,7 +253,7 @@ static int tipc_receive_from_sock(struct tipc_conn *con) iov.iov_base = buf; iov.iov_len = s->max_rcvbuf_size; - msg.msg_name = &addr; + msg.msg_name = NULL; iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, iov.iov_len); ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT); if (ret <= 0) { @@ -264,8 +263,8 @@ static int tipc_receive_from_sock(struct tipc_conn *con) read_lock_bh(&sk->sk_callback_lock); if (test_bit(CF_CONNECTED, &con->flags)) - ret = s->tipc_conn_recvmsg(sock_net(con->sock->sk), con->conid, - &addr, con->usr_data, buf, ret); + ret = tipc_subscrb_rcv(sock_net(con->sock->sk), con->conid, + con->usr_data, buf, ret); read_unlock_bh(&sk->sk_callback_lock); kmem_cache_free(s->rcvbuf_cache, buf); if (ret < 0) @@ -284,7 +283,6 @@ out_close: static int tipc_accept_from_sock(struct tipc_conn *con) { - struct tipc_server *s = con->server; struct socket *sock = con->sock; struct socket *newsock; struct tipc_conn *newcon; @@ -305,7 +303,8 @@ static int tipc_accept_from_sock(struct tipc_conn *con) tipc_register_callbacks(newsock, newcon); /* Notify that new connection is incoming */ - newcon->usr_data = s->tipc_conn_new(newcon->conid); + newcon->usr_data = tipc_subscrb_create(newcon->conid); + if (!newcon->usr_data) { sock_release(newsock); conn_put(newcon); @@ -489,7 +488,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, *conid = con->conid; s = con->server; - scbr = s->tipc_conn_new(*conid); + scbr = tipc_subscrb_create(*conid); if (!scbr) { conn_put(con); return false; @@ -497,7 +496,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, con->usr_data = scbr; con->sock = NULL; - s->tipc_conn_recvmsg(net, *conid, NULL, scbr, &sub, sizeof(sub)); + tipc_subscrb_rcv(net, *conid, scbr, &sub, sizeof(sub)); return true; } @@ -513,7 +512,7 @@ void tipc_topsrv_kern_unsubscr(struct net *net, int conid) test_and_clear_bit(CF_CONNECTED, &con->flags); srv = con->server; if (con->conid) - srv->tipc_conn_release(con->conid, con->usr_data); + tipc_subscrb_delete(con->usr_data); conn_put(con); conn_put(con); } diff --git a/net/tipc/server.h b/net/tipc/server.h index 434736d545c2..b4b83bdc8b20 100644 --- a/net/tipc/server.h +++ b/net/tipc/server.h @@ -72,11 +72,6 @@ struct tipc_server { struct workqueue_struct *rcv_wq; struct workqueue_struct *send_wq; int max_rcvbuf_size; - void *(*tipc_conn_new)(int conid); - void (*tipc_conn_release)(int conid, void *usr_data); - int (*tipc_conn_recvmsg)(struct net *net, int conid, - struct sockaddr_tipc *addr, void *usr_data, - void *buf, size_t len); struct sockaddr_tipc *saddr; char name[TIPC_SERVER_NAME_LEN]; }; diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index eaef826fc06d..b86fbbf7a0b9 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -220,7 +220,7 @@ static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber, spin_unlock_bh(&subscriber->lock); } -static struct tipc_subscriber *tipc_subscrb_create(int conid) +struct tipc_subscriber *tipc_subscrb_create(int conid) { struct tipc_subscriber *subscriber; @@ -237,7 +237,7 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid) return subscriber; } -static void tipc_subscrb_delete(struct tipc_subscriber *subscriber) +void tipc_subscrb_delete(struct tipc_subscriber *subscriber) { tipc_subscrb_subscrp_delete(subscriber, NULL); tipc_subscrb_put(subscriber); @@ -315,16 +315,10 @@ static int tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, return 0; } -/* Handle one termination request for the subscriber */ -static void tipc_subscrb_release_cb(int conid, void *usr_data) -{ - tipc_subscrb_delete((struct tipc_subscriber *)usr_data); -} - -/* Handle one request to create a new subscription for the subscriber */ -static int tipc_subscrb_rcv_cb(struct net *net, int conid, - struct sockaddr_tipc *addr, void *usr_data, - void *buf, size_t len) +/* Handle one request to create a new subscription for the subscriber + */ +int tipc_subscrb_rcv(struct net *net, int conid, void *usr_data, + void *buf, size_t len) { struct tipc_subscriber *subscriber = usr_data; struct tipc_subscr *s = (struct tipc_subscr *)buf; @@ -345,12 +339,6 @@ static int tipc_subscrb_rcv_cb(struct net *net, int conid, return tipc_subscrp_subscribe(net, s, subscriber, swap, status); } -/* Handle one request to establish a new subscriber */ -static void *tipc_subscrb_connect_cb(int conid) -{ - return (void *)tipc_subscrb_create(conid); -} - int tipc_topsrv_start(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); @@ -376,9 +364,6 @@ int tipc_topsrv_start(struct net *net) topsrv->net = net; topsrv->saddr = saddr; topsrv->max_rcvbuf_size = sizeof(struct tipc_subscr); - topsrv->tipc_conn_recvmsg = tipc_subscrb_rcv_cb; - topsrv->tipc_conn_new = tipc_subscrb_connect_cb; - topsrv->tipc_conn_release = tipc_subscrb_release_cb; strncpy(topsrv->name, name, strlen(name) + 1); tn->topsrv = topsrv; diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index f3edca775d9f..a736f29ba9ab 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -67,6 +67,10 @@ struct tipc_subscription { struct tipc_event evt; }; +struct tipc_subscriber *tipc_subscrb_create(int conid); +void tipc_subscrb_delete(struct tipc_subscriber *subscriber); +int tipc_subscrb_rcv(struct net *net, int conid, void *usr_data, + void *buf, size_t len); int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, u32 found_upper); void tipc_subscrp_report_overlap(struct tipc_subscription *sub, -- cgit v1.2.3-70-g09d2 From df79d040dcd7d7e580c50edf40b82e677fe84801 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Feb 2018 10:40:44 +0100 Subject: tipc: eliminate struct tipc_subscriber It is unnecessary to keep two structures, struct tipc_conn and struct tipc_subscriber, with a one-to-one relationship and still with different life cycles. The fact that the two often run in different contexts, and still may access each other via direct pointers constitutes an additional hazard, something we have experienced at several occasions, and still see happening. We have identified at least two remaining problems that are easier to fix if we simplify the topology server data structure somewhat. - When there is a race between a subscription up/down event and a timeout event, it is fully possible that the former might be delivered after the latter, leading to confusion for the receiver. - The function tipc_subcrp_timeout() is executing in interrupt context, while the following call chain is at least theoretically possible: tipc_subscrp_timeout() tipc_subscrp_send_event() tipc_conn_sendmsg() conn_put() tipc_conn_kref_release() sock_release(sock) I.e., we end up calling a function that might try to sleep in interrupt context. To eliminate this, we need to ensure that the tipc_conn structure and the socket, as well as the subscription instances, only are deleted in work queue context, i.e., after the timeout event really has been sent out. We now remove this unnecessary complexity, by merging data and functionality of the subscriber structure into struct tipc_conn and the associated file server.c. We thereafter add a spinlock and a new 'inactive' state to the subscription structure. Using those, both problems described above can be easily solved. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/server.c | 161 ++++++++++++++++++++++++++++++++------------------ net/tipc/server.h | 2 +- net/tipc/subscr.c | 173 ++++++++++-------------------------------------------- net/tipc/subscr.h | 17 +++--- 4 files changed, 146 insertions(+), 207 deletions(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index 8aa2a33b1e48..b8268c0882a7 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -2,6 +2,7 @@ * net/tipc/server.c: TIPC server infrastructure * * Copyright (c) 2012-2013, Wind River Systems + * Copyright (c) 2017, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -57,12 +58,13 @@ * @sock: socket handler associated with connection * @flags: indicates connection state * @server: pointer to connected server + * @sub_list: lsit to all pertaing subscriptions + * @sub_lock: lock protecting the subscription list + * @outqueue_lock: control access to the outqueue * @rwork: receive work item - * @usr_data: user-specified field * @rx_action: what to do when connection socket is active * @outqueue: pointer to first outbound message in queue * @outqueue_lock: control access to the outqueue - * @outqueue: list of connection objects for its server * @swork: send work item */ struct tipc_conn { @@ -71,9 +73,10 @@ struct tipc_conn { struct socket *sock; unsigned long flags; struct tipc_server *server; + struct list_head sub_list; + spinlock_t sub_lock; /* for subscription list */ struct work_struct rwork; int (*rx_action) (struct tipc_conn *con); - void *usr_data; struct list_head outqueue; spinlock_t outqueue_lock; struct work_struct swork; @@ -81,6 +84,7 @@ struct tipc_conn { /* An entry waiting to be sent */ struct outqueue_entry { + u32 evt; struct list_head list; struct kvec iov; }; @@ -89,18 +93,33 @@ static void tipc_recv_work(struct work_struct *work); static void tipc_send_work(struct work_struct *work); static void tipc_clean_outqueues(struct tipc_conn *con); +static bool connected(struct tipc_conn *con) +{ + return con && test_bit(CF_CONNECTED, &con->flags); +} + +/** + * htohl - convert value to endianness used by destination + * @in: value to convert + * @swap: non-zero if endianness must be reversed + * + * Returns converted value + */ +static u32 htohl(u32 in, int swap) +{ + return swap ? swab32(in) : in; +} + static void tipc_conn_kref_release(struct kref *kref) { struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); struct tipc_server *s = con->server; struct socket *sock = con->sock; - struct sock *sk; if (sock) { - sk = sock->sk; if (test_bit(CF_SERVER, &con->flags)) { __module_get(sock->ops->owner); - __module_get(sk->sk_prot_creator->owner); + __module_get(sock->sk->sk_prot_creator->owner); } sock_release(sock); con->sock = NULL; @@ -129,11 +148,8 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid) spin_lock_bh(&s->idr_lock); con = idr_find(&s->conn_idr, conid); - if (con) { - if (!test_bit(CF_CONNECTED, &con->flags) || - !kref_get_unless_zero(&con->kref)) - con = NULL; - } + if (!connected(con) || !kref_get_unless_zero(&con->kref)) + con = NULL; spin_unlock_bh(&s->idr_lock); return con; } @@ -144,7 +160,7 @@ static void sock_data_ready(struct sock *sk) read_lock_bh(&sk->sk_callback_lock); con = sock2con(sk); - if (con && test_bit(CF_CONNECTED, &con->flags)) { + if (connected(con)) { conn_get(con); if (!queue_work(con->server->rcv_wq, &con->rwork)) conn_put(con); @@ -158,7 +174,7 @@ static void sock_write_space(struct sock *sk) read_lock_bh(&sk->sk_callback_lock); con = sock2con(sk); - if (con && test_bit(CF_CONNECTED, &con->flags)) { + if (connected(con)) { conn_get(con); if (!queue_work(con->server->send_wq, &con->swork)) conn_put(con); @@ -181,6 +197,24 @@ static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con) write_unlock_bh(&sk->sk_callback_lock); } +/* tipc_con_delete_sub - delete a specific or all subscriptions + * for a given subscriber + */ +static void tipc_con_delete_sub(struct tipc_conn *con, struct tipc_subscr *s) +{ + struct list_head *sub_list = &con->sub_list; + struct tipc_subscription *sub, *tmp; + + spin_lock_bh(&con->sub_lock); + list_for_each_entry_safe(sub, tmp, sub_list, subscrp_list) { + if (!s || !memcmp(s, &sub->evt.s, sizeof(*s))) + tipc_sub_delete(sub); + else if (s) + break; + } + spin_unlock_bh(&con->sub_lock); +} + static void tipc_close_conn(struct tipc_conn *con) { struct sock *sk = con->sock->sk; @@ -188,10 +222,11 @@ static void tipc_close_conn(struct tipc_conn *con) write_lock_bh(&sk->sk_callback_lock); disconnect = test_and_clear_bit(CF_CONNECTED, &con->flags); + if (disconnect) { sk->sk_user_data = NULL; if (con->conid) - tipc_subscrb_delete(con->usr_data); + tipc_con_delete_sub(con, NULL); } write_unlock_bh(&sk->sk_callback_lock); @@ -215,7 +250,9 @@ static struct tipc_conn *tipc_alloc_conn(struct tipc_server *s) kref_init(&con->kref); INIT_LIST_HEAD(&con->outqueue); + INIT_LIST_HEAD(&con->sub_list); spin_lock_init(&con->outqueue_lock); + spin_lock_init(&con->sub_lock); INIT_WORK(&con->swork, tipc_send_work); INIT_WORK(&con->rwork, tipc_recv_work); @@ -236,6 +273,35 @@ static struct tipc_conn *tipc_alloc_conn(struct tipc_server *s) return con; } +int tipc_con_rcv_sub(struct net *net, int conid, struct tipc_conn *con, + void *buf, size_t len) +{ + struct tipc_subscr *s = (struct tipc_subscr *)buf; + struct tipc_subscription *sub; + bool status; + int swap; + + /* Determine subscriber's endianness */ + swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE | + TIPC_SUB_CANCEL)); + + /* Detect & process a subscription cancellation request */ + if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { + s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); + tipc_con_delete_sub(con, s); + return 0; + } + status = !(s->filter & htohl(TIPC_SUB_NO_STATUS, swap)); + sub = tipc_subscrp_subscribe(net, s, conid, swap, status); + if (!sub) + return -1; + + spin_lock_bh(&con->sub_lock); + list_add(&sub->subscrp_list, &con->sub_list); + spin_unlock_bh(&con->sub_lock); + return 0; +} + static int tipc_receive_from_sock(struct tipc_conn *con) { struct tipc_server *s = con->server; @@ -262,9 +328,7 @@ static int tipc_receive_from_sock(struct tipc_conn *con) } read_lock_bh(&sk->sk_callback_lock); - if (test_bit(CF_CONNECTED, &con->flags)) - ret = tipc_subscrb_rcv(sock_net(con->sock->sk), con->conid, - con->usr_data, buf, ret); + ret = tipc_con_rcv_sub(s->net, con->conid, con, buf, ret); read_unlock_bh(&sk->sk_callback_lock); kmem_cache_free(s->rcvbuf_cache, buf); if (ret < 0) @@ -302,15 +366,6 @@ static int tipc_accept_from_sock(struct tipc_conn *con) newcon->rx_action = tipc_receive_from_sock; tipc_register_callbacks(newsock, newcon); - /* Notify that new connection is incoming */ - newcon->usr_data = tipc_subscrb_create(newcon->conid); - - if (!newcon->usr_data) { - sock_release(newsock); - conn_put(newcon); - return -ENOMEM; - } - /* Wake up receive process in case of 'SYN+' message */ newsock->sk->sk_data_ready(newsock->sk); return ret; @@ -427,7 +482,7 @@ static void tipc_clean_outqueues(struct tipc_conn *con) } int tipc_conn_sendmsg(struct tipc_server *s, int conid, - void *data, size_t len) + u32 evt, void *data, size_t len) { struct outqueue_entry *e; struct tipc_conn *con; @@ -436,7 +491,7 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid, if (!con) return -EINVAL; - if (!test_bit(CF_CONNECTED, &con->flags)) { + if (!connected(con)) { conn_put(con); return 0; } @@ -446,7 +501,7 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid, conn_put(con); return -ENOMEM; } - + e->evt = evt; spin_lock_bh(&con->outqueue_lock); list_add_tail(&e->list, &con->outqueue); spin_unlock_bh(&con->outqueue_lock); @@ -470,10 +525,9 @@ void tipc_conn_terminate(struct tipc_server *s, int conid) bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, u32 upper, u32 filter, int *conid) { - struct tipc_subscriber *scbr; struct tipc_subscr sub; - struct tipc_server *s; struct tipc_conn *con; + int rc; sub.seq.type = type; sub.seq.lower = lower; @@ -487,32 +541,23 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, return false; *conid = con->conid; - s = con->server; - scbr = tipc_subscrb_create(*conid); - if (!scbr) { - conn_put(con); - return false; - } - - con->usr_data = scbr; con->sock = NULL; - tipc_subscrb_rcv(net, *conid, scbr, &sub, sizeof(sub)); - return true; + rc = tipc_con_rcv_sub(net, *conid, con, &sub, sizeof(sub)); + if (rc < 0) + tipc_close_conn(con); + return !rc; } void tipc_topsrv_kern_unsubscr(struct net *net, int conid) { struct tipc_conn *con; - struct tipc_server *srv; con = tipc_conn_lookup(tipc_topsrv(net), conid); if (!con) return; test_and_clear_bit(CF_CONNECTED, &con->flags); - srv = con->server; - if (con->conid) - tipc_subscrb_delete(con->usr_data); + tipc_con_delete_sub(con, NULL); conn_put(con); conn_put(con); } @@ -537,7 +582,8 @@ static void tipc_send_kern_top_evt(struct net *net, struct tipc_event *evt) static void tipc_send_to_sock(struct tipc_conn *con) { - struct tipc_server *s = con->server; + struct list_head *queue = &con->outqueue; + struct tipc_server *srv = con->server; struct outqueue_entry *e; struct tipc_event *evt; struct msghdr msg; @@ -545,16 +591,20 @@ static void tipc_send_to_sock(struct tipc_conn *con) int ret; spin_lock_bh(&con->outqueue_lock); - while (test_bit(CF_CONNECTED, &con->flags)) { - e = list_entry(con->outqueue.next, struct outqueue_entry, list); - if ((struct list_head *) e == &con->outqueue) - break; + + while (!list_empty(queue)) { + e = list_first_entry(queue, struct outqueue_entry, list); spin_unlock_bh(&con->outqueue_lock); + if (e->evt == TIPC_SUBSCR_TIMEOUT) { + evt = (struct tipc_event *)e->iov.iov_base; + tipc_con_delete_sub(con, &evt->s); + } + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT; + if (con->sock) { - memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT; ret = kernel_sendmsg(con->sock, &msg, &e->iov, 1, e->iov.iov_len); if (ret == -EWOULDBLOCK || ret == 0) { @@ -565,7 +615,7 @@ static void tipc_send_to_sock(struct tipc_conn *con) } } else { evt = e->iov.iov_base; - tipc_send_kern_top_evt(s->net, evt); + tipc_send_kern_top_evt(srv->net, evt); } /* Don't starve users filling buffers */ @@ -573,7 +623,6 @@ static void tipc_send_to_sock(struct tipc_conn *con) cond_resched(); count = 0; } - spin_lock_bh(&con->outqueue_lock); list_del(&e->list); tipc_free_entry(e); @@ -591,7 +640,7 @@ static void tipc_recv_work(struct work_struct *work) struct tipc_conn *con = container_of(work, struct tipc_conn, rwork); int count = 0; - while (test_bit(CF_CONNECTED, &con->flags)) { + while (connected(con)) { if (con->rx_action(con)) break; @@ -608,7 +657,7 @@ static void tipc_send_work(struct work_struct *work) { struct tipc_conn *con = container_of(work, struct tipc_conn, swork); - if (test_bit(CF_CONNECTED, &con->flags)) + if (connected(con)) tipc_send_to_sock(con); conn_put(con); diff --git a/net/tipc/server.h b/net/tipc/server.h index b4b83bdc8b20..fcc72321362d 100644 --- a/net/tipc/server.h +++ b/net/tipc/server.h @@ -77,7 +77,7 @@ struct tipc_server { }; int tipc_conn_sendmsg(struct tipc_server *s, int conid, - void *data, size_t len); + u32 evt, void *data, size_t len); bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, u32 upper, u32 filter, int *conid); diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index b86fbbf7a0b9..c6de1452db69 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -1,7 +1,7 @@ /* * net/tipc/subscr.c: TIPC network topology service * - * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2000-2017, Ericsson AB * Copyright (c) 2005-2007, 2010-2013, Wind River Systems * All rights reserved. * @@ -38,22 +38,6 @@ #include "name_table.h" #include "subscr.h" -/** - * struct tipc_subscriber - TIPC network topology subscriber - * @kref: reference counter to tipc_subscription object - * @conid: connection identifier to server connecting to subscriber - * @lock: control access to subscriber - * @subscrp_list: list of subscription objects for this subscriber - */ -struct tipc_subscriber { - struct kref kref; - int conid; - spinlock_t lock; - struct list_head subscrp_list; -}; - -static void tipc_subscrb_put(struct tipc_subscriber *subscriber); - /** * htohl - convert value to endianness used by destination * @in: value to convert @@ -71,9 +55,10 @@ static void tipc_subscrp_send_event(struct tipc_subscription *sub, u32 event, u32 port_ref, u32 node) { struct tipc_net *tn = net_generic(sub->net, tipc_net_id); - struct tipc_subscriber *subscriber = sub->subscriber; struct kvec msg_sect; + if (sub->inactive) + return; msg_sect.iov_base = (void *)&sub->evt; msg_sect.iov_len = sizeof(struct tipc_event); sub->evt.event = htohl(event, sub->swap); @@ -81,7 +66,7 @@ static void tipc_subscrp_send_event(struct tipc_subscription *sub, sub->evt.found_upper = htohl(found_upper, sub->swap); sub->evt.port.ref = htohl(port_ref, sub->swap); sub->evt.port.node = htohl(node, sub->swap); - tipc_conn_sendmsg(tn->topsrv, subscriber->conid, + tipc_conn_sendmsg(tn->topsrv, sub->conid, event, msg_sect.iov_base, msg_sect.iov_len); } @@ -132,41 +117,22 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, return; if (filter & TIPC_SUB_NODE_SCOPE && scope != TIPC_NODE_SCOPE) return; - - tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, - node); + spin_lock(&sub->lock); + tipc_subscrp_send_event(sub, found_lower, found_upper, + event, port_ref, node); + spin_unlock(&sub->lock); } static void tipc_subscrp_timeout(struct timer_list *t) { struct tipc_subscription *sub = from_timer(sub, t, timer); - struct tipc_subscriber *subscriber = sub->subscriber; - - spin_lock_bh(&subscriber->lock); - tipc_nametbl_unsubscribe(sub); - list_del(&sub->subscrp_list); - spin_unlock_bh(&subscriber->lock); + struct tipc_subscr *s = &sub->evt.s; - /* Notify subscriber of timeout */ - tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper, + spin_lock(&sub->lock); + tipc_subscrp_send_event(sub, s->seq.lower, s->seq.upper, TIPC_SUBSCR_TIMEOUT, 0, 0); - - tipc_subscrp_put(sub); -} - -static void tipc_subscrb_kref_release(struct kref *kref) -{ - kfree(container_of(kref,struct tipc_subscriber, kref)); -} - -static void tipc_subscrb_put(struct tipc_subscriber *subscriber) -{ - kref_put(&subscriber->kref, tipc_subscrb_kref_release); -} - -static void tipc_subscrb_get(struct tipc_subscriber *subscriber) -{ - kref_get(&subscriber->kref); + sub->inactive = true; + spin_unlock(&sub->lock); } static void tipc_subscrp_kref_release(struct kref *kref) @@ -175,11 +141,9 @@ static void tipc_subscrp_kref_release(struct kref *kref) struct tipc_subscription, kref); struct tipc_net *tn = net_generic(sub->net, tipc_net_id); - struct tipc_subscriber *subscriber = sub->subscriber; atomic_dec(&tn->subscription_count); kfree(sub); - tipc_subscrb_put(subscriber); } void tipc_subscrp_put(struct tipc_subscription *subscription) @@ -192,68 +156,9 @@ void tipc_subscrp_get(struct tipc_subscription *subscription) kref_get(&subscription->kref); } -/* tipc_subscrb_subscrp_delete - delete a specific subscription or all - * subscriptions for a given subscriber. - */ -static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber, - struct tipc_subscr *s) -{ - struct list_head *subscription_list = &subscriber->subscrp_list; - struct tipc_subscription *sub, *temp; - u32 timeout; - - spin_lock_bh(&subscriber->lock); - list_for_each_entry_safe(sub, temp, subscription_list, subscrp_list) { - if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) - continue; - - timeout = htohl(sub->evt.s.timeout, sub->swap); - if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer)) { - tipc_nametbl_unsubscribe(sub); - list_del(&sub->subscrp_list); - tipc_subscrp_put(sub); - } - - if (s) - break; - } - spin_unlock_bh(&subscriber->lock); -} - -struct tipc_subscriber *tipc_subscrb_create(int conid) -{ - struct tipc_subscriber *subscriber; - - subscriber = kzalloc(sizeof(*subscriber), GFP_ATOMIC); - if (!subscriber) { - pr_warn("Subscriber rejected, no memory\n"); - return NULL; - } - INIT_LIST_HEAD(&subscriber->subscrp_list); - kref_init(&subscriber->kref); - subscriber->conid = conid; - spin_lock_init(&subscriber->lock); - - return subscriber; -} - -void tipc_subscrb_delete(struct tipc_subscriber *subscriber) -{ - tipc_subscrb_subscrp_delete(subscriber, NULL); - tipc_subscrb_put(subscriber); -} - -static void tipc_subscrp_cancel(struct tipc_subscr *s, - struct tipc_subscriber *subscriber) -{ - tipc_subscrb_get(subscriber); - tipc_subscrb_subscrp_delete(subscriber, s); - tipc_subscrb_put(subscriber); -} - static struct tipc_subscription *tipc_subscrp_create(struct net *net, struct tipc_subscr *s, - int swap) + int conid, bool swap) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_subscription *sub; @@ -275,6 +180,8 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net, /* Initialize subscription object */ sub->net = net; + sub->conid = conid; + sub->inactive = false; if (((filter & TIPC_SUB_PORTS) && (filter & TIPC_SUB_SERVICE)) || (htohl(s->seq.lower, swap) > htohl(s->seq.upper, swap))) { pr_warn("Subscription rejected, illegal request\n"); @@ -284,59 +191,39 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net, sub->swap = swap; memcpy(&sub->evt.s, s, sizeof(*s)); + spin_lock_init(&sub->lock); atomic_inc(&tn->subscription_count); kref_init(&sub->kref); return sub; } -static int tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, - struct tipc_subscriber *subscriber, int swap, - bool status) +struct tipc_subscription *tipc_subscrp_subscribe(struct net *net, + struct tipc_subscr *s, + int conid, bool swap, + bool status) { struct tipc_subscription *sub = NULL; u32 timeout; - sub = tipc_subscrp_create(net, s, swap); + sub = tipc_subscrp_create(net, s, conid, swap); if (!sub) - return -1; + return NULL; - spin_lock_bh(&subscriber->lock); - list_add(&sub->subscrp_list, &subscriber->subscrp_list); - sub->subscriber = subscriber; tipc_nametbl_subscribe(sub, status); - tipc_subscrb_get(subscriber); - spin_unlock_bh(&subscriber->lock); - timer_setup(&sub->timer, tipc_subscrp_timeout, 0); timeout = htohl(sub->evt.s.timeout, swap); - if (timeout != TIPC_WAIT_FOREVER) mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); - return 0; + return sub; } -/* Handle one request to create a new subscription for the subscriber - */ -int tipc_subscrb_rcv(struct net *net, int conid, void *usr_data, - void *buf, size_t len) +void tipc_sub_delete(struct tipc_subscription *sub) { - struct tipc_subscriber *subscriber = usr_data; - struct tipc_subscr *s = (struct tipc_subscr *)buf; - bool status; - int swap; - - /* Determine subscriber's endianness */ - swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE | - TIPC_SUB_CANCEL)); - - /* Detect & process a subscription cancellation request */ - if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { - s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); - tipc_subscrp_cancel(s, subscriber); - return 0; - } - status = !(s->filter & htohl(TIPC_SUB_NO_STATUS, swap)); - return tipc_subscrp_subscribe(net, s, subscriber, swap, status); + tipc_nametbl_unsubscribe(sub); + if (sub->evt.s.timeout != TIPC_WAIT_FOREVER) + del_timer_sync(&sub->timer); + list_del(&sub->subscrp_list); + tipc_subscrp_put(sub); } int tipc_topsrv_start(struct net *net) diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index a736f29ba9ab..cfd0cb3a1da8 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -43,7 +43,7 @@ #define TIPC_MAX_PUBLICATIONS 65535 struct tipc_subscription; -struct tipc_subscriber; +struct tipc_conn; /** * struct tipc_subscription - TIPC network topology subscription object @@ -58,19 +58,22 @@ struct tipc_subscriber; */ struct tipc_subscription { struct kref kref; - struct tipc_subscriber *subscriber; struct net *net; struct timer_list timer; struct list_head nameseq_list; struct list_head subscrp_list; - int swap; struct tipc_event evt; + int conid; + bool swap; + bool inactive; + spinlock_t lock; /* serialize up/down and timer events */ }; -struct tipc_subscriber *tipc_subscrb_create(int conid); -void tipc_subscrb_delete(struct tipc_subscriber *subscriber); -int tipc_subscrb_rcv(struct net *net, int conid, void *usr_data, - void *buf, size_t len); +struct tipc_subscription *tipc_subscrp_subscribe(struct net *net, + struct tipc_subscr *s, + int conid, bool swap, + bool status); +void tipc_sub_delete(struct tipc_subscription *sub); int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, u32 found_upper); void tipc_subscrp_report_overlap(struct tipc_subscription *sub, -- cgit v1.2.3-70-g09d2 From 414574a0af36d329f560f542e650cc4a81cc1d69 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Feb 2018 10:40:45 +0100 Subject: tipc: simplify interaction between subscription and topology connection The message transmission and reception in the topology server is more generic than is currently necessary. By basing the funtionality on the fact that we only send items of type struct tipc_event and always receive items of struct tipc_subcr we can make several simplifications, and also get rid of some unnecessary dynamic memory allocations. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/name_table.c | 10 +-- net/tipc/server.c | 170 +++++++++++++++++--------------------------------- net/tipc/server.h | 12 +--- net/tipc/subscr.c | 40 ++++++------ net/tipc/subscr.h | 5 +- 5 files changed, 88 insertions(+), 149 deletions(-) (limited to 'net') diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index ed0457cc99d6..c0ca7be46f7a 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -810,14 +810,15 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, */ void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status) { - struct tipc_net *tn = net_generic(s->net, tipc_net_id); + struct tipc_server *srv = s->server; + struct tipc_net *tn = tipc_net(srv->net); u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap); int index = hash(type); struct name_seq *seq; struct tipc_name_seq ns; spin_lock_bh(&tn->nametbl_lock); - seq = nametbl_find_seq(s->net, type); + seq = nametbl_find_seq(srv->net, type); if (!seq) seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]); if (seq) { @@ -837,12 +838,13 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status) */ void tipc_nametbl_unsubscribe(struct tipc_subscription *s) { - struct tipc_net *tn = net_generic(s->net, tipc_net_id); + struct tipc_server *srv = s->server; + struct tipc_net *tn = tipc_net(srv->net); struct name_seq *seq; u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap); spin_lock_bh(&tn->nametbl_lock); - seq = nametbl_find_seq(s->net, type); + seq = nametbl_find_seq(srv->net, type); if (seq != NULL) { spin_lock_bh(&seq->lock); list_del_init(&s->nameseq_list); diff --git a/net/tipc/server.c b/net/tipc/server.c index b8268c0882a7..7933fb92d852 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -84,9 +84,9 @@ struct tipc_conn { /* An entry waiting to be sent */ struct outqueue_entry { - u32 evt; + bool inactive; + struct tipc_event evt; struct list_head list; - struct kvec iov; }; static void tipc_recv_work(struct work_struct *work); @@ -154,6 +154,9 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid) return con; } +/* sock_data_ready - interrupt callback indicating the socket has data to read + * The queued job is launched in tipc_recv_from_sock() + */ static void sock_data_ready(struct sock *sk) { struct tipc_conn *con; @@ -168,6 +171,10 @@ static void sock_data_ready(struct sock *sk) read_unlock_bh(&sk->sk_callback_lock); } +/* sock_write_space - interrupt callback after a sendmsg EAGAIN + * Indicates that there now is more is space in the send buffer + * The queued job is launched in tipc_send_to_sock() + */ static void sock_write_space(struct sock *sk) { struct tipc_conn *con; @@ -273,10 +280,10 @@ static struct tipc_conn *tipc_alloc_conn(struct tipc_server *s) return con; } -int tipc_con_rcv_sub(struct net *net, int conid, struct tipc_conn *con, - void *buf, size_t len) +static int tipc_con_rcv_sub(struct tipc_server *srv, + struct tipc_conn *con, + struct tipc_subscr *s) { - struct tipc_subscr *s = (struct tipc_subscr *)buf; struct tipc_subscription *sub; bool status; int swap; @@ -292,7 +299,7 @@ int tipc_con_rcv_sub(struct net *net, int conid, struct tipc_conn *con, return 0; } status = !(s->filter & htohl(TIPC_SUB_NO_STATUS, swap)); - sub = tipc_subscrp_subscribe(net, s, conid, swap, status); + sub = tipc_subscrp_subscribe(srv, s, con->conid, swap, status); if (!sub) return -1; @@ -304,43 +311,27 @@ int tipc_con_rcv_sub(struct net *net, int conid, struct tipc_conn *con, static int tipc_receive_from_sock(struct tipc_conn *con) { - struct tipc_server *s = con->server; + struct tipc_server *srv = con->server; struct sock *sk = con->sock->sk; struct msghdr msg = {}; + struct tipc_subscr s; struct kvec iov; - void *buf; int ret; - buf = kmem_cache_alloc(s->rcvbuf_cache, GFP_ATOMIC); - if (!buf) { - ret = -ENOMEM; - goto out_close; - } - - iov.iov_base = buf; - iov.iov_len = s->max_rcvbuf_size; + iov.iov_base = &s; + iov.iov_len = sizeof(s); msg.msg_name = NULL; iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, iov.iov_len); ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT); - if (ret <= 0) { - kmem_cache_free(s->rcvbuf_cache, buf); - goto out_close; + if (ret == -EWOULDBLOCK) + return -EWOULDBLOCK; + if (ret > 0) { + read_lock_bh(&sk->sk_callback_lock); + ret = tipc_con_rcv_sub(srv, con, &s); + read_unlock_bh(&sk->sk_callback_lock); } - - read_lock_bh(&sk->sk_callback_lock); - ret = tipc_con_rcv_sub(s->net, con->conid, con, buf, ret); - read_unlock_bh(&sk->sk_callback_lock); - kmem_cache_free(s->rcvbuf_cache, buf); if (ret < 0) - tipc_conn_terminate(s, con->conid); - return ret; - -out_close: - if (ret != -EWOULDBLOCK) tipc_close_conn(con); - else if (ret == 0) - /* Don't return success if we really got EOF */ - ret = -EAGAIN; return ret; } @@ -442,33 +433,6 @@ static int tipc_open_listening_sock(struct tipc_server *s) return 0; } -static struct outqueue_entry *tipc_alloc_entry(void *data, int len) -{ - struct outqueue_entry *entry; - void *buf; - - entry = kmalloc(sizeof(struct outqueue_entry), GFP_ATOMIC); - if (!entry) - return NULL; - - buf = kmemdup(data, len, GFP_ATOMIC); - if (!buf) { - kfree(entry); - return NULL; - } - - entry->iov.iov_base = buf; - entry->iov.iov_len = len; - - return entry; -} - -static void tipc_free_entry(struct outqueue_entry *e) -{ - kfree(e->iov.iov_base); - kfree(e); -} - static void tipc_clean_outqueues(struct tipc_conn *con) { struct outqueue_entry *e, *safe; @@ -476,50 +440,40 @@ static void tipc_clean_outqueues(struct tipc_conn *con) spin_lock_bh(&con->outqueue_lock); list_for_each_entry_safe(e, safe, &con->outqueue, list) { list_del(&e->list); - tipc_free_entry(e); + kfree(e); } spin_unlock_bh(&con->outqueue_lock); } -int tipc_conn_sendmsg(struct tipc_server *s, int conid, - u32 evt, void *data, size_t len) +/* tipc_conn_queue_evt - interrupt level call from a subscription instance + * The queued job is launched in tipc_send_to_sock() + */ +void tipc_conn_queue_evt(struct tipc_server *s, int conid, + u32 event, struct tipc_event *evt) { struct outqueue_entry *e; struct tipc_conn *con; con = tipc_conn_lookup(s, conid); if (!con) - return -EINVAL; + return; - if (!connected(con)) { - conn_put(con); - return 0; - } + if (!connected(con)) + goto err; - e = tipc_alloc_entry(data, len); - if (!e) { - conn_put(con); - return -ENOMEM; - } - e->evt = evt; + e = kmalloc(sizeof(*e), GFP_ATOMIC); + if (!e) + goto err; + e->inactive = (event == TIPC_SUBSCR_TIMEOUT); + memcpy(&e->evt, evt, sizeof(*evt)); spin_lock_bh(&con->outqueue_lock); list_add_tail(&e->list, &con->outqueue); spin_unlock_bh(&con->outqueue_lock); - if (!queue_work(s->send_wq, &con->swork)) - conn_put(con); - return 0; -} - -void tipc_conn_terminate(struct tipc_server *s, int conid) -{ - struct tipc_conn *con; - - con = tipc_conn_lookup(s, conid); - if (con) { - tipc_close_conn(con); - conn_put(con); - } + if (queue_work(s->send_wq, &con->swork)) + return; +err: + conn_put(con); } bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, @@ -542,7 +496,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, *conid = con->conid; con->sock = NULL; - rc = tipc_con_rcv_sub(net, *conid, con, &sub, sizeof(sub)); + rc = tipc_con_rcv_sub(tipc_topsrv(net), con, &sub); if (rc < 0) tipc_close_conn(con); return !rc; @@ -587,6 +541,7 @@ static void tipc_send_to_sock(struct tipc_conn *con) struct outqueue_entry *e; struct tipc_event *evt; struct msghdr msg; + struct kvec iov; int count = 0; int ret; @@ -594,27 +549,28 @@ static void tipc_send_to_sock(struct tipc_conn *con) while (!list_empty(queue)) { e = list_first_entry(queue, struct outqueue_entry, list); - + evt = &e->evt; spin_unlock_bh(&con->outqueue_lock); - if (e->evt == TIPC_SUBSCR_TIMEOUT) { - evt = (struct tipc_event *)e->iov.iov_base; + if (e->inactive) tipc_con_delete_sub(con, &evt->s); - } + memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT; + iov.iov_base = evt; + iov.iov_len = sizeof(*evt); + msg.msg_name = NULL; if (con->sock) { - ret = kernel_sendmsg(con->sock, &msg, &e->iov, 1, - e->iov.iov_len); + ret = kernel_sendmsg(con->sock, &msg, &iov, + 1, sizeof(*evt)); if (ret == -EWOULDBLOCK || ret == 0) { cond_resched(); goto out; } else if (ret < 0) { - goto send_err; + goto err; } } else { - evt = e->iov.iov_base; tipc_send_kern_top_evt(srv->net, evt); } @@ -625,13 +581,12 @@ static void tipc_send_to_sock(struct tipc_conn *con) } spin_lock_bh(&con->outqueue_lock); list_del(&e->list); - tipc_free_entry(e); + kfree(e); } spin_unlock_bh(&con->outqueue_lock); out: return; - -send_err: +err: tipc_close_conn(con); } @@ -695,22 +650,14 @@ int tipc_server_start(struct tipc_server *s) idr_init(&s->conn_idr); s->idr_in_use = 0; - s->rcvbuf_cache = kmem_cache_create(s->name, s->max_rcvbuf_size, - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!s->rcvbuf_cache) - return -ENOMEM; - ret = tipc_work_start(s); - if (ret < 0) { - kmem_cache_destroy(s->rcvbuf_cache); + if (ret < 0) return ret; - } + ret = tipc_open_listening_sock(s); - if (ret < 0) { + if (ret < 0) tipc_work_stop(s); - kmem_cache_destroy(s->rcvbuf_cache); - return ret; - } + return ret; } @@ -731,6 +678,5 @@ void tipc_server_stop(struct tipc_server *s) spin_unlock_bh(&s->idr_lock); tipc_work_stop(s); - kmem_cache_destroy(s->rcvbuf_cache); idr_destroy(&s->conn_idr); } diff --git a/net/tipc/server.h b/net/tipc/server.h index fcc72321362d..2de8709b467d 100644 --- a/net/tipc/server.h +++ b/net/tipc/server.h @@ -36,6 +36,7 @@ #ifndef _TIPC_SERVER_H #define _TIPC_SERVER_H +#include "core.h" #include #include #include @@ -68,7 +69,6 @@ struct tipc_server { spinlock_t idr_lock; int idr_in_use; struct net *net; - struct kmem_cache *rcvbuf_cache; struct workqueue_struct *rcv_wq; struct workqueue_struct *send_wq; int max_rcvbuf_size; @@ -76,19 +76,13 @@ struct tipc_server { char name[TIPC_SERVER_NAME_LEN]; }; -int tipc_conn_sendmsg(struct tipc_server *s, int conid, - u32 evt, void *data, size_t len); +void tipc_conn_queue_evt(struct tipc_server *s, int conid, + u32 event, struct tipc_event *evt); bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, u32 upper, u32 filter, int *conid); void tipc_topsrv_kern_unsubscr(struct net *net, int conid); -/** - * tipc_conn_terminate - terminate connection with server - * - * Note: Must call it in process context since it might sleep - */ -void tipc_conn_terminate(struct tipc_server *s, int conid); int tipc_server_start(struct tipc_server *s); void tipc_server_stop(struct tipc_server *s); diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index c6de1452db69..c3e0b924e8c2 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -52,22 +52,19 @@ static u32 htohl(u32 in, int swap) static void tipc_subscrp_send_event(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, - u32 event, u32 port_ref, u32 node) + u32 event, u32 port, u32 node) { - struct tipc_net *tn = net_generic(sub->net, tipc_net_id); - struct kvec msg_sect; + struct tipc_event *evt = &sub->evt; + bool swap = sub->swap; if (sub->inactive) return; - msg_sect.iov_base = (void *)&sub->evt; - msg_sect.iov_len = sizeof(struct tipc_event); - sub->evt.event = htohl(event, sub->swap); - sub->evt.found_lower = htohl(found_lower, sub->swap); - sub->evt.found_upper = htohl(found_upper, sub->swap); - sub->evt.port.ref = htohl(port_ref, sub->swap); - sub->evt.port.node = htohl(node, sub->swap); - tipc_conn_sendmsg(tn->topsrv, sub->conid, event, - msg_sect.iov_base, msg_sect.iov_len); + evt->event = htohl(event, swap); + evt->found_lower = htohl(found_lower, swap); + evt->found_upper = htohl(found_upper, swap); + evt->port.ref = htohl(port, swap); + evt->port.node = htohl(node, swap); + tipc_conn_queue_evt(sub->server, sub->conid, event, evt); } /** @@ -137,10 +134,11 @@ static void tipc_subscrp_timeout(struct timer_list *t) static void tipc_subscrp_kref_release(struct kref *kref) { - struct tipc_subscription *sub = container_of(kref, - struct tipc_subscription, - kref); - struct tipc_net *tn = net_generic(sub->net, tipc_net_id); + struct tipc_subscription *sub; + struct tipc_net *tn; + + sub = container_of(kref, struct tipc_subscription, kref); + tn = tipc_net(sub->server->net); atomic_dec(&tn->subscription_count); kfree(sub); @@ -156,11 +154,11 @@ void tipc_subscrp_get(struct tipc_subscription *subscription) kref_get(&subscription->kref); } -static struct tipc_subscription *tipc_subscrp_create(struct net *net, +static struct tipc_subscription *tipc_subscrp_create(struct tipc_server *srv, struct tipc_subscr *s, int conid, bool swap) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(srv->net); struct tipc_subscription *sub; u32 filter = htohl(s->filter, swap); @@ -179,7 +177,7 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net, } /* Initialize subscription object */ - sub->net = net; + sub->server = srv; sub->conid = conid; sub->inactive = false; if (((filter & TIPC_SUB_PORTS) && (filter & TIPC_SUB_SERVICE)) || @@ -197,7 +195,7 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net, return sub; } -struct tipc_subscription *tipc_subscrp_subscribe(struct net *net, +struct tipc_subscription *tipc_subscrp_subscribe(struct tipc_server *srv, struct tipc_subscr *s, int conid, bool swap, bool status) @@ -205,7 +203,7 @@ struct tipc_subscription *tipc_subscrp_subscribe(struct net *net, struct tipc_subscription *sub = NULL; u32 timeout; - sub = tipc_subscrp_create(net, s, conid, swap); + sub = tipc_subscrp_create(srv, s, conid, swap); if (!sub) return NULL; diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index cfd0cb3a1da8..64ffd32d15e6 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -49,7 +49,6 @@ struct tipc_conn; * struct tipc_subscription - TIPC network topology subscription object * @subscriber: pointer to its subscriber * @seq: name sequence associated with subscription - * @net: point to network namespace * @timer: timer governing subscription duration (optional) * @nameseq_list: adjacent subscriptions in name sequence's subscription list * @subscrp_list: adjacent subscriptions in subscriber's subscription list @@ -58,7 +57,7 @@ struct tipc_conn; */ struct tipc_subscription { struct kref kref; - struct net *net; + struct tipc_server *server; struct timer_list timer; struct list_head nameseq_list; struct list_head subscrp_list; @@ -69,7 +68,7 @@ struct tipc_subscription { spinlock_t lock; /* serialize up/down and timer events */ }; -struct tipc_subscription *tipc_subscrp_subscribe(struct net *net, +struct tipc_subscription *tipc_subscrp_subscribe(struct tipc_server *srv, struct tipc_subscr *s, int conid, bool swap, bool status); -- cgit v1.2.3-70-g09d2 From 8985ecc7c1e07c42acc1e44ac56fa224f8a5c62f Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Feb 2018 10:40:46 +0100 Subject: tipc: simplify endianness handling in topology subscriber Because of the requirement for total distribution transparency, users send subscriptions and receive topology events in their own host format. It is up to the topology server to determine this format and do the correct conversions to and from its own host format when needed. Until now, this has been handled in a rather non-transparent way inside the topology server and subscriber code, leading to unnecessary complexity when creating subscriptions and issuing events. We now improve this situation by adding two new macros, tipc_sub_read() and tipc_evt_write(). Both those functions calculate the need for conversion internally before performing their respective operations. Hence, all handling of such conversions become transparent to the rest of the code. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/name_table.c | 42 +++++++++++++++----------- net/tipc/name_table.h | 2 +- net/tipc/server.c | 25 ++-------------- net/tipc/subscr.c | 83 +++++++++++++++++++-------------------------------- net/tipc/subscr.h | 36 ++++++++++++++++------ 5 files changed, 86 insertions(+), 102 deletions(-) (limited to 'net') diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index c0ca7be46f7a..2fbd0a20a958 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -412,18 +412,22 @@ found: * sequence overlapping with the requested sequence */ static void tipc_nameseq_subscribe(struct name_seq *nseq, - struct tipc_subscription *s, - bool status) + struct tipc_subscription *sub) { struct sub_seq *sseq = nseq->sseqs; struct tipc_name_seq ns; + struct tipc_subscr *s = &sub->evt.s; + bool no_status; - tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns); + ns.type = tipc_sub_read(s, seq.type); + ns.lower = tipc_sub_read(s, seq.lower); + ns.upper = tipc_sub_read(s, seq.upper); + no_status = tipc_sub_read(s, filter) & TIPC_SUB_NO_STATUS; - tipc_subscrp_get(s); - list_add(&s->nameseq_list, &nseq->subscriptions); + tipc_subscrp_get(sub); + list_add(&sub->nameseq_list, &nseq->subscriptions); - if (!status || !sseq) + if (no_status || !sseq) return; while (sseq != &nseq->sseqs[nseq->first_free]) { @@ -433,7 +437,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, int must_report = 1; list_for_each_entry(crs, &info->zone_list, zone_list) { - tipc_subscrp_report_overlap(s, sseq->lower, + tipc_subscrp_report_overlap(sub, sseq->lower, sseq->upper, TIPC_PUBLISHED, crs->ref, crs->node, @@ -808,11 +812,12 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, /** * tipc_nametbl_subscribe - add a subscription object to the name table */ -void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status) +void tipc_nametbl_subscribe(struct tipc_subscription *sub) { - struct tipc_server *srv = s->server; + struct tipc_server *srv = sub->server; struct tipc_net *tn = tipc_net(srv->net); - u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap); + struct tipc_subscr *s = &sub->evt.s; + u32 type = tipc_sub_read(s, seq.type); int index = hash(type); struct name_seq *seq; struct tipc_name_seq ns; @@ -823,10 +828,12 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status) seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]); if (seq) { spin_lock_bh(&seq->lock); - tipc_nameseq_subscribe(seq, s, status); + tipc_nameseq_subscribe(seq, sub); spin_unlock_bh(&seq->lock); } else { - tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns); + ns.type = tipc_sub_read(s, seq.type); + ns.lower = tipc_sub_read(s, seq.lower); + ns.upper = tipc_sub_read(s, seq.upper); pr_warn("Failed to create subscription for {%u,%u,%u}\n", ns.type, ns.lower, ns.upper); } @@ -836,19 +843,20 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status) /** * tipc_nametbl_unsubscribe - remove a subscription object from name table */ -void tipc_nametbl_unsubscribe(struct tipc_subscription *s) +void tipc_nametbl_unsubscribe(struct tipc_subscription *sub) { - struct tipc_server *srv = s->server; + struct tipc_server *srv = sub->server; + struct tipc_subscr *s = &sub->evt.s; struct tipc_net *tn = tipc_net(srv->net); struct name_seq *seq; - u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap); + u32 type = tipc_sub_read(s, seq.type); spin_lock_bh(&tn->nametbl_lock); seq = nametbl_find_seq(srv->net, type); if (seq != NULL) { spin_lock_bh(&seq->lock); - list_del_init(&s->nameseq_list); - tipc_subscrp_put(s); + list_del_init(&sub->nameseq_list); + tipc_subscrp_put(sub); if (!seq->first_free && list_empty(&seq->subscriptions)) { hlist_del_init_rcu(&seq->ns_list); kfree(seq->sseqs); diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index f56e7cb3d436..17652602d5e2 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -120,7 +120,7 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type, struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, u32 lower, u32 node, u32 ref, u32 key); -void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status); +void tipc_nametbl_subscribe(struct tipc_subscription *s); void tipc_nametbl_unsubscribe(struct tipc_subscription *s); int tipc_nametbl_init(struct net *net); void tipc_nametbl_stop(struct net *net); diff --git a/net/tipc/server.c b/net/tipc/server.c index 7933fb92d852..5d231fa8e4b5 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -98,18 +98,6 @@ static bool connected(struct tipc_conn *con) return con && test_bit(CF_CONNECTED, &con->flags); } -/** - * htohl - convert value to endianness used by destination - * @in: value to convert - * @swap: non-zero if endianness must be reversed - * - * Returns converted value - */ -static u32 htohl(u32 in, int swap) -{ - return swap ? swab32(in) : in; -} - static void tipc_conn_kref_release(struct kref *kref) { struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); @@ -285,21 +273,12 @@ static int tipc_con_rcv_sub(struct tipc_server *srv, struct tipc_subscr *s) { struct tipc_subscription *sub; - bool status; - int swap; - - /* Determine subscriber's endianness */ - swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE | - TIPC_SUB_CANCEL)); - /* Detect & process a subscription cancellation request */ - if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { - s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); + if (tipc_sub_read(s, filter) & TIPC_SUB_CANCEL) { tipc_con_delete_sub(con, s); return 0; } - status = !(s->filter & htohl(TIPC_SUB_NO_STATUS, swap)); - sub = tipc_subscrp_subscribe(srv, s, con->conid, swap, status); + sub = tipc_subscrp_subscribe(srv, s, con->conid); if (!sub) return -1; diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index c3e0b924e8c2..406b09fc227b 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -38,32 +38,19 @@ #include "name_table.h" #include "subscr.h" -/** - * htohl - convert value to endianness used by destination - * @in: value to convert - * @swap: non-zero if endianness must be reversed - * - * Returns converted value - */ -static u32 htohl(u32 in, int swap) -{ - return swap ? swab32(in) : in; -} - static void tipc_subscrp_send_event(struct tipc_subscription *sub, u32 found_lower, u32 found_upper, u32 event, u32 port, u32 node) { struct tipc_event *evt = &sub->evt; - bool swap = sub->swap; if (sub->inactive) return; - evt->event = htohl(event, swap); - evt->found_lower = htohl(found_lower, swap); - evt->found_upper = htohl(found_upper, swap); - evt->port.ref = htohl(port, swap); - evt->port.node = htohl(node, swap); + tipc_evt_write(evt, event, event); + tipc_evt_write(evt, found_lower, found_lower); + tipc_evt_write(evt, found_upper, found_upper); + tipc_evt_write(evt, port.ref, port); + tipc_evt_write(evt, port.node, node); tipc_conn_queue_evt(sub->server, sub->conid, event, evt); } @@ -85,29 +72,22 @@ int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, return 1; } -u32 tipc_subscrp_convert_seq_type(u32 type, int swap) +void tipc_subscrp_report_overlap(struct tipc_subscription *sub, + u32 found_lower, u32 found_upper, + u32 event, u32 port, u32 node, + u32 scope, int must) { - return htohl(type, swap); -} - -void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, - struct tipc_name_seq *out) -{ - out->type = htohl(in->type, swap); - out->lower = htohl(in->lower, swap); - out->upper = htohl(in->upper, swap); -} - -void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, - u32 found_upper, u32 event, u32 port_ref, - u32 node, u32 scope, int must) -{ - u32 filter = htohl(sub->evt.s.filter, sub->swap); struct tipc_name_seq seq; + struct tipc_subscr *s = &sub->evt.s; + u32 filter = tipc_sub_read(s, filter); + + seq.type = tipc_sub_read(s, seq.type); + seq.lower = tipc_sub_read(s, seq.lower); + seq.upper = tipc_sub_read(s, seq.upper); - tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq); if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper)) return; + if (!must && !(filter & TIPC_SUB_PORTS)) return; if (filter & TIPC_SUB_CLUSTER_SCOPE && scope == TIPC_NODE_SCOPE) @@ -116,7 +96,7 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, return; spin_lock(&sub->lock); tipc_subscrp_send_event(sub, found_lower, found_upper, - event, port_ref, node); + event, port, node); spin_unlock(&sub->lock); } @@ -156,11 +136,11 @@ void tipc_subscrp_get(struct tipc_subscription *subscription) static struct tipc_subscription *tipc_subscrp_create(struct tipc_server *srv, struct tipc_subscr *s, - int conid, bool swap) + int conid) { struct tipc_net *tn = tipc_net(srv->net); struct tipc_subscription *sub; - u32 filter = htohl(s->filter, swap); + u32 filter = tipc_sub_read(s, filter); /* Refuse subscription if global limit exceeded */ if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCRIPTIONS) { @@ -177,39 +157,38 @@ static struct tipc_subscription *tipc_subscrp_create(struct tipc_server *srv, } /* Initialize subscription object */ + if (filter & TIPC_SUB_PORTS && filter & TIPC_SUB_SERVICE) + goto err; + if (tipc_sub_read(s, seq.lower) > tipc_sub_read(s, seq.upper)) + goto err; sub->server = srv; sub->conid = conid; sub->inactive = false; - if (((filter & TIPC_SUB_PORTS) && (filter & TIPC_SUB_SERVICE)) || - (htohl(s->seq.lower, swap) > htohl(s->seq.upper, swap))) { - pr_warn("Subscription rejected, illegal request\n"); - kfree(sub); - return NULL; - } - - sub->swap = swap; memcpy(&sub->evt.s, s, sizeof(*s)); spin_lock_init(&sub->lock); atomic_inc(&tn->subscription_count); kref_init(&sub->kref); return sub; +err: + pr_warn("Subscription rejected, illegal request\n"); + kfree(sub); + return NULL; } struct tipc_subscription *tipc_subscrp_subscribe(struct tipc_server *srv, struct tipc_subscr *s, - int conid, bool swap, - bool status) + int conid) { struct tipc_subscription *sub = NULL; u32 timeout; - sub = tipc_subscrp_create(srv, s, conid, swap); + sub = tipc_subscrp_create(srv, s, conid); if (!sub) return NULL; - tipc_nametbl_subscribe(sub, status); + tipc_nametbl_subscribe(sub); timer_setup(&sub->timer, tipc_subscrp_timeout, 0); - timeout = htohl(sub->evt.s.timeout, swap); + timeout = tipc_sub_read(&sub->evt.s, timeout); if (timeout != TIPC_WAIT_FOREVER) mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); return sub; diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index 64ffd32d15e6..db80e41bba08 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -52,7 +52,6 @@ struct tipc_conn; * @timer: timer governing subscription duration (optional) * @nameseq_list: adjacent subscriptions in name sequence's subscription list * @subscrp_list: adjacent subscriptions in subscriber's subscription list - * @swap: indicates if subscriber uses opposite endianness in its messages * @evt: template for events generated by subscription */ struct tipc_subscription { @@ -63,28 +62,47 @@ struct tipc_subscription { struct list_head subscrp_list; struct tipc_event evt; int conid; - bool swap; bool inactive; spinlock_t lock; /* serialize up/down and timer events */ }; struct tipc_subscription *tipc_subscrp_subscribe(struct tipc_server *srv, struct tipc_subscr *s, - int conid, bool swap, - bool status); + int conid); void tipc_sub_delete(struct tipc_subscription *sub); + int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, u32 found_upper); void tipc_subscrp_report_overlap(struct tipc_subscription *sub, - u32 found_lower, u32 found_upper, u32 event, - u32 port_ref, u32 node, u32 scope, int must); -void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap, - struct tipc_name_seq *out); -u32 tipc_subscrp_convert_seq_type(u32 type, int swap); + u32 found_lower, u32 found_upper, + u32 event, u32 port, u32 node, + u32 scope, int must); int tipc_topsrv_start(struct net *net); void tipc_topsrv_stop(struct net *net); void tipc_subscrp_put(struct tipc_subscription *subscription); void tipc_subscrp_get(struct tipc_subscription *subscription); +#define TIPC_FILTER_MASK (TIPC_SUB_PORTS | TIPC_SUB_SERVICE | TIPC_SUB_CANCEL) + +/* tipc_sub_read - return field_ of struct sub_ in host endian format + */ +#define tipc_sub_read(sub_, field_) \ + ({ \ + struct tipc_subscr *sub__ = sub_; \ + u32 val__ = (sub__)->field_; \ + int swap_ = !((sub__)->filter & TIPC_FILTER_MASK); \ + (swap_ ? swab32(val__) : val__); \ + }) + +/* tipc_evt_write - write val_ to field_ of struct evt_ in user endian format + */ +#define tipc_evt_write(evt_, field_, val_) \ + ({ \ + struct tipc_event *evt__ = evt_; \ + u32 val__ = val_; \ + int swap_ = !((evt__)->s.filter & (TIPC_FILTER_MASK)); \ + (evt__)->field_ = swap_ ? swab32(val__) : val__; \ + }) + #endif -- cgit v1.2.3-70-g09d2 From 242e82cc95f6b4e83e1771f9915edcb2a63708e1 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Feb 2018 10:40:47 +0100 Subject: tipc: collapse subscription creation functions After the previous changes it becomes logical to collapse the two-level creation of subscription instances into one. We do that here. We also rename the creation and deletion functions for more consistency. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/server.c | 4 ++-- net/tipc/server.h | 1 + net/tipc/subscr.c | 46 ++++++++++++---------------------------------- net/tipc/subscr.h | 14 +++++++------- 4 files changed, 22 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index 5d231fa8e4b5..6a18b10ac271 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -203,7 +203,7 @@ static void tipc_con_delete_sub(struct tipc_conn *con, struct tipc_subscr *s) spin_lock_bh(&con->sub_lock); list_for_each_entry_safe(sub, tmp, sub_list, subscrp_list) { if (!s || !memcmp(s, &sub->evt.s, sizeof(*s))) - tipc_sub_delete(sub); + tipc_sub_unsubscribe(sub); else if (s) break; } @@ -278,7 +278,7 @@ static int tipc_con_rcv_sub(struct tipc_server *srv, tipc_con_delete_sub(con, s); return 0; } - sub = tipc_subscrp_subscribe(srv, s, con->conid); + sub = tipc_sub_subscribe(srv, s, con->conid); if (!sub) return -1; diff --git a/net/tipc/server.h b/net/tipc/server.h index 2de8709b467d..995b79591ffe 100644 --- a/net/tipc/server.h +++ b/net/tipc/server.h @@ -2,6 +2,7 @@ * net/tipc/server.h: Include file for TIPC server code * * Copyright (c) 2012-2013, Wind River Systems + * Copyright (c) 2017, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 406b09fc227b..8d37b61e3163 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -134,33 +134,29 @@ void tipc_subscrp_get(struct tipc_subscription *subscription) kref_get(&subscription->kref); } -static struct tipc_subscription *tipc_subscrp_create(struct tipc_server *srv, - struct tipc_subscr *s, - int conid) +struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv, + struct tipc_subscr *s, + int conid) { struct tipc_net *tn = tipc_net(srv->net); struct tipc_subscription *sub; u32 filter = tipc_sub_read(s, filter); + u32 timeout; - /* Refuse subscription if global limit exceeded */ - if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCRIPTIONS) { - pr_warn("Subscription rejected, limit reached (%u)\n", - TIPC_MAX_SUBSCRIPTIONS); + if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCR) { + pr_warn("Subscription rejected, max (%u)\n", TIPC_MAX_SUBSCR); + return NULL; + } + if ((filter & TIPC_SUB_PORTS && filter & TIPC_SUB_SERVICE) || + (tipc_sub_read(s, seq.lower) > tipc_sub_read(s, seq.upper))) { + pr_warn("Subscription rejected, illegal request\n"); return NULL; } - - /* Allocate subscription object */ sub = kmalloc(sizeof(*sub), GFP_ATOMIC); if (!sub) { pr_warn("Subscription rejected, no memory\n"); return NULL; } - - /* Initialize subscription object */ - if (filter & TIPC_SUB_PORTS && filter & TIPC_SUB_SERVICE) - goto err; - if (tipc_sub_read(s, seq.lower) > tipc_sub_read(s, seq.upper)) - goto err; sub->server = srv; sub->conid = conid; sub->inactive = false; @@ -168,24 +164,6 @@ static struct tipc_subscription *tipc_subscrp_create(struct tipc_server *srv, spin_lock_init(&sub->lock); atomic_inc(&tn->subscription_count); kref_init(&sub->kref); - return sub; -err: - pr_warn("Subscription rejected, illegal request\n"); - kfree(sub); - return NULL; -} - -struct tipc_subscription *tipc_subscrp_subscribe(struct tipc_server *srv, - struct tipc_subscr *s, - int conid) -{ - struct tipc_subscription *sub = NULL; - u32 timeout; - - sub = tipc_subscrp_create(srv, s, conid); - if (!sub) - return NULL; - tipc_nametbl_subscribe(sub); timer_setup(&sub->timer, tipc_subscrp_timeout, 0); timeout = tipc_sub_read(&sub->evt.s, timeout); @@ -194,7 +172,7 @@ struct tipc_subscription *tipc_subscrp_subscribe(struct tipc_server *srv, return sub; } -void tipc_sub_delete(struct tipc_subscription *sub) +void tipc_sub_unsubscribe(struct tipc_subscription *sub) { tipc_nametbl_unsubscribe(sub); if (sub->evt.s.timeout != TIPC_WAIT_FOREVER) diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index db80e41bba08..2d35f1046754 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -1,7 +1,7 @@ /* * net/tipc/subscr.h: Include file for TIPC network topology service * - * Copyright (c) 2003-2006, Ericsson AB + * Copyright (c) 2003-2017, Ericsson AB * Copyright (c) 2005-2007, 2012-2013, Wind River Systems * All rights reserved. * @@ -39,8 +39,8 @@ #include "server.h" -#define TIPC_MAX_SUBSCRIPTIONS 65535 -#define TIPC_MAX_PUBLICATIONS 65535 +#define TIPC_MAX_SUBSCR 65535 +#define TIPC_MAX_PUBLICATIONS 65535 struct tipc_subscription; struct tipc_conn; @@ -66,10 +66,10 @@ struct tipc_subscription { spinlock_t lock; /* serialize up/down and timer events */ }; -struct tipc_subscription *tipc_subscrp_subscribe(struct tipc_server *srv, - struct tipc_subscr *s, - int conid); -void tipc_sub_delete(struct tipc_subscription *sub); +struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv, + struct tipc_subscr *s, + int conid); +void tipc_sub_unsubscribe(struct tipc_subscription *sub); int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, u32 found_upper); -- cgit v1.2.3-70-g09d2 From da0a75e86ae230f92743c073843d3ea35bd061af Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Feb 2018 10:40:48 +0100 Subject: tipc: some prefix changes Since we now have removed struct tipc_subscriber from the code, and only struct tipc_subscription remains, there is no longer need for long and awkward prefixes to distinguish between their pertaining functions. We now change all tipc_subscrp_* prefixes to tipc_sub_*. This is a purely cosmetic change. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/name_table.c | 33 ++++++++++++++++---------------- net/tipc/server.c | 5 ++--- net/tipc/subscr.c | 52 +++++++++++++++++++++++++-------------------------- net/tipc/subscr.h | 20 ++++++++++---------- 4 files changed, 54 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 2fbd0a20a958..b234b7ee0b84 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -326,10 +326,10 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net, /* Any subscriptions waiting for notification? */ list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { - tipc_subscrp_report_overlap(s, publ->lower, publ->upper, - TIPC_PUBLISHED, publ->ref, - publ->node, publ->scope, - created_subseq); + tipc_sub_report_overlap(s, publ->lower, publ->upper, + TIPC_PUBLISHED, publ->ref, + publ->node, publ->scope, + created_subseq); } return publ; } @@ -397,10 +397,9 @@ found: /* Notify any waiting subscriptions */ list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { - tipc_subscrp_report_overlap(s, publ->lower, publ->upper, - TIPC_WITHDRAWN, publ->ref, - publ->node, publ->scope, - removed_subseq); + tipc_sub_report_overlap(s, publ->lower, publ->upper, + TIPC_WITHDRAWN, publ->ref, publ->node, + publ->scope, removed_subseq); } return publ; @@ -424,25 +423,25 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, ns.upper = tipc_sub_read(s, seq.upper); no_status = tipc_sub_read(s, filter) & TIPC_SUB_NO_STATUS; - tipc_subscrp_get(sub); + tipc_sub_get(sub); list_add(&sub->nameseq_list, &nseq->subscriptions); if (no_status || !sseq) return; while (sseq != &nseq->sseqs[nseq->first_free]) { - if (tipc_subscrp_check_overlap(&ns, sseq->lower, sseq->upper)) { + if (tipc_sub_check_overlap(&ns, sseq->lower, sseq->upper)) { struct publication *crs; struct name_info *info = sseq->info; int must_report = 1; list_for_each_entry(crs, &info->zone_list, zone_list) { - tipc_subscrp_report_overlap(sub, sseq->lower, - sseq->upper, - TIPC_PUBLISHED, - crs->ref, crs->node, - crs->scope, - must_report); + tipc_sub_report_overlap(sub, sseq->lower, + sseq->upper, + TIPC_PUBLISHED, + crs->ref, crs->node, + crs->scope, + must_report); must_report = 0; } } @@ -856,7 +855,7 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *sub) if (seq != NULL) { spin_lock_bh(&seq->lock); list_del_init(&sub->nameseq_list); - tipc_subscrp_put(sub); + tipc_sub_put(sub); if (!seq->first_free && list_empty(&seq->subscriptions)) { hlist_del_init_rcu(&seq->ns_list); kfree(seq->sseqs); diff --git a/net/tipc/server.c b/net/tipc/server.c index 6a18b10ac271..a5c112ed3bbe 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -201,7 +201,7 @@ static void tipc_con_delete_sub(struct tipc_conn *con, struct tipc_subscr *s) struct tipc_subscription *sub, *tmp; spin_lock_bh(&con->sub_lock); - list_for_each_entry_safe(sub, tmp, sub_list, subscrp_list) { + list_for_each_entry_safe(sub, tmp, sub_list, sub_list) { if (!s || !memcmp(s, &sub->evt.s, sizeof(*s))) tipc_sub_unsubscribe(sub); else if (s) @@ -281,9 +281,8 @@ static int tipc_con_rcv_sub(struct tipc_server *srv, sub = tipc_sub_subscribe(srv, s, con->conid); if (!sub) return -1; - spin_lock_bh(&con->sub_lock); - list_add(&sub->subscrp_list, &con->sub_list); + list_add(&sub->sub_list, &con->sub_list); spin_unlock_bh(&con->sub_lock); return 0; } diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 8d37b61e3163..3be1e4b6cbfa 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -38,9 +38,9 @@ #include "name_table.h" #include "subscr.h" -static void tipc_subscrp_send_event(struct tipc_subscription *sub, - u32 found_lower, u32 found_upper, - u32 event, u32 port, u32 node) +static void tipc_sub_send_event(struct tipc_subscription *sub, + u32 found_lower, u32 found_upper, + u32 event, u32 port, u32 node) { struct tipc_event *evt = &sub->evt; @@ -55,13 +55,13 @@ static void tipc_subscrp_send_event(struct tipc_subscription *sub, } /** - * tipc_subscrp_check_overlap - test for subscription overlap with the + * tipc_sub_check_overlap - test for subscription overlap with the * given values * * Returns 1 if there is overlap, otherwise 0. */ -int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, - u32 found_upper) +int tipc_sub_check_overlap(struct tipc_name_seq *seq, u32 found_lower, + u32 found_upper) { if (found_lower < seq->lower) found_lower = seq->lower; @@ -72,20 +72,20 @@ int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, return 1; } -void tipc_subscrp_report_overlap(struct tipc_subscription *sub, - u32 found_lower, u32 found_upper, - u32 event, u32 port, u32 node, - u32 scope, int must) +void tipc_sub_report_overlap(struct tipc_subscription *sub, + u32 found_lower, u32 found_upper, + u32 event, u32 port, u32 node, + u32 scope, int must) { - struct tipc_name_seq seq; struct tipc_subscr *s = &sub->evt.s; u32 filter = tipc_sub_read(s, filter); + struct tipc_name_seq seq; seq.type = tipc_sub_read(s, seq.type); seq.lower = tipc_sub_read(s, seq.lower); seq.upper = tipc_sub_read(s, seq.upper); - if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper)) + if (!tipc_sub_check_overlap(&seq, found_lower, found_upper)) return; if (!must && !(filter & TIPC_SUB_PORTS)) @@ -95,24 +95,24 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, if (filter & TIPC_SUB_NODE_SCOPE && scope != TIPC_NODE_SCOPE) return; spin_lock(&sub->lock); - tipc_subscrp_send_event(sub, found_lower, found_upper, - event, port, node); + tipc_sub_send_event(sub, found_lower, found_upper, + event, port, node); spin_unlock(&sub->lock); } -static void tipc_subscrp_timeout(struct timer_list *t) +static void tipc_sub_timeout(struct timer_list *t) { struct tipc_subscription *sub = from_timer(sub, t, timer); struct tipc_subscr *s = &sub->evt.s; spin_lock(&sub->lock); - tipc_subscrp_send_event(sub, s->seq.lower, s->seq.upper, - TIPC_SUBSCR_TIMEOUT, 0, 0); + tipc_sub_send_event(sub, s->seq.lower, s->seq.upper, + TIPC_SUBSCR_TIMEOUT, 0, 0); sub->inactive = true; spin_unlock(&sub->lock); } -static void tipc_subscrp_kref_release(struct kref *kref) +static void tipc_sub_kref_release(struct kref *kref) { struct tipc_subscription *sub; struct tipc_net *tn; @@ -124,12 +124,12 @@ static void tipc_subscrp_kref_release(struct kref *kref) kfree(sub); } -void tipc_subscrp_put(struct tipc_subscription *subscription) +void tipc_sub_put(struct tipc_subscription *subscription) { - kref_put(&subscription->kref, tipc_subscrp_kref_release); + kref_put(&subscription->kref, tipc_sub_kref_release); } -void tipc_subscrp_get(struct tipc_subscription *subscription) +void tipc_sub_get(struct tipc_subscription *subscription) { kref_get(&subscription->kref); } @@ -139,8 +139,8 @@ struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv, int conid) { struct tipc_net *tn = tipc_net(srv->net); - struct tipc_subscription *sub; u32 filter = tipc_sub_read(s, filter); + struct tipc_subscription *sub; u32 timeout; if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCR) { @@ -165,7 +165,7 @@ struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv, atomic_inc(&tn->subscription_count); kref_init(&sub->kref); tipc_nametbl_subscribe(sub); - timer_setup(&sub->timer, tipc_subscrp_timeout, 0); + timer_setup(&sub->timer, tipc_sub_timeout, 0); timeout = tipc_sub_read(&sub->evt.s, timeout); if (timeout != TIPC_WAIT_FOREVER) mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); @@ -177,16 +177,16 @@ void tipc_sub_unsubscribe(struct tipc_subscription *sub) tipc_nametbl_unsubscribe(sub); if (sub->evt.s.timeout != TIPC_WAIT_FOREVER) del_timer_sync(&sub->timer); - list_del(&sub->subscrp_list); - tipc_subscrp_put(sub); + list_del(&sub->sub_list); + tipc_sub_put(sub); } int tipc_topsrv_start(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); const char name[] = "topology_server"; - struct tipc_server *topsrv; struct sockaddr_tipc *saddr; + struct tipc_server *topsrv; saddr = kzalloc(sizeof(*saddr), GFP_ATOMIC); if (!saddr) diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index 2d35f1046754..720932896ba6 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -51,7 +51,7 @@ struct tipc_conn; * @seq: name sequence associated with subscription * @timer: timer governing subscription duration (optional) * @nameseq_list: adjacent subscriptions in name sequence's subscription list - * @subscrp_list: adjacent subscriptions in subscriber's subscription list + * @sub_list: adjacent subscriptions in subscriber's subscription list * @evt: template for events generated by subscription */ struct tipc_subscription { @@ -59,7 +59,7 @@ struct tipc_subscription { struct tipc_server *server; struct timer_list timer; struct list_head nameseq_list; - struct list_head subscrp_list; + struct list_head sub_list; struct tipc_event evt; int conid; bool inactive; @@ -71,17 +71,17 @@ struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv, int conid); void tipc_sub_unsubscribe(struct tipc_subscription *sub); -int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower, - u32 found_upper); -void tipc_subscrp_report_overlap(struct tipc_subscription *sub, - u32 found_lower, u32 found_upper, - u32 event, u32 port, u32 node, - u32 scope, int must); +int tipc_sub_check_overlap(struct tipc_name_seq *seq, u32 found_lower, + u32 found_upper); +void tipc_sub_report_overlap(struct tipc_subscription *sub, + u32 found_lower, u32 found_upper, + u32 event, u32 port, u32 node, + u32 scope, int must); int tipc_topsrv_start(struct net *net); void tipc_topsrv_stop(struct net *net); -void tipc_subscrp_put(struct tipc_subscription *subscription); -void tipc_subscrp_get(struct tipc_subscription *subscription); +void tipc_sub_put(struct tipc_subscription *subscription); +void tipc_sub_get(struct tipc_subscription *subscription); #define TIPC_FILTER_MASK (TIPC_SUB_PORTS | TIPC_SUB_SERVICE | TIPC_SUB_CANCEL) -- cgit v1.2.3-70-g09d2 From 5c45ab24ac77ea32eae7d3576cf37c3ddb259f80 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Feb 2018 10:40:49 +0100 Subject: tipc: make struct tipc_server private for server.c In order to narrow the interface and dependencies between the topology server and the subscription/binding table functionality we move struct tipc_server inside the file server.c. This requires some code adaptations in other files, but those are mostly minor. The most important change is that we have to move the start/stop functions for the topology server to server.c, where they logically belong anyway. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/name_table.c | 10 ++-- net/tipc/server.c | 124 ++++++++++++++++++++++++++++++++++++++++---------- net/tipc/server.h | 36 +-------------- net/tipc/subscr.c | 64 ++------------------------ net/tipc/subscr.h | 4 +- 5 files changed, 110 insertions(+), 128 deletions(-) (limited to 'net') diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index b234b7ee0b84..e01c9c691ba2 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -813,8 +813,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, */ void tipc_nametbl_subscribe(struct tipc_subscription *sub) { - struct tipc_server *srv = sub->server; - struct tipc_net *tn = tipc_net(srv->net); + struct tipc_net *tn = tipc_net(sub->net); struct tipc_subscr *s = &sub->evt.s; u32 type = tipc_sub_read(s, seq.type); int index = hash(type); @@ -822,7 +821,7 @@ void tipc_nametbl_subscribe(struct tipc_subscription *sub) struct tipc_name_seq ns; spin_lock_bh(&tn->nametbl_lock); - seq = nametbl_find_seq(srv->net, type); + seq = nametbl_find_seq(sub->net, type); if (!seq) seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]); if (seq) { @@ -844,14 +843,13 @@ void tipc_nametbl_subscribe(struct tipc_subscription *sub) */ void tipc_nametbl_unsubscribe(struct tipc_subscription *sub) { - struct tipc_server *srv = sub->server; struct tipc_subscr *s = &sub->evt.s; - struct tipc_net *tn = tipc_net(srv->net); + struct tipc_net *tn = tipc_net(sub->net); struct name_seq *seq; u32 type = tipc_sub_read(s, seq.type); spin_lock_bh(&tn->nametbl_lock); - seq = nametbl_find_seq(srv->net, type); + seq = nametbl_find_seq(sub->net, type); if (seq != NULL) { spin_lock_bh(&seq->lock); list_del_init(&sub->nameseq_list); diff --git a/net/tipc/server.c b/net/tipc/server.c index a5c112ed3bbe..0abbdd698662 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -49,7 +49,37 @@ #define CF_CONNECTED 1 #define CF_SERVER 2 -#define sock2con(x) ((struct tipc_conn *)(x)->sk_user_data) +#define TIPC_SERVER_NAME_LEN 32 + +/** + * struct tipc_server - TIPC server structure + * @conn_idr: identifier set of connection + * @idr_lock: protect the connection identifier set + * @idr_in_use: amount of allocated identifier entry + * @net: network namspace instance + * @rcvbuf_cache: memory cache of server receive buffer + * @rcv_wq: receive workqueue + * @send_wq: send workqueue + * @max_rcvbuf_size: maximum permitted receive message length + * @tipc_conn_new: callback will be called when new connection is incoming + * @tipc_conn_release: callback will be called before releasing the connection + * @tipc_conn_recvmsg: callback will be called when message arrives + * @saddr: TIPC server address + * @name: server name + * @imp: message importance + * @type: socket type + */ +struct tipc_server { + struct idr conn_idr; + spinlock_t idr_lock; /* for idr list */ + int idr_in_use; + struct net *net; + struct workqueue_struct *rcv_wq; + struct workqueue_struct *send_wq; + int max_rcvbuf_size; + struct sockaddr_tipc *saddr; + char name[TIPC_SERVER_NAME_LEN]; +}; /** * struct tipc_conn - TIPC connection structure @@ -93,6 +123,11 @@ static void tipc_recv_work(struct work_struct *work); static void tipc_send_work(struct work_struct *work); static void tipc_clean_outqueues(struct tipc_conn *con); +static struct tipc_conn *sock2con(struct sock *sk) +{ + return sk->sk_user_data; +} + static bool connected(struct tipc_conn *con) { return con && test_bit(CF_CONNECTED, &con->flags); @@ -198,14 +233,17 @@ static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con) static void tipc_con_delete_sub(struct tipc_conn *con, struct tipc_subscr *s) { struct list_head *sub_list = &con->sub_list; + struct tipc_net *tn = tipc_net(con->server->net); struct tipc_subscription *sub, *tmp; spin_lock_bh(&con->sub_lock); list_for_each_entry_safe(sub, tmp, sub_list, sub_list) { - if (!s || !memcmp(s, &sub->evt.s, sizeof(*s))) + if (!s || !memcmp(s, &sub->evt.s, sizeof(*s))) { tipc_sub_unsubscribe(sub); - else if (s) + atomic_dec(&tn->subscription_count); + } else if (s) { break; + } } spin_unlock_bh(&con->sub_lock); } @@ -220,8 +258,7 @@ static void tipc_close_conn(struct tipc_conn *con) if (disconnect) { sk->sk_user_data = NULL; - if (con->conid) - tipc_con_delete_sub(con, NULL); + tipc_con_delete_sub(con, NULL); } write_unlock_bh(&sk->sk_callback_lock); @@ -272,15 +309,21 @@ static int tipc_con_rcv_sub(struct tipc_server *srv, struct tipc_conn *con, struct tipc_subscr *s) { + struct tipc_net *tn = tipc_net(srv->net); struct tipc_subscription *sub; if (tipc_sub_read(s, filter) & TIPC_SUB_CANCEL) { tipc_con_delete_sub(con, s); return 0; } - sub = tipc_sub_subscribe(srv, s, con->conid); + if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCR) { + pr_warn("Subscription rejected, max (%u)\n", TIPC_MAX_SUBSCR); + return -1; + } + sub = tipc_sub_subscribe(srv->net, s, con->conid); if (!sub) return -1; + atomic_inc(&tn->subscription_count); spin_lock_bh(&con->sub_lock); list_add(&sub->sub_list, &con->sub_list); spin_unlock_bh(&con->sub_lock); @@ -426,13 +469,14 @@ static void tipc_clean_outqueues(struct tipc_conn *con) /* tipc_conn_queue_evt - interrupt level call from a subscription instance * The queued job is launched in tipc_send_to_sock() */ -void tipc_conn_queue_evt(struct tipc_server *s, int conid, +void tipc_conn_queue_evt(struct net *net, int conid, u32 event, struct tipc_event *evt) { + struct tipc_server *srv = tipc_topsrv(net); struct outqueue_entry *e; struct tipc_conn *con; - con = tipc_conn_lookup(s, conid); + con = tipc_conn_lookup(srv, conid); if (!con) return; @@ -448,7 +492,7 @@ void tipc_conn_queue_evt(struct tipc_server *s, int conid, list_add_tail(&e->list, &con->outqueue); spin_unlock_bh(&con->outqueue_lock); - if (queue_work(s->send_wq, &con->swork)) + if (queue_work(srv->send_wq, &con->swork)) return; err: conn_put(con); @@ -620,41 +664,71 @@ static int tipc_work_start(struct tipc_server *s) return 0; } -int tipc_server_start(struct tipc_server *s) +int tipc_topsrv_start(struct net *net) { + struct tipc_net *tn = tipc_net(net); + const char name[] = "topology_server"; + struct sockaddr_tipc *saddr; + struct tipc_server *srv; int ret; - spin_lock_init(&s->idr_lock); - idr_init(&s->conn_idr); - s->idr_in_use = 0; + saddr = kzalloc(sizeof(*saddr), GFP_ATOMIC); + if (!saddr) + return -ENOMEM; + saddr->family = AF_TIPC; + saddr->addrtype = TIPC_ADDR_NAMESEQ; + saddr->addr.nameseq.type = TIPC_TOP_SRV; + saddr->addr.nameseq.lower = TIPC_TOP_SRV; + saddr->addr.nameseq.upper = TIPC_TOP_SRV; + saddr->scope = TIPC_NODE_SCOPE; + + srv = kzalloc(sizeof(*srv), GFP_ATOMIC); + if (!srv) { + kfree(saddr); + return -ENOMEM; + } + srv->net = net; + srv->saddr = saddr; + srv->max_rcvbuf_size = sizeof(struct tipc_subscr); + + strncpy(srv->name, name, strlen(name) + 1); + tn->topsrv = srv; + atomic_set(&tn->subscription_count, 0); + + spin_lock_init(&srv->idr_lock); + idr_init(&srv->conn_idr); + srv->idr_in_use = 0; - ret = tipc_work_start(s); + ret = tipc_work_start(srv); if (ret < 0) return ret; - ret = tipc_open_listening_sock(s); + ret = tipc_open_listening_sock(srv); if (ret < 0) - tipc_work_stop(s); + tipc_work_stop(srv); return ret; } -void tipc_server_stop(struct tipc_server *s) +void tipc_topsrv_stop(struct net *net) { + struct tipc_server *srv = tipc_topsrv(net); struct tipc_conn *con; int id; - spin_lock_bh(&s->idr_lock); - for (id = 0; s->idr_in_use; id++) { - con = idr_find(&s->conn_idr, id); + spin_lock_bh(&srv->idr_lock); + for (id = 0; srv->idr_in_use; id++) { + con = idr_find(&srv->conn_idr, id); if (con) { - spin_unlock_bh(&s->idr_lock); + spin_unlock_bh(&srv->idr_lock); tipc_close_conn(con); - spin_lock_bh(&s->idr_lock); + spin_lock_bh(&srv->idr_lock); } } - spin_unlock_bh(&s->idr_lock); + spin_unlock_bh(&srv->idr_lock); - tipc_work_stop(s); - idr_destroy(&s->conn_idr); + tipc_work_stop(srv); + idr_destroy(&srv->conn_idr); + kfree(srv->saddr); + kfree(srv); } diff --git a/net/tipc/server.h b/net/tipc/server.h index 995b79591ffe..ce93b6d68e41 100644 --- a/net/tipc/server.h +++ b/net/tipc/server.h @@ -47,45 +47,11 @@ #define TIPC_SUB_NODE_SCOPE 0x40 #define TIPC_SUB_NO_STATUS 0x80 -/** - * struct tipc_server - TIPC server structure - * @conn_idr: identifier set of connection - * @idr_lock: protect the connection identifier set - * @idr_in_use: amount of allocated identifier entry - * @net: network namspace instance - * @rcvbuf_cache: memory cache of server receive buffer - * @rcv_wq: receive workqueue - * @send_wq: send workqueue - * @max_rcvbuf_size: maximum permitted receive message length - * @tipc_conn_new: callback will be called when new connection is incoming - * @tipc_conn_release: callback will be called before releasing the connection - * @tipc_conn_recvmsg: callback will be called when message arrives - * @saddr: TIPC server address - * @name: server name - * @imp: message importance - * @type: socket type - */ -struct tipc_server { - struct idr conn_idr; - spinlock_t idr_lock; - int idr_in_use; - struct net *net; - struct workqueue_struct *rcv_wq; - struct workqueue_struct *send_wq; - int max_rcvbuf_size; - struct sockaddr_tipc *saddr; - char name[TIPC_SERVER_NAME_LEN]; -}; - -void tipc_conn_queue_evt(struct tipc_server *s, int conid, +void tipc_conn_queue_evt(struct net *net, int conid, u32 event, struct tipc_event *evt); bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, u32 upper, u32 filter, int *conid); void tipc_topsrv_kern_unsubscr(struct net *net, int conid); -int tipc_server_start(struct tipc_server *s); - -void tipc_server_stop(struct tipc_server *s); - #endif diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 3be1e4b6cbfa..c8146568d04e 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -51,7 +51,7 @@ static void tipc_sub_send_event(struct tipc_subscription *sub, tipc_evt_write(evt, found_upper, found_upper); tipc_evt_write(evt, port.ref, port); tipc_evt_write(evt, port.node, node); - tipc_conn_queue_evt(sub->server, sub->conid, event, evt); + tipc_conn_queue_evt(sub->net, sub->conid, event, evt); } /** @@ -114,14 +114,7 @@ static void tipc_sub_timeout(struct timer_list *t) static void tipc_sub_kref_release(struct kref *kref) { - struct tipc_subscription *sub; - struct tipc_net *tn; - - sub = container_of(kref, struct tipc_subscription, kref); - tn = tipc_net(sub->server->net); - - atomic_dec(&tn->subscription_count); - kfree(sub); + kfree(container_of(kref, struct tipc_subscription, kref)); } void tipc_sub_put(struct tipc_subscription *subscription) @@ -134,19 +127,14 @@ void tipc_sub_get(struct tipc_subscription *subscription) kref_get(&subscription->kref); } -struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv, +struct tipc_subscription *tipc_sub_subscribe(struct net *net, struct tipc_subscr *s, int conid) { - struct tipc_net *tn = tipc_net(srv->net); u32 filter = tipc_sub_read(s, filter); struct tipc_subscription *sub; u32 timeout; - if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCR) { - pr_warn("Subscription rejected, max (%u)\n", TIPC_MAX_SUBSCR); - return NULL; - } if ((filter & TIPC_SUB_PORTS && filter & TIPC_SUB_SERVICE) || (tipc_sub_read(s, seq.lower) > tipc_sub_read(s, seq.upper))) { pr_warn("Subscription rejected, illegal request\n"); @@ -157,12 +145,11 @@ struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv, pr_warn("Subscription rejected, no memory\n"); return NULL; } - sub->server = srv; + sub->net = net; sub->conid = conid; sub->inactive = false; memcpy(&sub->evt.s, s, sizeof(*s)); spin_lock_init(&sub->lock); - atomic_inc(&tn->subscription_count); kref_init(&sub->kref); tipc_nametbl_subscribe(sub); timer_setup(&sub->timer, tipc_sub_timeout, 0); @@ -180,46 +167,3 @@ void tipc_sub_unsubscribe(struct tipc_subscription *sub) list_del(&sub->sub_list); tipc_sub_put(sub); } - -int tipc_topsrv_start(struct net *net) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - const char name[] = "topology_server"; - struct sockaddr_tipc *saddr; - struct tipc_server *topsrv; - - saddr = kzalloc(sizeof(*saddr), GFP_ATOMIC); - if (!saddr) - return -ENOMEM; - saddr->family = AF_TIPC; - saddr->addrtype = TIPC_ADDR_NAMESEQ; - saddr->addr.nameseq.type = TIPC_TOP_SRV; - saddr->addr.nameseq.lower = TIPC_TOP_SRV; - saddr->addr.nameseq.upper = TIPC_TOP_SRV; - saddr->scope = TIPC_NODE_SCOPE; - - topsrv = kzalloc(sizeof(*topsrv), GFP_ATOMIC); - if (!topsrv) { - kfree(saddr); - return -ENOMEM; - } - topsrv->net = net; - topsrv->saddr = saddr; - topsrv->max_rcvbuf_size = sizeof(struct tipc_subscr); - - strncpy(topsrv->name, name, strlen(name) + 1); - tn->topsrv = topsrv; - atomic_set(&tn->subscription_count, 0); - - return tipc_server_start(topsrv); -} - -void tipc_topsrv_stop(struct net *net) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_server *topsrv = tn->topsrv; - - tipc_server_stop(topsrv); - kfree(topsrv->saddr); - kfree(topsrv); -} diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index 720932896ba6..82ba61afe638 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -56,7 +56,7 @@ struct tipc_conn; */ struct tipc_subscription { struct kref kref; - struct tipc_server *server; + struct net *net; struct timer_list timer; struct list_head nameseq_list; struct list_head sub_list; @@ -66,7 +66,7 @@ struct tipc_subscription { spinlock_t lock; /* serialize up/down and timer events */ }; -struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv, +struct tipc_subscription *tipc_sub_subscribe(struct net *net, struct tipc_subscr *s, int conid); void tipc_sub_unsubscribe(struct tipc_subscription *sub); -- cgit v1.2.3-70-g09d2 From 0ef897be12b8b4cf297b6016e79ec97ec90f2cf6 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Feb 2018 10:40:50 +0100 Subject: tipc: separate topology server listener socket from subcsriber sockets We move the listener socket to struct tipc_server and give it its own work item. This makes it easier to follow the code, and entails some simplifications in the reception code in subscriber sockets. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/server.c | 328 ++++++++++++++++++++++++------------------------------ 1 file changed, 147 insertions(+), 181 deletions(-) (limited to 'net') diff --git a/net/tipc/server.c b/net/tipc/server.c index 0abbdd698662..0e351e81690e 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -64,7 +64,6 @@ * @tipc_conn_new: callback will be called when new connection is incoming * @tipc_conn_release: callback will be called before releasing the connection * @tipc_conn_recvmsg: callback will be called when message arrives - * @saddr: TIPC server address * @name: server name * @imp: message importance * @type: socket type @@ -74,10 +73,11 @@ struct tipc_server { spinlock_t idr_lock; /* for idr list */ int idr_in_use; struct net *net; + struct work_struct awork; struct workqueue_struct *rcv_wq; struct workqueue_struct *send_wq; int max_rcvbuf_size; - struct sockaddr_tipc *saddr; + struct socket *listener; char name[TIPC_SERVER_NAME_LEN]; }; @@ -106,7 +106,6 @@ struct tipc_conn { struct list_head sub_list; spinlock_t sub_lock; /* for subscription list */ struct work_struct rwork; - int (*rx_action) (struct tipc_conn *con); struct list_head outqueue; spinlock_t outqueue_lock; struct work_struct swork; @@ -121,12 +120,6 @@ struct outqueue_entry { static void tipc_recv_work(struct work_struct *work); static void tipc_send_work(struct work_struct *work); -static void tipc_clean_outqueues(struct tipc_conn *con); - -static struct tipc_conn *sock2con(struct sock *sk) -{ - return sk->sk_user_data; -} static bool connected(struct tipc_conn *con) { @@ -137,21 +130,21 @@ static void tipc_conn_kref_release(struct kref *kref) { struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); struct tipc_server *s = con->server; - struct socket *sock = con->sock; + struct outqueue_entry *e, *safe; - if (sock) { - if (test_bit(CF_SERVER, &con->flags)) { - __module_get(sock->ops->owner); - __module_get(sock->sk->sk_prot_creator->owner); - } - sock_release(sock); - con->sock = NULL; - } spin_lock_bh(&s->idr_lock); idr_remove(&s->conn_idr, con->conid); s->idr_in_use--; spin_unlock_bh(&s->idr_lock); - tipc_clean_outqueues(con); + if (con->sock) + sock_release(con->sock); + + spin_lock_bh(&con->outqueue_lock); + list_for_each_entry_safe(e, safe, &con->outqueue, list) { + list_del(&e->list); + kfree(e); + } + spin_unlock_bh(&con->outqueue_lock); kfree(con); } @@ -178,14 +171,14 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid) } /* sock_data_ready - interrupt callback indicating the socket has data to read - * The queued job is launched in tipc_recv_from_sock() + * The queued work is launched into tipc_recv_work()->tipc_recv_from_sock() */ static void sock_data_ready(struct sock *sk) { struct tipc_conn *con; read_lock_bh(&sk->sk_callback_lock); - con = sock2con(sk); + con = sk->sk_user_data; if (connected(con)) { conn_get(con); if (!queue_work(con->server->rcv_wq, &con->rwork)) @@ -195,15 +188,15 @@ static void sock_data_ready(struct sock *sk) } /* sock_write_space - interrupt callback after a sendmsg EAGAIN - * Indicates that there now is more is space in the send buffer - * The queued job is launched in tipc_send_to_sock() + * Indicates that there now is more space in the send buffer + * The queued work is launched into tipc_send_work()->tipc_send_to_sock() */ static void sock_write_space(struct sock *sk) { struct tipc_conn *con; read_lock_bh(&sk->sk_callback_lock); - con = sock2con(sk); + con = sk->sk_user_data; if (connected(con)) { conn_get(con); if (!queue_work(con->server->send_wq, &con->swork)) @@ -212,23 +205,8 @@ static void sock_write_space(struct sock *sk) read_unlock_bh(&sk->sk_callback_lock); } -static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con) -{ - struct sock *sk = sock->sk; - - write_lock_bh(&sk->sk_callback_lock); - - sk->sk_data_ready = sock_data_ready; - sk->sk_write_space = sock_write_space; - sk->sk_user_data = con; - - con->sock = sock; - - write_unlock_bh(&sk->sk_callback_lock); -} - /* tipc_con_delete_sub - delete a specific or all subscriptions - * for a given subscriber + * for a given subscriber */ static void tipc_con_delete_sub(struct tipc_conn *con, struct tipc_subscr *s) { @@ -268,6 +246,7 @@ static void tipc_close_conn(struct tipc_conn *con) /* Don't flush pending works, -just let them expire */ kernel_sock_shutdown(con->sock, SHUT_RDWR); + conn_put(con); } @@ -357,117 +336,8 @@ static int tipc_receive_from_sock(struct tipc_conn *con) return ret; } -static int tipc_accept_from_sock(struct tipc_conn *con) -{ - struct socket *sock = con->sock; - struct socket *newsock; - struct tipc_conn *newcon; - int ret; - - ret = kernel_accept(sock, &newsock, O_NONBLOCK); - if (ret < 0) - return ret; - - newcon = tipc_alloc_conn(con->server); - if (IS_ERR(newcon)) { - ret = PTR_ERR(newcon); - sock_release(newsock); - return ret; - } - - newcon->rx_action = tipc_receive_from_sock; - tipc_register_callbacks(newsock, newcon); - - /* Wake up receive process in case of 'SYN+' message */ - newsock->sk->sk_data_ready(newsock->sk); - return ret; -} - -static struct socket *tipc_create_listen_sock(struct tipc_conn *con) -{ - struct tipc_server *s = con->server; - struct socket *sock = NULL; - int imp = TIPC_CRITICAL_IMPORTANCE; - int ret; - - ret = sock_create_kern(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock); - if (ret < 0) - return NULL; - ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE, - (char *)&imp, sizeof(imp)); - if (ret < 0) - goto create_err; - ret = kernel_bind(sock, (struct sockaddr *)s->saddr, sizeof(*s->saddr)); - if (ret < 0) - goto create_err; - - con->rx_action = tipc_accept_from_sock; - ret = kernel_listen(sock, 0); - if (ret < 0) - goto create_err; - - /* As server's listening socket owner and creator is the same module, - * we have to decrease TIPC module reference count to guarantee that - * it remains zero after the server socket is created, otherwise, - * executing "rmmod" command is unable to make TIPC module deleted - * after TIPC module is inserted successfully. - * - * However, the reference count is ever increased twice in - * sock_create_kern(): one is to increase the reference count of owner - * of TIPC socket's proto_ops struct; another is to increment the - * reference count of owner of TIPC proto struct. Therefore, we must - * decrement the module reference count twice to ensure that it keeps - * zero after server's listening socket is created. Of course, we - * must bump the module reference count twice as well before the socket - * is closed. - */ - module_put(sock->ops->owner); - module_put(sock->sk->sk_prot_creator->owner); - set_bit(CF_SERVER, &con->flags); - - return sock; - -create_err: - kernel_sock_shutdown(sock, SHUT_RDWR); - sock_release(sock); - return NULL; -} - -static int tipc_open_listening_sock(struct tipc_server *s) -{ - struct socket *sock; - struct tipc_conn *con; - - con = tipc_alloc_conn(s); - if (IS_ERR(con)) - return PTR_ERR(con); - - sock = tipc_create_listen_sock(con); - if (!sock) { - idr_remove(&s->conn_idr, con->conid); - s->idr_in_use--; - kfree(con); - return -EINVAL; - } - - tipc_register_callbacks(sock, con); - return 0; -} - -static void tipc_clean_outqueues(struct tipc_conn *con) -{ - struct outqueue_entry *e, *safe; - - spin_lock_bh(&con->outqueue_lock); - list_for_each_entry_safe(e, safe, &con->outqueue, list) { - list_del(&e->list); - kfree(e); - } - spin_unlock_bh(&con->outqueue_lock); -} - -/* tipc_conn_queue_evt - interrupt level call from a subscription instance - * The queued job is launched in tipc_send_to_sock() +/* tipc_conn_queue_evt() - interrupt level call from a subscription instance + * The queued work is launched into tipc_send_work()->tipc_send_to_sock() */ void tipc_conn_queue_evt(struct net *net, int conid, u32 event, struct tipc_event *evt) @@ -588,9 +458,9 @@ static void tipc_send_to_sock(struct tipc_conn *con) 1, sizeof(*evt)); if (ret == -EWOULDBLOCK || ret == 0) { cond_resched(); - goto out; + return; } else if (ret < 0) { - goto err; + return tipc_close_conn(con); } } else { tipc_send_kern_top_evt(srv->net, evt); @@ -606,10 +476,6 @@ static void tipc_send_to_sock(struct tipc_conn *con) kfree(e); } spin_unlock_bh(&con->outqueue_lock); -out: - return; -err: - tipc_close_conn(con); } static void tipc_recv_work(struct work_struct *work) @@ -618,7 +484,7 @@ static void tipc_recv_work(struct work_struct *work) int count = 0; while (connected(con)) { - if (con->rx_action(con)) + if (tipc_receive_from_sock(con)) break; /* Don't flood Rx machine */ @@ -640,10 +506,113 @@ static void tipc_send_work(struct work_struct *work) conn_put(con); } -static void tipc_work_stop(struct tipc_server *s) +static void tipc_accept_from_sock(struct work_struct *work) { - destroy_workqueue(s->rcv_wq); - destroy_workqueue(s->send_wq); + struct tipc_server *srv = container_of(work, struct tipc_server, awork); + struct socket *lsock = srv->listener; + struct socket *newsock; + struct tipc_conn *con; + struct sock *newsk; + int ret; + + while (1) { + ret = kernel_accept(lsock, &newsock, O_NONBLOCK); + if (ret < 0) + return; + con = tipc_alloc_conn(srv); + if (IS_ERR(con)) { + ret = PTR_ERR(con); + sock_release(newsock); + return; + } + /* Register callbacks */ + newsk = newsock->sk; + write_lock_bh(&newsk->sk_callback_lock); + newsk->sk_data_ready = sock_data_ready; + newsk->sk_write_space = sock_write_space; + newsk->sk_user_data = con; + con->sock = newsock; + write_unlock_bh(&newsk->sk_callback_lock); + + /* Wake up receive process in case of 'SYN+' message */ + newsk->sk_data_ready(newsk); + } +} + +/* listener_sock_data_ready - interrupt callback indicating new connection + * The queued job is launched into tipc_accept_from_sock() + */ +static void listener_sock_data_ready(struct sock *sk) +{ + struct tipc_server *srv; + + read_lock_bh(&sk->sk_callback_lock); + srv = sk->sk_user_data; + if (srv->listener) + queue_work(srv->rcv_wq, &srv->awork); + read_unlock_bh(&sk->sk_callback_lock); +} + +static int tipc_create_listener_sock(struct tipc_server *srv) +{ + int imp = TIPC_CRITICAL_IMPORTANCE; + struct socket *lsock = NULL; + struct sockaddr_tipc saddr; + struct sock *sk; + int rc; + + rc = sock_create_kern(srv->net, AF_TIPC, SOCK_SEQPACKET, 0, &lsock); + if (rc < 0) + return rc; + + srv->listener = lsock; + sk = lsock->sk; + write_lock_bh(&sk->sk_callback_lock); + sk->sk_data_ready = listener_sock_data_ready; + sk->sk_user_data = srv; + write_unlock_bh(&sk->sk_callback_lock); + + rc = kernel_setsockopt(lsock, SOL_TIPC, TIPC_IMPORTANCE, + (char *)&imp, sizeof(imp)); + if (rc < 0) + goto err; + + saddr.family = AF_TIPC; + saddr.addrtype = TIPC_ADDR_NAMESEQ; + saddr.addr.nameseq.type = TIPC_TOP_SRV; + saddr.addr.nameseq.lower = TIPC_TOP_SRV; + saddr.addr.nameseq.upper = TIPC_TOP_SRV; + saddr.scope = TIPC_NODE_SCOPE; + + rc = kernel_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr)); + if (rc < 0) + goto err; + rc = kernel_listen(lsock, 0); + if (rc < 0) + goto err; + + /* As server's listening socket owner and creator is the same module, + * we have to decrease TIPC module reference count to guarantee that + * it remains zero after the server socket is created, otherwise, + * executing "rmmod" command is unable to make TIPC module deleted + * after TIPC module is inserted successfully. + * + * However, the reference count is ever increased twice in + * sock_create_kern(): one is to increase the reference count of owner + * of TIPC socket's proto_ops struct; another is to increment the + * reference count of owner of TIPC proto struct. Therefore, we must + * decrement the module reference count twice to ensure that it keeps + * zero after server's listening socket is created. Of course, we + * must bump the module reference count twice as well before the socket + * is closed. + */ + module_put(lsock->ops->owner); + module_put(sk->sk_prot_creator->owner); + + return 0; +err: + sock_release(lsock); + return -EINVAL; } static int tipc_work_start(struct tipc_server *s) @@ -664,32 +633,26 @@ static int tipc_work_start(struct tipc_server *s) return 0; } +static void tipc_work_stop(struct tipc_server *s) +{ + destroy_workqueue(s->rcv_wq); + destroy_workqueue(s->send_wq); +} + int tipc_topsrv_start(struct net *net) { struct tipc_net *tn = tipc_net(net); const char name[] = "topology_server"; - struct sockaddr_tipc *saddr; struct tipc_server *srv; int ret; - saddr = kzalloc(sizeof(*saddr), GFP_ATOMIC); - if (!saddr) - return -ENOMEM; - saddr->family = AF_TIPC; - saddr->addrtype = TIPC_ADDR_NAMESEQ; - saddr->addr.nameseq.type = TIPC_TOP_SRV; - saddr->addr.nameseq.lower = TIPC_TOP_SRV; - saddr->addr.nameseq.upper = TIPC_TOP_SRV; - saddr->scope = TIPC_NODE_SCOPE; - srv = kzalloc(sizeof(*srv), GFP_ATOMIC); - if (!srv) { - kfree(saddr); + if (!srv) return -ENOMEM; - } - srv->net = net; - srv->saddr = saddr; - srv->max_rcvbuf_size = sizeof(struct tipc_subscr); + + srv->net = net; + srv->max_rcvbuf_size = sizeof(struct tipc_subscr); + INIT_WORK(&srv->awork, tipc_accept_from_sock); strncpy(srv->name, name, strlen(name) + 1); tn->topsrv = srv; @@ -703,7 +666,7 @@ int tipc_topsrv_start(struct net *net) if (ret < 0) return ret; - ret = tipc_open_listening_sock(srv); + ret = tipc_create_listener_sock(srv); if (ret < 0) tipc_work_stop(srv); @@ -713,6 +676,7 @@ int tipc_topsrv_start(struct net *net) void tipc_topsrv_stop(struct net *net) { struct tipc_server *srv = tipc_topsrv(net); + struct socket *lsock = srv->listener; struct tipc_conn *con; int id; @@ -725,10 +689,12 @@ void tipc_topsrv_stop(struct net *net) spin_lock_bh(&srv->idr_lock); } } + __module_get(lsock->ops->owner); + __module_get(lsock->sk->sk_prot_creator->owner); + sock_release(lsock); + srv->listener = NULL; spin_unlock_bh(&srv->idr_lock); - tipc_work_stop(srv); idr_destroy(&srv->conn_idr); - kfree(srv->saddr); kfree(srv); } -- cgit v1.2.3-70-g09d2 From 026321c6d056a54b4145522492245d2b5913ee1d Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Feb 2018 10:40:51 +0100 Subject: tipc: rename tipc_server to tipc_topsrv We rename struct tipc_server to struct tipc_topsrv. This reflect its now specialized role as topology server. Accoringly, we change or add function prefixes to make it clearer which functionality those belong to. There are no functional changes in this commit. Acked-by: Ying.Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/Makefile | 2 +- net/tipc/core.h | 6 +- net/tipc/group.c | 2 +- net/tipc/server.c | 700 ----------------------------------------------------- net/tipc/server.h | 57 ----- net/tipc/subscr.c | 2 +- net/tipc/subscr.h | 2 +- net/tipc/topsrv.c | 702 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/tipc/topsrv.h | 54 +++++ 9 files changed, 763 insertions(+), 764 deletions(-) delete mode 100644 net/tipc/server.c delete mode 100644 net/tipc/server.h create mode 100644 net/tipc/topsrv.c create mode 100644 net/tipc/topsrv.h (limited to 'net') diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 37bb0bfbd936..1edb7192aa2f 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -9,7 +9,7 @@ tipc-y += addr.o bcast.o bearer.o \ core.o link.o discover.o msg.o \ name_distr.o subscr.o monitor.o name_table.o net.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ - server.o socket.o group.o + topsrv.o socket.o group.o tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o diff --git a/net/tipc/core.h b/net/tipc/core.h index 20b21af2ff14..ff8b071654f5 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -64,7 +64,7 @@ struct tipc_bearer; struct tipc_bc_base; struct tipc_link; struct tipc_name_table; -struct tipc_server; +struct tipc_topsrv; struct tipc_monitor; #define TIPC_MOD_VER "2.0.0" @@ -112,7 +112,7 @@ struct tipc_net { struct list_head dist_queue; /* Topology subscription server */ - struct tipc_server *topsrv; + struct tipc_topsrv *topsrv; atomic_t subscription_count; }; @@ -131,7 +131,7 @@ static inline struct list_head *tipc_nodes(struct net *net) return &tipc_net(net)->node_list; } -static inline struct tipc_server *tipc_topsrv(struct net *net) +static inline struct tipc_topsrv *tipc_topsrv(struct net *net) { return tipc_net(net)->topsrv; } diff --git a/net/tipc/group.c b/net/tipc/group.c index 122162a31816..03086ccb7746 100644 --- a/net/tipc/group.c +++ b/net/tipc/group.c @@ -37,7 +37,7 @@ #include "addr.h" #include "group.h" #include "bcast.h" -#include "server.h" +#include "topsrv.h" #include "msg.h" #include "socket.h" #include "node.h" diff --git a/net/tipc/server.c b/net/tipc/server.c deleted file mode 100644 index 0e351e81690e..000000000000 --- a/net/tipc/server.c +++ /dev/null @@ -1,700 +0,0 @@ -/* - * net/tipc/server.c: TIPC server infrastructure - * - * Copyright (c) 2012-2013, Wind River Systems - * Copyright (c) 2017, Ericsson AB - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "subscr.h" -#include "server.h" -#include "core.h" -#include "socket.h" -#include "addr.h" -#include "msg.h" -#include -#include - -/* Number of messages to send before rescheduling */ -#define MAX_SEND_MSG_COUNT 25 -#define MAX_RECV_MSG_COUNT 25 -#define CF_CONNECTED 1 -#define CF_SERVER 2 - -#define TIPC_SERVER_NAME_LEN 32 - -/** - * struct tipc_server - TIPC server structure - * @conn_idr: identifier set of connection - * @idr_lock: protect the connection identifier set - * @idr_in_use: amount of allocated identifier entry - * @net: network namspace instance - * @rcvbuf_cache: memory cache of server receive buffer - * @rcv_wq: receive workqueue - * @send_wq: send workqueue - * @max_rcvbuf_size: maximum permitted receive message length - * @tipc_conn_new: callback will be called when new connection is incoming - * @tipc_conn_release: callback will be called before releasing the connection - * @tipc_conn_recvmsg: callback will be called when message arrives - * @name: server name - * @imp: message importance - * @type: socket type - */ -struct tipc_server { - struct idr conn_idr; - spinlock_t idr_lock; /* for idr list */ - int idr_in_use; - struct net *net; - struct work_struct awork; - struct workqueue_struct *rcv_wq; - struct workqueue_struct *send_wq; - int max_rcvbuf_size; - struct socket *listener; - char name[TIPC_SERVER_NAME_LEN]; -}; - -/** - * struct tipc_conn - TIPC connection structure - * @kref: reference counter to connection object - * @conid: connection identifier - * @sock: socket handler associated with connection - * @flags: indicates connection state - * @server: pointer to connected server - * @sub_list: lsit to all pertaing subscriptions - * @sub_lock: lock protecting the subscription list - * @outqueue_lock: control access to the outqueue - * @rwork: receive work item - * @rx_action: what to do when connection socket is active - * @outqueue: pointer to first outbound message in queue - * @outqueue_lock: control access to the outqueue - * @swork: send work item - */ -struct tipc_conn { - struct kref kref; - int conid; - struct socket *sock; - unsigned long flags; - struct tipc_server *server; - struct list_head sub_list; - spinlock_t sub_lock; /* for subscription list */ - struct work_struct rwork; - struct list_head outqueue; - spinlock_t outqueue_lock; - struct work_struct swork; -}; - -/* An entry waiting to be sent */ -struct outqueue_entry { - bool inactive; - struct tipc_event evt; - struct list_head list; -}; - -static void tipc_recv_work(struct work_struct *work); -static void tipc_send_work(struct work_struct *work); - -static bool connected(struct tipc_conn *con) -{ - return con && test_bit(CF_CONNECTED, &con->flags); -} - -static void tipc_conn_kref_release(struct kref *kref) -{ - struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); - struct tipc_server *s = con->server; - struct outqueue_entry *e, *safe; - - spin_lock_bh(&s->idr_lock); - idr_remove(&s->conn_idr, con->conid); - s->idr_in_use--; - spin_unlock_bh(&s->idr_lock); - if (con->sock) - sock_release(con->sock); - - spin_lock_bh(&con->outqueue_lock); - list_for_each_entry_safe(e, safe, &con->outqueue, list) { - list_del(&e->list); - kfree(e); - } - spin_unlock_bh(&con->outqueue_lock); - kfree(con); -} - -static void conn_put(struct tipc_conn *con) -{ - kref_put(&con->kref, tipc_conn_kref_release); -} - -static void conn_get(struct tipc_conn *con) -{ - kref_get(&con->kref); -} - -static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid) -{ - struct tipc_conn *con; - - spin_lock_bh(&s->idr_lock); - con = idr_find(&s->conn_idr, conid); - if (!connected(con) || !kref_get_unless_zero(&con->kref)) - con = NULL; - spin_unlock_bh(&s->idr_lock); - return con; -} - -/* sock_data_ready - interrupt callback indicating the socket has data to read - * The queued work is launched into tipc_recv_work()->tipc_recv_from_sock() - */ -static void sock_data_ready(struct sock *sk) -{ - struct tipc_conn *con; - - read_lock_bh(&sk->sk_callback_lock); - con = sk->sk_user_data; - if (connected(con)) { - conn_get(con); - if (!queue_work(con->server->rcv_wq, &con->rwork)) - conn_put(con); - } - read_unlock_bh(&sk->sk_callback_lock); -} - -/* sock_write_space - interrupt callback after a sendmsg EAGAIN - * Indicates that there now is more space in the send buffer - * The queued work is launched into tipc_send_work()->tipc_send_to_sock() - */ -static void sock_write_space(struct sock *sk) -{ - struct tipc_conn *con; - - read_lock_bh(&sk->sk_callback_lock); - con = sk->sk_user_data; - if (connected(con)) { - conn_get(con); - if (!queue_work(con->server->send_wq, &con->swork)) - conn_put(con); - } - read_unlock_bh(&sk->sk_callback_lock); -} - -/* tipc_con_delete_sub - delete a specific or all subscriptions - * for a given subscriber - */ -static void tipc_con_delete_sub(struct tipc_conn *con, struct tipc_subscr *s) -{ - struct list_head *sub_list = &con->sub_list; - struct tipc_net *tn = tipc_net(con->server->net); - struct tipc_subscription *sub, *tmp; - - spin_lock_bh(&con->sub_lock); - list_for_each_entry_safe(sub, tmp, sub_list, sub_list) { - if (!s || !memcmp(s, &sub->evt.s, sizeof(*s))) { - tipc_sub_unsubscribe(sub); - atomic_dec(&tn->subscription_count); - } else if (s) { - break; - } - } - spin_unlock_bh(&con->sub_lock); -} - -static void tipc_close_conn(struct tipc_conn *con) -{ - struct sock *sk = con->sock->sk; - bool disconnect = false; - - write_lock_bh(&sk->sk_callback_lock); - disconnect = test_and_clear_bit(CF_CONNECTED, &con->flags); - - if (disconnect) { - sk->sk_user_data = NULL; - tipc_con_delete_sub(con, NULL); - } - write_unlock_bh(&sk->sk_callback_lock); - - /* Handle concurrent calls from sending and receiving threads */ - if (!disconnect) - return; - - /* Don't flush pending works, -just let them expire */ - kernel_sock_shutdown(con->sock, SHUT_RDWR); - - conn_put(con); -} - -static struct tipc_conn *tipc_alloc_conn(struct tipc_server *s) -{ - struct tipc_conn *con; - int ret; - - con = kzalloc(sizeof(struct tipc_conn), GFP_ATOMIC); - if (!con) - return ERR_PTR(-ENOMEM); - - kref_init(&con->kref); - INIT_LIST_HEAD(&con->outqueue); - INIT_LIST_HEAD(&con->sub_list); - spin_lock_init(&con->outqueue_lock); - spin_lock_init(&con->sub_lock); - INIT_WORK(&con->swork, tipc_send_work); - INIT_WORK(&con->rwork, tipc_recv_work); - - spin_lock_bh(&s->idr_lock); - ret = idr_alloc(&s->conn_idr, con, 0, 0, GFP_ATOMIC); - if (ret < 0) { - kfree(con); - spin_unlock_bh(&s->idr_lock); - return ERR_PTR(-ENOMEM); - } - con->conid = ret; - s->idr_in_use++; - spin_unlock_bh(&s->idr_lock); - - set_bit(CF_CONNECTED, &con->flags); - con->server = s; - - return con; -} - -static int tipc_con_rcv_sub(struct tipc_server *srv, - struct tipc_conn *con, - struct tipc_subscr *s) -{ - struct tipc_net *tn = tipc_net(srv->net); - struct tipc_subscription *sub; - - if (tipc_sub_read(s, filter) & TIPC_SUB_CANCEL) { - tipc_con_delete_sub(con, s); - return 0; - } - if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCR) { - pr_warn("Subscription rejected, max (%u)\n", TIPC_MAX_SUBSCR); - return -1; - } - sub = tipc_sub_subscribe(srv->net, s, con->conid); - if (!sub) - return -1; - atomic_inc(&tn->subscription_count); - spin_lock_bh(&con->sub_lock); - list_add(&sub->sub_list, &con->sub_list); - spin_unlock_bh(&con->sub_lock); - return 0; -} - -static int tipc_receive_from_sock(struct tipc_conn *con) -{ - struct tipc_server *srv = con->server; - struct sock *sk = con->sock->sk; - struct msghdr msg = {}; - struct tipc_subscr s; - struct kvec iov; - int ret; - - iov.iov_base = &s; - iov.iov_len = sizeof(s); - msg.msg_name = NULL; - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, iov.iov_len); - ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT); - if (ret == -EWOULDBLOCK) - return -EWOULDBLOCK; - if (ret > 0) { - read_lock_bh(&sk->sk_callback_lock); - ret = tipc_con_rcv_sub(srv, con, &s); - read_unlock_bh(&sk->sk_callback_lock); - } - if (ret < 0) - tipc_close_conn(con); - - return ret; -} - -/* tipc_conn_queue_evt() - interrupt level call from a subscription instance - * The queued work is launched into tipc_send_work()->tipc_send_to_sock() - */ -void tipc_conn_queue_evt(struct net *net, int conid, - u32 event, struct tipc_event *evt) -{ - struct tipc_server *srv = tipc_topsrv(net); - struct outqueue_entry *e; - struct tipc_conn *con; - - con = tipc_conn_lookup(srv, conid); - if (!con) - return; - - if (!connected(con)) - goto err; - - e = kmalloc(sizeof(*e), GFP_ATOMIC); - if (!e) - goto err; - e->inactive = (event == TIPC_SUBSCR_TIMEOUT); - memcpy(&e->evt, evt, sizeof(*evt)); - spin_lock_bh(&con->outqueue_lock); - list_add_tail(&e->list, &con->outqueue); - spin_unlock_bh(&con->outqueue_lock); - - if (queue_work(srv->send_wq, &con->swork)) - return; -err: - conn_put(con); -} - -bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, - u32 upper, u32 filter, int *conid) -{ - struct tipc_subscr sub; - struct tipc_conn *con; - int rc; - - sub.seq.type = type; - sub.seq.lower = lower; - sub.seq.upper = upper; - sub.timeout = TIPC_WAIT_FOREVER; - sub.filter = filter; - *(u32 *)&sub.usr_handle = port; - - con = tipc_alloc_conn(tipc_topsrv(net)); - if (IS_ERR(con)) - return false; - - *conid = con->conid; - con->sock = NULL; - rc = tipc_con_rcv_sub(tipc_topsrv(net), con, &sub); - if (rc < 0) - tipc_close_conn(con); - return !rc; -} - -void tipc_topsrv_kern_unsubscr(struct net *net, int conid) -{ - struct tipc_conn *con; - - con = tipc_conn_lookup(tipc_topsrv(net), conid); - if (!con) - return; - - test_and_clear_bit(CF_CONNECTED, &con->flags); - tipc_con_delete_sub(con, NULL); - conn_put(con); - conn_put(con); -} - -static void tipc_send_kern_top_evt(struct net *net, struct tipc_event *evt) -{ - u32 port = *(u32 *)&evt->s.usr_handle; - u32 self = tipc_own_addr(net); - struct sk_buff_head evtq; - struct sk_buff *skb; - - skb = tipc_msg_create(TOP_SRV, 0, INT_H_SIZE, sizeof(*evt), - self, self, port, port, 0); - if (!skb) - return; - msg_set_dest_droppable(buf_msg(skb), true); - memcpy(msg_data(buf_msg(skb)), evt, sizeof(*evt)); - skb_queue_head_init(&evtq); - __skb_queue_tail(&evtq, skb); - tipc_sk_rcv(net, &evtq); -} - -static void tipc_send_to_sock(struct tipc_conn *con) -{ - struct list_head *queue = &con->outqueue; - struct tipc_server *srv = con->server; - struct outqueue_entry *e; - struct tipc_event *evt; - struct msghdr msg; - struct kvec iov; - int count = 0; - int ret; - - spin_lock_bh(&con->outqueue_lock); - - while (!list_empty(queue)) { - e = list_first_entry(queue, struct outqueue_entry, list); - evt = &e->evt; - spin_unlock_bh(&con->outqueue_lock); - - if (e->inactive) - tipc_con_delete_sub(con, &evt->s); - - memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT; - iov.iov_base = evt; - iov.iov_len = sizeof(*evt); - msg.msg_name = NULL; - - if (con->sock) { - ret = kernel_sendmsg(con->sock, &msg, &iov, - 1, sizeof(*evt)); - if (ret == -EWOULDBLOCK || ret == 0) { - cond_resched(); - return; - } else if (ret < 0) { - return tipc_close_conn(con); - } - } else { - tipc_send_kern_top_evt(srv->net, evt); - } - - /* Don't starve users filling buffers */ - if (++count >= MAX_SEND_MSG_COUNT) { - cond_resched(); - count = 0; - } - spin_lock_bh(&con->outqueue_lock); - list_del(&e->list); - kfree(e); - } - spin_unlock_bh(&con->outqueue_lock); -} - -static void tipc_recv_work(struct work_struct *work) -{ - struct tipc_conn *con = container_of(work, struct tipc_conn, rwork); - int count = 0; - - while (connected(con)) { - if (tipc_receive_from_sock(con)) - break; - - /* Don't flood Rx machine */ - if (++count >= MAX_RECV_MSG_COUNT) { - cond_resched(); - count = 0; - } - } - conn_put(con); -} - -static void tipc_send_work(struct work_struct *work) -{ - struct tipc_conn *con = container_of(work, struct tipc_conn, swork); - - if (connected(con)) - tipc_send_to_sock(con); - - conn_put(con); -} - -static void tipc_accept_from_sock(struct work_struct *work) -{ - struct tipc_server *srv = container_of(work, struct tipc_server, awork); - struct socket *lsock = srv->listener; - struct socket *newsock; - struct tipc_conn *con; - struct sock *newsk; - int ret; - - while (1) { - ret = kernel_accept(lsock, &newsock, O_NONBLOCK); - if (ret < 0) - return; - con = tipc_alloc_conn(srv); - if (IS_ERR(con)) { - ret = PTR_ERR(con); - sock_release(newsock); - return; - } - /* Register callbacks */ - newsk = newsock->sk; - write_lock_bh(&newsk->sk_callback_lock); - newsk->sk_data_ready = sock_data_ready; - newsk->sk_write_space = sock_write_space; - newsk->sk_user_data = con; - con->sock = newsock; - write_unlock_bh(&newsk->sk_callback_lock); - - /* Wake up receive process in case of 'SYN+' message */ - newsk->sk_data_ready(newsk); - } -} - -/* listener_sock_data_ready - interrupt callback indicating new connection - * The queued job is launched into tipc_accept_from_sock() - */ -static void listener_sock_data_ready(struct sock *sk) -{ - struct tipc_server *srv; - - read_lock_bh(&sk->sk_callback_lock); - srv = sk->sk_user_data; - if (srv->listener) - queue_work(srv->rcv_wq, &srv->awork); - read_unlock_bh(&sk->sk_callback_lock); -} - -static int tipc_create_listener_sock(struct tipc_server *srv) -{ - int imp = TIPC_CRITICAL_IMPORTANCE; - struct socket *lsock = NULL; - struct sockaddr_tipc saddr; - struct sock *sk; - int rc; - - rc = sock_create_kern(srv->net, AF_TIPC, SOCK_SEQPACKET, 0, &lsock); - if (rc < 0) - return rc; - - srv->listener = lsock; - sk = lsock->sk; - write_lock_bh(&sk->sk_callback_lock); - sk->sk_data_ready = listener_sock_data_ready; - sk->sk_user_data = srv; - write_unlock_bh(&sk->sk_callback_lock); - - rc = kernel_setsockopt(lsock, SOL_TIPC, TIPC_IMPORTANCE, - (char *)&imp, sizeof(imp)); - if (rc < 0) - goto err; - - saddr.family = AF_TIPC; - saddr.addrtype = TIPC_ADDR_NAMESEQ; - saddr.addr.nameseq.type = TIPC_TOP_SRV; - saddr.addr.nameseq.lower = TIPC_TOP_SRV; - saddr.addr.nameseq.upper = TIPC_TOP_SRV; - saddr.scope = TIPC_NODE_SCOPE; - - rc = kernel_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr)); - if (rc < 0) - goto err; - rc = kernel_listen(lsock, 0); - if (rc < 0) - goto err; - - /* As server's listening socket owner and creator is the same module, - * we have to decrease TIPC module reference count to guarantee that - * it remains zero after the server socket is created, otherwise, - * executing "rmmod" command is unable to make TIPC module deleted - * after TIPC module is inserted successfully. - * - * However, the reference count is ever increased twice in - * sock_create_kern(): one is to increase the reference count of owner - * of TIPC socket's proto_ops struct; another is to increment the - * reference count of owner of TIPC proto struct. Therefore, we must - * decrement the module reference count twice to ensure that it keeps - * zero after server's listening socket is created. Of course, we - * must bump the module reference count twice as well before the socket - * is closed. - */ - module_put(lsock->ops->owner); - module_put(sk->sk_prot_creator->owner); - - return 0; -err: - sock_release(lsock); - return -EINVAL; -} - -static int tipc_work_start(struct tipc_server *s) -{ - s->rcv_wq = alloc_ordered_workqueue("tipc_rcv", 0); - if (!s->rcv_wq) { - pr_err("can't start tipc receive workqueue\n"); - return -ENOMEM; - } - - s->send_wq = alloc_ordered_workqueue("tipc_send", 0); - if (!s->send_wq) { - pr_err("can't start tipc send workqueue\n"); - destroy_workqueue(s->rcv_wq); - return -ENOMEM; - } - - return 0; -} - -static void tipc_work_stop(struct tipc_server *s) -{ - destroy_workqueue(s->rcv_wq); - destroy_workqueue(s->send_wq); -} - -int tipc_topsrv_start(struct net *net) -{ - struct tipc_net *tn = tipc_net(net); - const char name[] = "topology_server"; - struct tipc_server *srv; - int ret; - - srv = kzalloc(sizeof(*srv), GFP_ATOMIC); - if (!srv) - return -ENOMEM; - - srv->net = net; - srv->max_rcvbuf_size = sizeof(struct tipc_subscr); - INIT_WORK(&srv->awork, tipc_accept_from_sock); - - strncpy(srv->name, name, strlen(name) + 1); - tn->topsrv = srv; - atomic_set(&tn->subscription_count, 0); - - spin_lock_init(&srv->idr_lock); - idr_init(&srv->conn_idr); - srv->idr_in_use = 0; - - ret = tipc_work_start(srv); - if (ret < 0) - return ret; - - ret = tipc_create_listener_sock(srv); - if (ret < 0) - tipc_work_stop(srv); - - return ret; -} - -void tipc_topsrv_stop(struct net *net) -{ - struct tipc_server *srv = tipc_topsrv(net); - struct socket *lsock = srv->listener; - struct tipc_conn *con; - int id; - - spin_lock_bh(&srv->idr_lock); - for (id = 0; srv->idr_in_use; id++) { - con = idr_find(&srv->conn_idr, id); - if (con) { - spin_unlock_bh(&srv->idr_lock); - tipc_close_conn(con); - spin_lock_bh(&srv->idr_lock); - } - } - __module_get(lsock->ops->owner); - __module_get(lsock->sk->sk_prot_creator->owner); - sock_release(lsock); - srv->listener = NULL; - spin_unlock_bh(&srv->idr_lock); - tipc_work_stop(srv); - idr_destroy(&srv->conn_idr); - kfree(srv); -} diff --git a/net/tipc/server.h b/net/tipc/server.h deleted file mode 100644 index ce93b6d68e41..000000000000 --- a/net/tipc/server.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * net/tipc/server.h: Include file for TIPC server code - * - * Copyright (c) 2012-2013, Wind River Systems - * Copyright (c) 2017, Ericsson AB - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _TIPC_SERVER_H -#define _TIPC_SERVER_H - -#include "core.h" -#include -#include -#include - -#define TIPC_SERVER_NAME_LEN 32 -#define TIPC_SUB_CLUSTER_SCOPE 0x20 -#define TIPC_SUB_NODE_SCOPE 0x40 -#define TIPC_SUB_NO_STATUS 0x80 - -void tipc_conn_queue_evt(struct net *net, int conid, - u32 event, struct tipc_event *evt); - -bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, - u32 upper, u32 filter, int *conid); -void tipc_topsrv_kern_unsubscr(struct net *net, int conid); - -#endif diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index c8146568d04e..6925a989569b 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -51,7 +51,7 @@ static void tipc_sub_send_event(struct tipc_subscription *sub, tipc_evt_write(evt, found_upper, found_upper); tipc_evt_write(evt, port.ref, port); tipc_evt_write(evt, port.node, node); - tipc_conn_queue_evt(sub->net, sub->conid, event, evt); + tipc_topsrv_queue_evt(sub->net, sub->conid, event, evt); } /** diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index 82ba61afe638..8b2d22b18f22 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -37,7 +37,7 @@ #ifndef _TIPC_SUBSCR_H #define _TIPC_SUBSCR_H -#include "server.h" +#include "topsrv.h" #define TIPC_MAX_SUBSCR 65535 #define TIPC_MAX_PUBLICATIONS 65535 diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c new file mode 100644 index 000000000000..02013e00f287 --- /dev/null +++ b/net/tipc/topsrv.c @@ -0,0 +1,702 @@ +/* + * net/tipc/server.c: TIPC server infrastructure + * + * Copyright (c) 2012-2013, Wind River Systems + * Copyright (c) 2017-2018, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "subscr.h" +#include "topsrv.h" +#include "core.h" +#include "socket.h" +#include "addr.h" +#include "msg.h" +#include +#include + +/* Number of messages to send before rescheduling */ +#define MAX_SEND_MSG_COUNT 25 +#define MAX_RECV_MSG_COUNT 25 +#define CF_CONNECTED 1 +#define CF_SERVER 2 + +#define TIPC_SERVER_NAME_LEN 32 + +/** + * struct tipc_topsrv - TIPC server structure + * @conn_idr: identifier set of connection + * @idr_lock: protect the connection identifier set + * @idr_in_use: amount of allocated identifier entry + * @net: network namspace instance + * @rcvbuf_cache: memory cache of server receive buffer + * @rcv_wq: receive workqueue + * @send_wq: send workqueue + * @max_rcvbuf_size: maximum permitted receive message length + * @tipc_conn_new: callback will be called when new connection is incoming + * @tipc_conn_release: callback will be called before releasing the connection + * @tipc_conn_recvmsg: callback will be called when message arrives + * @name: server name + * @imp: message importance + * @type: socket type + */ +struct tipc_topsrv { + struct idr conn_idr; + spinlock_t idr_lock; /* for idr list */ + int idr_in_use; + struct net *net; + struct work_struct awork; + struct workqueue_struct *rcv_wq; + struct workqueue_struct *send_wq; + int max_rcvbuf_size; + struct socket *listener; + char name[TIPC_SERVER_NAME_LEN]; +}; + +/** + * struct tipc_conn - TIPC connection structure + * @kref: reference counter to connection object + * @conid: connection identifier + * @sock: socket handler associated with connection + * @flags: indicates connection state + * @server: pointer to connected server + * @sub_list: lsit to all pertaing subscriptions + * @sub_lock: lock protecting the subscription list + * @outqueue_lock: control access to the outqueue + * @rwork: receive work item + * @rx_action: what to do when connection socket is active + * @outqueue: pointer to first outbound message in queue + * @outqueue_lock: control access to the outqueue + * @swork: send work item + */ +struct tipc_conn { + struct kref kref; + int conid; + struct socket *sock; + unsigned long flags; + struct tipc_topsrv *server; + struct list_head sub_list; + spinlock_t sub_lock; /* for subscription list */ + struct work_struct rwork; + struct list_head outqueue; + spinlock_t outqueue_lock; /* for outqueue */ + struct work_struct swork; +}; + +/* An entry waiting to be sent */ +struct outqueue_entry { + bool inactive; + struct tipc_event evt; + struct list_head list; +}; + +static void tipc_conn_recv_work(struct work_struct *work); +static void tipc_conn_send_work(struct work_struct *work); +static void tipc_topsrv_kern_evt(struct net *net, struct tipc_event *evt); +static void tipc_conn_delete_sub(struct tipc_conn *con, struct tipc_subscr *s); + +static bool connected(struct tipc_conn *con) +{ + return con && test_bit(CF_CONNECTED, &con->flags); +} + +static void tipc_conn_kref_release(struct kref *kref) +{ + struct tipc_conn *con = container_of(kref, struct tipc_conn, kref); + struct tipc_topsrv *s = con->server; + struct outqueue_entry *e, *safe; + + spin_lock_bh(&s->idr_lock); + idr_remove(&s->conn_idr, con->conid); + s->idr_in_use--; + spin_unlock_bh(&s->idr_lock); + if (con->sock) + sock_release(con->sock); + + spin_lock_bh(&con->outqueue_lock); + list_for_each_entry_safe(e, safe, &con->outqueue, list) { + list_del(&e->list); + kfree(e); + } + spin_unlock_bh(&con->outqueue_lock); + kfree(con); +} + +static void conn_put(struct tipc_conn *con) +{ + kref_put(&con->kref, tipc_conn_kref_release); +} + +static void conn_get(struct tipc_conn *con) +{ + kref_get(&con->kref); +} + +static void tipc_conn_close(struct tipc_conn *con) +{ + struct sock *sk = con->sock->sk; + bool disconnect = false; + + write_lock_bh(&sk->sk_callback_lock); + disconnect = test_and_clear_bit(CF_CONNECTED, &con->flags); + + if (disconnect) { + sk->sk_user_data = NULL; + tipc_conn_delete_sub(con, NULL); + } + write_unlock_bh(&sk->sk_callback_lock); + + /* Handle concurrent calls from sending and receiving threads */ + if (!disconnect) + return; + + /* Don't flush pending works, -just let them expire */ + kernel_sock_shutdown(con->sock, SHUT_RDWR); + + conn_put(con); +} + +static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s) +{ + struct tipc_conn *con; + int ret; + + con = kzalloc(sizeof(*con), GFP_ATOMIC); + if (!con) + return ERR_PTR(-ENOMEM); + + kref_init(&con->kref); + INIT_LIST_HEAD(&con->outqueue); + INIT_LIST_HEAD(&con->sub_list); + spin_lock_init(&con->outqueue_lock); + spin_lock_init(&con->sub_lock); + INIT_WORK(&con->swork, tipc_conn_send_work); + INIT_WORK(&con->rwork, tipc_conn_recv_work); + + spin_lock_bh(&s->idr_lock); + ret = idr_alloc(&s->conn_idr, con, 0, 0, GFP_ATOMIC); + if (ret < 0) { + kfree(con); + spin_unlock_bh(&s->idr_lock); + return ERR_PTR(-ENOMEM); + } + con->conid = ret; + s->idr_in_use++; + spin_unlock_bh(&s->idr_lock); + + set_bit(CF_CONNECTED, &con->flags); + con->server = s; + + return con; +} + +static struct tipc_conn *tipc_conn_lookup(struct tipc_topsrv *s, int conid) +{ + struct tipc_conn *con; + + spin_lock_bh(&s->idr_lock); + con = idr_find(&s->conn_idr, conid); + if (!connected(con) || !kref_get_unless_zero(&con->kref)) + con = NULL; + spin_unlock_bh(&s->idr_lock); + return con; +} + +/* tipc_conn_delete_sub - delete a specific or all subscriptions + * for a given subscriber + */ +static void tipc_conn_delete_sub(struct tipc_conn *con, struct tipc_subscr *s) +{ + struct tipc_net *tn = tipc_net(con->server->net); + struct list_head *sub_list = &con->sub_list; + struct tipc_subscription *sub, *tmp; + + spin_lock_bh(&con->sub_lock); + list_for_each_entry_safe(sub, tmp, sub_list, sub_list) { + if (!s || !memcmp(s, &sub->evt.s, sizeof(*s))) { + tipc_sub_unsubscribe(sub); + atomic_dec(&tn->subscription_count); + } else if (s) { + break; + } + } + spin_unlock_bh(&con->sub_lock); +} + +static void tipc_conn_send_to_sock(struct tipc_conn *con) +{ + struct list_head *queue = &con->outqueue; + struct tipc_topsrv *srv = con->server; + struct outqueue_entry *e; + struct tipc_event *evt; + struct msghdr msg; + struct kvec iov; + int count = 0; + int ret; + + spin_lock_bh(&con->outqueue_lock); + + while (!list_empty(queue)) { + e = list_first_entry(queue, struct outqueue_entry, list); + evt = &e->evt; + spin_unlock_bh(&con->outqueue_lock); + + if (e->inactive) + tipc_conn_delete_sub(con, &evt->s); + + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT; + iov.iov_base = evt; + iov.iov_len = sizeof(*evt); + msg.msg_name = NULL; + + if (con->sock) { + ret = kernel_sendmsg(con->sock, &msg, &iov, + 1, sizeof(*evt)); + if (ret == -EWOULDBLOCK || ret == 0) { + cond_resched(); + return; + } else if (ret < 0) { + return tipc_conn_close(con); + } + } else { + tipc_topsrv_kern_evt(srv->net, evt); + } + + /* Don't starve users filling buffers */ + if (++count >= MAX_SEND_MSG_COUNT) { + cond_resched(); + count = 0; + } + spin_lock_bh(&con->outqueue_lock); + list_del(&e->list); + kfree(e); + } + spin_unlock_bh(&con->outqueue_lock); +} + +static void tipc_conn_send_work(struct work_struct *work) +{ + struct tipc_conn *con = container_of(work, struct tipc_conn, swork); + + if (connected(con)) + tipc_conn_send_to_sock(con); + + conn_put(con); +} + +/* tipc_conn_queue_evt() - interrupt level call from a subscription instance + * The queued work is launched into tipc_send_work()->tipc_send_to_sock() + */ +void tipc_topsrv_queue_evt(struct net *net, int conid, + u32 event, struct tipc_event *evt) +{ + struct tipc_topsrv *srv = tipc_topsrv(net); + struct outqueue_entry *e; + struct tipc_conn *con; + + con = tipc_conn_lookup(srv, conid); + if (!con) + return; + + if (!connected(con)) + goto err; + + e = kmalloc(sizeof(*e), GFP_ATOMIC); + if (!e) + goto err; + e->inactive = (event == TIPC_SUBSCR_TIMEOUT); + memcpy(&e->evt, evt, sizeof(*evt)); + spin_lock_bh(&con->outqueue_lock); + list_add_tail(&e->list, &con->outqueue); + spin_unlock_bh(&con->outqueue_lock); + + if (queue_work(srv->send_wq, &con->swork)) + return; +err: + conn_put(con); +} + +/* tipc_conn_write_space - interrupt callback after a sendmsg EAGAIN + * Indicates that there now is more space in the send buffer + * The queued work is launched into tipc_send_work()->tipc_conn_send_to_sock() + */ +static void tipc_conn_write_space(struct sock *sk) +{ + struct tipc_conn *con; + + read_lock_bh(&sk->sk_callback_lock); + con = sk->sk_user_data; + if (connected(con)) { + conn_get(con); + if (!queue_work(con->server->send_wq, &con->swork)) + conn_put(con); + } + read_unlock_bh(&sk->sk_callback_lock); +} + +static int tipc_conn_rcv_sub(struct tipc_topsrv *srv, + struct tipc_conn *con, + struct tipc_subscr *s) +{ + struct tipc_net *tn = tipc_net(srv->net); + struct tipc_subscription *sub; + + if (tipc_sub_read(s, filter) & TIPC_SUB_CANCEL) { + tipc_conn_delete_sub(con, s); + return 0; + } + if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCR) { + pr_warn("Subscription rejected, max (%u)\n", TIPC_MAX_SUBSCR); + return -1; + } + sub = tipc_sub_subscribe(srv->net, s, con->conid); + if (!sub) + return -1; + atomic_inc(&tn->subscription_count); + spin_lock_bh(&con->sub_lock); + list_add(&sub->sub_list, &con->sub_list); + spin_unlock_bh(&con->sub_lock); + return 0; +} + +static int tipc_conn_rcv_from_sock(struct tipc_conn *con) +{ + struct tipc_topsrv *srv = con->server; + struct sock *sk = con->sock->sk; + struct msghdr msg = {}; + struct tipc_subscr s; + struct kvec iov; + int ret; + + iov.iov_base = &s; + iov.iov_len = sizeof(s); + msg.msg_name = NULL; + iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, iov.iov_len); + ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT); + if (ret == -EWOULDBLOCK) + return -EWOULDBLOCK; + if (ret > 0) { + read_lock_bh(&sk->sk_callback_lock); + ret = tipc_conn_rcv_sub(srv, con, &s); + read_unlock_bh(&sk->sk_callback_lock); + } + if (ret < 0) + tipc_conn_close(con); + + return ret; +} + +static void tipc_conn_recv_work(struct work_struct *work) +{ + struct tipc_conn *con = container_of(work, struct tipc_conn, rwork); + int count = 0; + + while (connected(con)) { + if (tipc_conn_rcv_from_sock(con)) + break; + + /* Don't flood Rx machine */ + if (++count >= MAX_RECV_MSG_COUNT) { + cond_resched(); + count = 0; + } + } + conn_put(con); +} + +/* tipc_conn_data_ready - interrupt callback indicating the socket has data + * The queued work is launched into tipc_recv_work()->tipc_conn_rcv_from_sock() + */ +static void tipc_conn_data_ready(struct sock *sk) +{ + struct tipc_conn *con; + + read_lock_bh(&sk->sk_callback_lock); + con = sk->sk_user_data; + if (connected(con)) { + conn_get(con); + if (!queue_work(con->server->rcv_wq, &con->rwork)) + conn_put(con); + } + read_unlock_bh(&sk->sk_callback_lock); +} + +static void tipc_topsrv_accept(struct work_struct *work) +{ + struct tipc_topsrv *srv = container_of(work, struct tipc_topsrv, awork); + struct socket *lsock = srv->listener; + struct socket *newsock; + struct tipc_conn *con; + struct sock *newsk; + int ret; + + while (1) { + ret = kernel_accept(lsock, &newsock, O_NONBLOCK); + if (ret < 0) + return; + con = tipc_conn_alloc(srv); + if (IS_ERR(con)) { + ret = PTR_ERR(con); + sock_release(newsock); + return; + } + /* Register callbacks */ + newsk = newsock->sk; + write_lock_bh(&newsk->sk_callback_lock); + newsk->sk_data_ready = tipc_conn_data_ready; + newsk->sk_write_space = tipc_conn_write_space; + newsk->sk_user_data = con; + con->sock = newsock; + write_unlock_bh(&newsk->sk_callback_lock); + + /* Wake up receive process in case of 'SYN+' message */ + newsk->sk_data_ready(newsk); + } +} + +/* tipc_toprsv_listener_data_ready - interrupt callback with connection request + * The queued job is launched into tipc_topsrv_accept() + */ +static void tipc_topsrv_listener_data_ready(struct sock *sk) +{ + struct tipc_topsrv *srv; + + read_lock_bh(&sk->sk_callback_lock); + srv = sk->sk_user_data; + if (srv->listener) + queue_work(srv->rcv_wq, &srv->awork); + read_unlock_bh(&sk->sk_callback_lock); +} + +static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) +{ + int imp = TIPC_CRITICAL_IMPORTANCE; + struct socket *lsock = NULL; + struct sockaddr_tipc saddr; + struct sock *sk; + int rc; + + rc = sock_create_kern(srv->net, AF_TIPC, SOCK_SEQPACKET, 0, &lsock); + if (rc < 0) + return rc; + + srv->listener = lsock; + sk = lsock->sk; + write_lock_bh(&sk->sk_callback_lock); + sk->sk_data_ready = tipc_topsrv_listener_data_ready; + sk->sk_user_data = srv; + write_unlock_bh(&sk->sk_callback_lock); + + rc = kernel_setsockopt(lsock, SOL_TIPC, TIPC_IMPORTANCE, + (char *)&imp, sizeof(imp)); + if (rc < 0) + goto err; + + saddr.family = AF_TIPC; + saddr.addrtype = TIPC_ADDR_NAMESEQ; + saddr.addr.nameseq.type = TIPC_TOP_SRV; + saddr.addr.nameseq.lower = TIPC_TOP_SRV; + saddr.addr.nameseq.upper = TIPC_TOP_SRV; + saddr.scope = TIPC_NODE_SCOPE; + + rc = kernel_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr)); + if (rc < 0) + goto err; + rc = kernel_listen(lsock, 0); + if (rc < 0) + goto err; + + /* As server's listening socket owner and creator is the same module, + * we have to decrease TIPC module reference count to guarantee that + * it remains zero after the server socket is created, otherwise, + * executing "rmmod" command is unable to make TIPC module deleted + * after TIPC module is inserted successfully. + * + * However, the reference count is ever increased twice in + * sock_create_kern(): one is to increase the reference count of owner + * of TIPC socket's proto_ops struct; another is to increment the + * reference count of owner of TIPC proto struct. Therefore, we must + * decrement the module reference count twice to ensure that it keeps + * zero after server's listening socket is created. Of course, we + * must bump the module reference count twice as well before the socket + * is closed. + */ + module_put(lsock->ops->owner); + module_put(sk->sk_prot_creator->owner); + + return 0; +err: + sock_release(lsock); + return -EINVAL; +} + +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, + u32 upper, u32 filter, int *conid) +{ + struct tipc_subscr sub; + struct tipc_conn *con; + int rc; + + sub.seq.type = type; + sub.seq.lower = lower; + sub.seq.upper = upper; + sub.timeout = TIPC_WAIT_FOREVER; + sub.filter = filter; + *(u32 *)&sub.usr_handle = port; + + con = tipc_conn_alloc(tipc_topsrv(net)); + if (IS_ERR(con)) + return false; + + *conid = con->conid; + con->sock = NULL; + rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub); + if (rc < 0) + tipc_conn_close(con); + return !rc; +} + +void tipc_topsrv_kern_unsubscr(struct net *net, int conid) +{ + struct tipc_conn *con; + + con = tipc_conn_lookup(tipc_topsrv(net), conid); + if (!con) + return; + + test_and_clear_bit(CF_CONNECTED, &con->flags); + tipc_conn_delete_sub(con, NULL); + conn_put(con); + conn_put(con); +} + +static void tipc_topsrv_kern_evt(struct net *net, struct tipc_event *evt) +{ + u32 port = *(u32 *)&evt->s.usr_handle; + u32 self = tipc_own_addr(net); + struct sk_buff_head evtq; + struct sk_buff *skb; + + skb = tipc_msg_create(TOP_SRV, 0, INT_H_SIZE, sizeof(*evt), + self, self, port, port, 0); + if (!skb) + return; + msg_set_dest_droppable(buf_msg(skb), true); + memcpy(msg_data(buf_msg(skb)), evt, sizeof(*evt)); + skb_queue_head_init(&evtq); + __skb_queue_tail(&evtq, skb); + tipc_sk_rcv(net, &evtq); +} + +static int tipc_topsrv_work_start(struct tipc_topsrv *s) +{ + s->rcv_wq = alloc_ordered_workqueue("tipc_rcv", 0); + if (!s->rcv_wq) { + pr_err("can't start tipc receive workqueue\n"); + return -ENOMEM; + } + + s->send_wq = alloc_ordered_workqueue("tipc_send", 0); + if (!s->send_wq) { + pr_err("can't start tipc send workqueue\n"); + destroy_workqueue(s->rcv_wq); + return -ENOMEM; + } + + return 0; +} + +static void tipc_topsrv_work_stop(struct tipc_topsrv *s) +{ + destroy_workqueue(s->rcv_wq); + destroy_workqueue(s->send_wq); +} + +int tipc_topsrv_start(struct net *net) +{ + struct tipc_net *tn = tipc_net(net); + const char name[] = "topology_server"; + struct tipc_topsrv *srv; + int ret; + + srv = kzalloc(sizeof(*srv), GFP_ATOMIC); + if (!srv) + return -ENOMEM; + + srv->net = net; + srv->max_rcvbuf_size = sizeof(struct tipc_subscr); + INIT_WORK(&srv->awork, tipc_topsrv_accept); + + strncpy(srv->name, name, strlen(name) + 1); + tn->topsrv = srv; + atomic_set(&tn->subscription_count, 0); + + spin_lock_init(&srv->idr_lock); + idr_init(&srv->conn_idr); + srv->idr_in_use = 0; + + ret = tipc_topsrv_work_start(srv); + if (ret < 0) + return ret; + + ret = tipc_topsrv_create_listener(srv); + if (ret < 0) + tipc_topsrv_work_stop(srv); + + return ret; +} + +void tipc_topsrv_stop(struct net *net) +{ + struct tipc_topsrv *srv = tipc_topsrv(net); + struct socket *lsock = srv->listener; + struct tipc_conn *con; + int id; + + spin_lock_bh(&srv->idr_lock); + for (id = 0; srv->idr_in_use; id++) { + con = idr_find(&srv->conn_idr, id); + if (con) { + spin_unlock_bh(&srv->idr_lock); + tipc_conn_close(con); + spin_lock_bh(&srv->idr_lock); + } + } + __module_get(lsock->ops->owner); + __module_get(lsock->sk->sk_prot_creator->owner); + sock_release(lsock); + srv->listener = NULL; + spin_unlock_bh(&srv->idr_lock); + tipc_topsrv_work_stop(srv); + idr_destroy(&srv->conn_idr); + kfree(srv); +} diff --git a/net/tipc/topsrv.h b/net/tipc/topsrv.h new file mode 100644 index 000000000000..c7ea71293748 --- /dev/null +++ b/net/tipc/topsrv.h @@ -0,0 +1,54 @@ +/* + * net/tipc/server.h: Include file for TIPC server code + * + * Copyright (c) 2012-2013, Wind River Systems + * Copyright (c) 2017, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_SERVER_H +#define _TIPC_SERVER_H + +#include "core.h" + +#define TIPC_SERVER_NAME_LEN 32 +#define TIPC_SUB_CLUSTER_SCOPE 0x20 +#define TIPC_SUB_NODE_SCOPE 0x40 +#define TIPC_SUB_NO_STATUS 0x80 + +void tipc_topsrv_queue_evt(struct net *net, int conid, + u32 event, struct tipc_event *evt); + +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, + u32 upper, u32 filter, int *conid); +void tipc_topsrv_kern_unsubscr(struct net *net, int conid); + +#endif -- cgit v1.2.3-70-g09d2 From 66dede2d6b2340235ca212532275446d7bb010fe Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 15 Feb 2018 15:50:57 +0100 Subject: net: sched: fix unbalance in the error path of tca_action_flush() When tca_action_flush() calls the action walk() and gets an error, a successful call to nla_nest_start() is not followed by a call to nla_nest_cancel(). It's harmless, as the skb is freed in the error path - but it's worth to fix this unbalance. Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_api.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 4886ea4a7d6e..624995564e5a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -938,8 +938,10 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, goto out_module_put; err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); - if (err <= 0) + if (err <= 0) { + nla_nest_cancel(skb, nest); goto out_module_put; + } nla_nest_end(skb, nest); -- cgit v1.2.3-70-g09d2 From b7b347fa3cd496ad5b4cbcc8ea2931847c4d0d78 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 15 Feb 2018 10:54:53 -0500 Subject: net: sched: act: fix code style This patch is used by subsequent patches. It fixes code style issues caught by checkpatch. Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/act_api.h | 5 +++-- net/sched/act_api.c | 12 ++++++------ net/sched/act_mirred.c | 6 +++--- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 6ed9692f20bd..32ef544f4ddc 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -87,12 +87,13 @@ struct tc_action_ops { struct tcf_result *); int (*dump)(struct sk_buff *, struct tc_action *, int, int); void (*cleanup)(struct tc_action *); - int (*lookup)(struct net *, struct tc_action **, u32); + int (*lookup)(struct net *net, struct tc_action **a, u32 index); int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int ovr, int bind); int (*walk)(struct net *, struct sk_buff *, - struct netlink_callback *, int, const struct tc_action_ops *); + struct netlink_callback *, int, + const struct tc_action_ops *); void (*stats_update)(struct tc_action *, u64, u32, u64); struct net_device *(*get_dev)(const struct tc_action *a); }; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 624995564e5a..a32e6c2edbf6 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -621,7 +621,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, goto err_out; err = -EINVAL; kind = tb[TCA_ACT_KIND]; - if (kind == NULL) + if (!kind) goto err_out; if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) goto err_out; @@ -822,7 +822,7 @@ static int tca_get_fill(struct sk_buff *skb, struct list_head *actions, t->tca__pad2 = 0; nest = nla_nest_start(skb, TCA_ACT_TAB); - if (nest == NULL) + if (!nest) goto out_nlmsg_trim; if (tcf_action_dump(skb, actions, bind, ref) < 0) @@ -934,7 +934,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, t->tca__pad2 = 0; nest = nla_nest_start(skb, TCA_ACT_TAB); - if (nest == NULL) + if (!nest) goto out_module_put; err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); @@ -1007,10 +1007,10 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, return ret; if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { - if (tb[1] != NULL) + if (tb[1]) return tca_action_flush(net, tb[1], n, portid); - else - return -EINVAL; + + return -EINVAL; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index e6ff88f72900..abcd5f12b913 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -80,12 +80,12 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, bool exists = false; int ret; - if (nla == NULL) + if (!nla) return -EINVAL; ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, NULL); if (ret < 0) return ret; - if (tb[TCA_MIRRED_PARMS] == NULL) + if (!tb[TCA_MIRRED_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_MIRRED_PARMS]); @@ -117,7 +117,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } if (!exists) { - if (dev == NULL) + if (!dev) return -EINVAL; ret = tcf_idr_create(tn, parm->index, est, a, &act_mirred_ops, bind, true); -- cgit v1.2.3-70-g09d2 From 10defbd29e6218c1cab5c217a9d808fc05e3938a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 15 Feb 2018 10:54:54 -0500 Subject: net: sched: act: add extack to init This patch adds extack to tcf_action_init and tcf_action_init_1 functions. These are necessary to make individual extack handling in each act implementation. Based on work by David Ahern Cc: David Ahern Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/act_api.h | 5 +++-- net/sched/act_api.c | 17 +++++++++++------ net/sched/cls_api.c | 4 ++-- 3 files changed, 16 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 32ef544f4ddc..41d95930ffbc 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -163,10 +163,11 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res); int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions); + struct list_head *actions, struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind); + char *name, int ovr, int bind, + struct netlink_ext_ack *extack); int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index a32e6c2edbf6..662574646256 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -605,7 +605,8 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind) + char *name, int ovr, int bind, + struct netlink_ext_ack *extack) { struct tc_action *a; struct tc_action_ops *a_o; @@ -726,7 +727,7 @@ static void cleanup_a(struct list_head *actions, int ovr) int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions) + struct list_head *actions, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; @@ -738,7 +739,8 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, return err; for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind); + act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, + extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@ -1062,12 +1064,14 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, } static int tcf_action_add(struct net *net, struct nlattr *nla, - struct nlmsghdr *n, u32 portid, int ovr) + struct nlmsghdr *n, u32 portid, int ovr, + struct netlink_ext_ack *extack) { int ret = 0; LIST_HEAD(actions); - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions); + ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions, + extack); if (ret) return ret; @@ -1115,7 +1119,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, if (n->nlmsg_flags & NLM_F_REPLACE) ovr = 1; replay: - ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr); + ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr, + extack); if (ret == -EAGAIN) goto replay; break; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2bc1bc23d42e..f21610c5da1a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1434,7 +1434,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, "police", ovr, - TCA_ACT_BIND); + TCA_ACT_BIND, extack); if (IS_ERR(act)) return PTR_ERR(act); @@ -1447,7 +1447,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, NULL, ovr, TCA_ACT_BIND, - &actions); + &actions, extack); if (err) return err; list_for_each_entry(act, &actions, list) -- cgit v1.2.3-70-g09d2 From ee99b2d8bf4ad6d03046a8c2f25bad7cfd9de64a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 16 Feb 2018 16:03:39 -0500 Subject: net: Revert sched action extack support series. It was mis-applied and the changes had rejects. Signed-off-by: David S. Miller --- include/net/act_api.h | 10 ++++------ net/sched/act_api.c | 29 ++++++++++++----------------- net/sched/act_mirred.c | 6 +++--- net/sched/cls_api.c | 4 ++-- 4 files changed, 21 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 41d95930ffbc..6ed9692f20bd 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -87,13 +87,12 @@ struct tc_action_ops { struct tcf_result *); int (*dump)(struct sk_buff *, struct tc_action *, int, int); void (*cleanup)(struct tc_action *); - int (*lookup)(struct net *net, struct tc_action **a, u32 index); + int (*lookup)(struct net *, struct tc_action **, u32); int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int ovr, int bind); int (*walk)(struct net *, struct sk_buff *, - struct netlink_callback *, int, - const struct tc_action_ops *); + struct netlink_callback *, int, const struct tc_action_ops *); void (*stats_update)(struct tc_action *, u64, u32, u64); struct net_device *(*get_dev)(const struct tc_action *a); }; @@ -163,11 +162,10 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res); int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions, struct netlink_ext_ack *extack); + struct list_head *actions); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind, - struct netlink_ext_ack *extack); + char *name, int ovr, int bind); int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 662574646256..624995564e5a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -605,8 +605,7 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind, - struct netlink_ext_ack *extack) + char *name, int ovr, int bind) { struct tc_action *a; struct tc_action_ops *a_o; @@ -622,7 +621,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, goto err_out; err = -EINVAL; kind = tb[TCA_ACT_KIND]; - if (!kind) + if (kind == NULL) goto err_out; if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) goto err_out; @@ -727,7 +726,7 @@ static void cleanup_a(struct list_head *actions, int ovr) int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions, struct netlink_ext_ack *extack) + struct list_head *actions) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; @@ -739,8 +738,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, return err; for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, - extack); + act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@ -824,7 +822,7 @@ static int tca_get_fill(struct sk_buff *skb, struct list_head *actions, t->tca__pad2 = 0; nest = nla_nest_start(skb, TCA_ACT_TAB); - if (!nest) + if (nest == NULL) goto out_nlmsg_trim; if (tcf_action_dump(skb, actions, bind, ref) < 0) @@ -936,7 +934,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, t->tca__pad2 = 0; nest = nla_nest_start(skb, TCA_ACT_TAB); - if (!nest) + if (nest == NULL) goto out_module_put; err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); @@ -1009,10 +1007,10 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, return ret; if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { - if (tb[1]) + if (tb[1] != NULL) return tca_action_flush(net, tb[1], n, portid); - - return -EINVAL; + else + return -EINVAL; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { @@ -1064,14 +1062,12 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, } static int tcf_action_add(struct net *net, struct nlattr *nla, - struct nlmsghdr *n, u32 portid, int ovr, - struct netlink_ext_ack *extack) + struct nlmsghdr *n, u32 portid, int ovr) { int ret = 0; LIST_HEAD(actions); - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions, - extack); + ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions); if (ret) return ret; @@ -1119,8 +1115,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, if (n->nlmsg_flags & NLM_F_REPLACE) ovr = 1; replay: - ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr, - extack); + ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr); if (ret == -EAGAIN) goto replay; break; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index abcd5f12b913..e6ff88f72900 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -80,12 +80,12 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, bool exists = false; int ret; - if (!nla) + if (nla == NULL) return -EINVAL; ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, NULL); if (ret < 0) return ret; - if (!tb[TCA_MIRRED_PARMS]) + if (tb[TCA_MIRRED_PARMS] == NULL) return -EINVAL; parm = nla_data(tb[TCA_MIRRED_PARMS]); @@ -117,7 +117,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } if (!exists) { - if (!dev) + if (dev == NULL) return -EINVAL; ret = tcf_idr_create(tn, parm->index, est, a, &act_mirred_ops, bind, true); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f21610c5da1a..2bc1bc23d42e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1434,7 +1434,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, "police", ovr, - TCA_ACT_BIND, extack); + TCA_ACT_BIND); if (IS_ERR(act)) return PTR_ERR(act); @@ -1447,7 +1447,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, NULL, ovr, TCA_ACT_BIND, - &actions, extack); + &actions); if (err) return err; list_for_each_entry(act, &actions, list) -- cgit v1.2.3-70-g09d2 From 6f89dbce8e1134458de8a8e376acaaca4eee602e Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:32 -0800 Subject: skbuff: export mm_[un]account_pinned_pages for other modules RDS would like to use the helper functions for managing pinned pages added by Commit a91dbff551a6 ("sock: ulimit on MSG_ZEROCOPY pages") Signed-off-by: Sowmini Varadhan Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ net/core/skbuff.c | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5ebc0f869720..b1cc38af53e1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -466,6 +466,9 @@ struct ubuf_info { #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) +int mm_account_pinned_pages(struct mmpin *mmp, size_t size); +void mm_unaccount_pinned_pages(struct mmpin *mmp); + struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size); struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 09bd89c90a71..1a7485a2cdfa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -890,7 +890,7 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); -static int mm_account_pinned_pages(struct mmpin *mmp, size_t size) +int mm_account_pinned_pages(struct mmpin *mmp, size_t size) { unsigned long max_pg, num_pg, new_pg, old_pg; struct user_struct *user; @@ -919,14 +919,16 @@ static int mm_account_pinned_pages(struct mmpin *mmp, size_t size) return 0; } +EXPORT_SYMBOL_GPL(mm_account_pinned_pages); -static void mm_unaccount_pinned_pages(struct mmpin *mmp) +void mm_unaccount_pinned_pages(struct mmpin *mmp) { if (mmp->user) { atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); free_uid(mmp->user); } } +EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) { -- cgit v1.2.3-70-g09d2 From ea8994cb0118993da179af836cc72a583d75dc4b Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:33 -0800 Subject: rds: hold a sock ref from rds_message to the rds_sock The existing model holds a reference from the rds_sock to the rds_message, but the rds_message does not itself hold a sock_put() on the rds_sock. Instead the m_rs field in the rds_message is assigned when the message is queued on the sock, and nulled when the message is dequeued from the sock. We want to be able to notify userspace when the rds_message is actually freed (from rds_message_purge(), after the refcounts to the rds_message go to 0). At the time that rds_message_purge() is called, the message is no longer on the rds_sock retransmit queue. Thus the explicit reference for the m_rs is needed to send a notification that will signal to userspace that it is now safe to free/reuse any pages that may have been pinned down for zerocopy. This patch manages the m_rs assignment in the rds_message with the necessary refcount book-keeping. Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/message.c | 8 +++++++- net/rds/send.c | 7 +------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index 4318cc9b78f7..ef3daafa3d79 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -58,7 +58,7 @@ EXPORT_SYMBOL_GPL(rds_message_addref); */ static void rds_message_purge(struct rds_message *rm) { - unsigned long i; + unsigned long i, flags; if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; @@ -69,6 +69,12 @@ static void rds_message_purge(struct rds_message *rm) __free_page(sg_page(&rm->data.op_sg[i])); } rm->data.op_nents = 0; + spin_lock_irqsave(&rm->m_rs_lock, flags); + if (rm->m_rs) { + sock_put(rds_rs_to_sk(rm->m_rs)); + rm->m_rs = NULL; + } + spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rm->rdma.op_active) rds_rdma_free_op(&rm->rdma); diff --git a/net/rds/send.c b/net/rds/send.c index b1b0022b8370..e8f3ff471b15 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -649,7 +649,6 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status) rm->rdma.op_notifier = NULL; } was_on_sock = 1; - rm->m_rs = NULL; } spin_unlock(&rs->rs_lock); @@ -756,9 +755,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) */ if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { spin_unlock_irqrestore(&cp->cp_lock, flags); - spin_lock_irqsave(&rm->m_rs_lock, flags); - rm->m_rs = NULL; - spin_unlock_irqrestore(&rm->m_rs_lock, flags); continue; } list_del_init(&rm->m_conn_item); @@ -774,7 +770,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); - rm->m_rs = NULL; spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); @@ -798,7 +793,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); - rm->m_rs = NULL; spin_unlock_irqrestore(&rm->m_rs_lock, flags); rds_message_put(rm); @@ -849,6 +843,7 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, list_add_tail(&rm->m_sock_item, &rs->rs_send_queue); set_bit(RDS_MSG_ON_SOCK, &rm->m_flags); rds_message_addref(rm); + sock_hold(rds_rs_to_sk(rs)); rm->m_rs = rs; /* The code ordering is a little weird, but we're -- cgit v1.2.3-70-g09d2 From 28190752c709272de3c2b6b092029da3f1614c5a Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:34 -0800 Subject: sock: permit SO_ZEROCOPY on PF_RDS socket allow the application to set SO_ZEROCOPY on the underlying sk of a PF_RDS socket Signed-off-by: Sowmini Varadhan Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/sock.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index e90d461748f0..a1fa4a548f1b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1049,18 +1049,21 @@ set_rcvbuf: break; case SO_ZEROCOPY: - if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) + if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { + if (sk->sk_protocol != IPPROTO_TCP) + ret = -ENOTSUPP; + else if (sk->sk_state != TCP_CLOSE) + ret = -EBUSY; + } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; - else if (sk->sk_protocol != IPPROTO_TCP) - ret = -ENOTSUPP; - else if (sk->sk_state != TCP_CLOSE) - ret = -EBUSY; - else if (val < 0 || val > 1) - ret = -EINVAL; - else - sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); - break; - + } + if (!ret) { + if (val < 0 || val > 1) + ret = -EINVAL; + else + sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); + break; + } default: ret = -ENOPROTOOPT; break; -- cgit v1.2.3-70-g09d2 From 01883eda72bd3f0a6c81447e4f223de14033fd9d Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:35 -0800 Subject: rds: support for zcopy completion notification RDS removes a datagram (rds_message) from the retransmit queue when an ACK is received. The ACK indicates that the receiver has queued the RDS datagram, so that the sender can safely forget the datagram. When all references to the rds_message are quiesced, rds_message_purge is called to release resources used by the rds_message If the datagram to be removed had pinned pages set up, add an entry to the rs->rs_znotify_queue so that the notifcation will be sent up via rds_rm_zerocopy_callback() when the rds_message is eventually freed by rds_message_purge. rds_rm_zerocopy_callback() attempts to batch the number of cookies sent with each notification to a max of SO_EE_ORIGIN_MAX_ZCOOKIES. This is achieved by checking the tail skb in the sk_error_queue: if this has room for one more cookie, the cookie from the current notification is added; else a new skb is added to the sk_error_queue. Every invocation of rds_rm_zerocopy_callback() will trigger a ->sk_error_report to notify the application. Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/errqueue.h | 2 ++ net/rds/af_rds.c | 2 ++ net/rds/message.c | 83 +++++++++++++++++++++++++++++++++++++++---- net/rds/rds.h | 14 ++++++++ net/rds/recv.c | 2 ++ 5 files changed, 96 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index dc64cfaf13da..28812eda4209 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -20,11 +20,13 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 #define SO_EE_ORIGIN_ZEROCOPY 5 +#define SO_EE_ORIGIN_ZCOOKIE 6 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) #define SO_EE_CODE_ZEROCOPY_COPIED 1 +#define SO_EE_ORIGIN_MAX_ZCOOKIES 8 /** * struct scm_timestamping - timestamps exposed through cmsg diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 0a8eefd256b3..a937f18896ae 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -182,6 +182,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, mask |= (EPOLLIN | EPOLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; read_unlock_irqrestore(&rs->rs_recv_lock, flags); /* clear state any time we wake a seen-congested socket */ diff --git a/net/rds/message.c b/net/rds/message.c index ef3daafa3d79..bf1a656b198a 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -33,6 +33,9 @@ #include #include #include +#include +#include +#include #include "rds.h" @@ -53,29 +56,95 @@ void rds_message_addref(struct rds_message *rm) } EXPORT_SYMBOL_GPL(rds_message_addref); +static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + int ncookies; + u32 *ptr; + + if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE) + return false; + ncookies = serr->ee.ee_data; + if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES) + return false; + ptr = skb_put(skb, sizeof(u32)); + *ptr = cookie; + serr->ee.ee_data = ++ncookies; + return true; +} + +static void rds_rm_zerocopy_callback(struct rds_sock *rs, + struct rds_znotifier *znotif) +{ + struct sock *sk = rds_rs_to_sk(rs); + struct sk_buff *skb, *tail; + struct sock_exterr_skb *serr; + unsigned long flags; + struct sk_buff_head *q; + u32 cookie = znotif->z_cookie; + + q = &sk->sk_error_queue; + spin_lock_irqsave(&q->lock, flags); + tail = skb_peek_tail(q); + + if (tail && skb_zcookie_add(tail, cookie)) { + spin_unlock_irqrestore(&q->lock, flags); + mm_unaccount_pinned_pages(&znotif->z_mmp); + consume_skb(rds_skb_from_znotifier(znotif)); + sk->sk_error_report(sk); + return; + } + + skb = rds_skb_from_znotifier(znotif); + serr = SKB_EXT_ERR(skb); + memset(&serr->ee, 0, sizeof(serr->ee)); + serr->ee.ee_errno = 0; + serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE; + serr->ee.ee_info = 0; + WARN_ON(!skb_zcookie_add(skb, cookie)); + + __skb_queue_tail(q, skb); + + spin_unlock_irqrestore(&q->lock, flags); + sk->sk_error_report(sk); + + mm_unaccount_pinned_pages(&znotif->z_mmp); +} + /* * This relies on dma_map_sg() not touching sg[].page during merging. */ static void rds_message_purge(struct rds_message *rm) { unsigned long i, flags; + bool zcopy = false; if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; - for (i = 0; i < rm->data.op_nents; i++) { - rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i])); - /* XXX will have to put_page for page refs */ - __free_page(sg_page(&rm->data.op_sg[i])); - } - rm->data.op_nents = 0; spin_lock_irqsave(&rm->m_rs_lock, flags); if (rm->m_rs) { - sock_put(rds_rs_to_sk(rm->m_rs)); + struct rds_sock *rs = rm->m_rs; + + if (rm->data.op_mmp_znotifier) { + zcopy = true; + rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier); + rm->data.op_mmp_znotifier = NULL; + } + sock_put(rds_rs_to_sk(rs)); rm->m_rs = NULL; } spin_unlock_irqrestore(&rm->m_rs_lock, flags); + for (i = 0; i < rm->data.op_nents; i++) { + /* XXX will have to put_page for page refs */ + if (!zcopy) + __free_page(sg_page(&rm->data.op_sg[i])); + else + put_page(sg_page(&rm->data.op_sg[i])); + } + rm->data.op_nents = 0; + if (rm->rdma.op_active) rds_rdma_free_op(&rm->rdma); if (rm->rdma.op_rdma_mr) diff --git a/net/rds/rds.h b/net/rds/rds.h index 7301b9b01890..24576bc4a5e9 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -356,6 +356,19 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) #define RDS_MSG_PAGEVEC 7 #define RDS_MSG_FLUSH 8 +struct rds_znotifier { + struct list_head z_list; + struct mmpin z_mmp; + u32 z_cookie; +}; + +#define RDS_ZCOPY_SKB(__skb) ((struct rds_znotifier *)&((__skb)->cb[0])) + +static inline struct sk_buff *rds_skb_from_znotifier(struct rds_znotifier *z) +{ + return container_of((void *)z, struct sk_buff, cb); +} + struct rds_message { refcount_t m_refcount; struct list_head m_sock_item; @@ -436,6 +449,7 @@ struct rds_message { unsigned int op_count; unsigned int op_dmasg; unsigned int op_dmaoff; + struct rds_znotifier *op_mmp_znotifier; struct scatterlist *op_sg; } data; }; diff --git a/net/rds/recv.c b/net/rds/recv.c index b25bcfe411ca..b080961464df 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -594,6 +594,8 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (msg_flags & MSG_OOB) goto out; + if (msg_flags & MSG_ERRQUEUE) + return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR); while (1) { /* If there are pending notifications, do those - and nothing else */ -- cgit v1.2.3-70-g09d2 From 0cebaccef3acbdfbc2d85880a2efb765d2f4e2e3 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:36 -0800 Subject: rds: zerocopy Tx support. If the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and, if the SO_ZEROCOPY socket option has been set on the PF_RDS socket, application pages sent down with rds_sendmsg() are pinned. The pinning uses the accounting infrastructure added by Commit a91dbff551a6 ("sock: ulimit on MSG_ZEROCOPY pages") The payload bytes in the message may not be modified for the duration that the message has been pinned. A multi-threaded application using this infrastructure may thus need to be notified about send-completion so that it can free/reuse the buffers passed to rds_sendmsg(). Notification of send-completion will identify each message-buffer by a cookie that the application must specify as ancillary data to rds_sendmsg(). The ancillary data in this case has cmsg_level == SOL_RDS and cmsg_type == RDS_CMSG_ZCOPY_COOKIE. Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/rds.h | 1 + net/rds/message.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++- net/rds/rds.h | 3 ++- net/rds/send.c | 44 +++++++++++++++++++++++++++++++++++------ 4 files changed, 91 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index e71d4491f225..12e3bca32cad 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -103,6 +103,7 @@ #define RDS_CMSG_MASKED_ATOMIC_FADD 8 #define RDS_CMSG_MASKED_ATOMIC_CSWP 9 #define RDS_CMSG_RXPATH_LATENCY 11 +#define RDS_CMSG_ZCOPY_COOKIE 12 #define RDS_INFO_FIRST 10000 #define RDS_INFO_COUNTERS 10000 diff --git a/net/rds/message.c b/net/rds/message.c index bf1a656b198a..651834513481 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -341,12 +341,14 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in return rm; } -int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from) +int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, + bool zcopy) { unsigned long to_copy, nbytes; unsigned long sg_off; struct scatterlist *sg; int ret = 0; + int length = iov_iter_count(from); rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); @@ -356,6 +358,53 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from) sg = rm->data.op_sg; sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ + if (zcopy) { + int total_copied = 0; + struct sk_buff *skb; + + skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32), + GFP_KERNEL); + if (!skb) + return -ENOMEM; + rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb); + if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp, + length)) { + ret = -ENOMEM; + goto err; + } + while (iov_iter_count(from)) { + struct page *pages; + size_t start; + ssize_t copied; + + copied = iov_iter_get_pages(from, &pages, PAGE_SIZE, + 1, &start); + if (copied < 0) { + struct mmpin *mmp; + int i; + + for (i = 0; i < rm->data.op_nents; i++) + put_page(sg_page(&rm->data.op_sg[i])); + mmp = &rm->data.op_mmp_znotifier->z_mmp; + mm_unaccount_pinned_pages(mmp); + ret = -EFAULT; + goto err; + } + total_copied += copied; + iov_iter_advance(from, copied); + length -= copied; + sg_set_page(sg, pages, copied, start); + rm->data.op_nents++; + sg++; + } + WARN_ON_ONCE(length != 0); + return ret; +err: + consume_skb(skb); + rm->data.op_mmp_znotifier = NULL; + return ret; + } /* zcopy */ + while (iov_iter_count(from)) { if (!sg_page(sg)) { ret = rds_page_remainder_alloc(sg, iov_iter_count(from), diff --git a/net/rds/rds.h b/net/rds/rds.h index 24576bc4a5e9..31cd38852050 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -785,7 +785,8 @@ rds_conn_connecting(struct rds_connection *conn) /* message.c */ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); -int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from); +int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, + bool zcopy); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq); diff --git a/net/rds/send.c b/net/rds/send.c index e8f3ff471b15..028ab598ac1b 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -875,12 +875,13 @@ out: * rds_message is getting to be quite complicated, and we'd like to allocate * it all in one go. This figures out how big it needs to be up front. */ -static int rds_rm_size(struct msghdr *msg, int data_len) +static int rds_rm_size(struct msghdr *msg, int num_sgs) { struct cmsghdr *cmsg; int size = 0; int cmsg_groups = 0; int retval; + bool zcopy_cookie = false; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) @@ -899,6 +900,8 @@ static int rds_rm_size(struct msghdr *msg, int data_len) break; + case RDS_CMSG_ZCOPY_COOKIE: + zcopy_cookie = true; case RDS_CMSG_RDMA_DEST: case RDS_CMSG_RDMA_MAP: cmsg_groups |= 2; @@ -919,7 +922,10 @@ static int rds_rm_size(struct msghdr *msg, int data_len) } - size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); + if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie) + return -EINVAL; + + size += num_sgs * sizeof(struct scatterlist); /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ if (cmsg_groups == 3) @@ -928,6 +934,18 @@ static int rds_rm_size(struct msghdr *msg, int data_len) return size; } +static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + u32 *cookie; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie))) + return -EINVAL; + cookie = CMSG_DATA(cmsg); + rm->data.op_mmp_znotifier->z_cookie = *cookie; + return 0; +} + static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, struct msghdr *msg, int *allocated_mr) { @@ -970,6 +988,10 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, ret = rds_cmsg_atomic(rs, rm, cmsg); break; + case RDS_CMSG_ZCOPY_COOKIE: + ret = rds_cmsg_zcopy(rs, rm, cmsg); + break; + default: return -EINVAL; } @@ -1040,10 +1062,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) long timeo = sock_sndtimeo(sk, nonblock); struct rds_conn_path *cpath; size_t total_payload_len = payload_len, rdma_payload_len = 0; + bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && + sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); + int num_sgs = ceil(payload_len, PAGE_SIZE); /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ - if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) { + if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) { ret = -EOPNOTSUPP; goto out; } @@ -1087,8 +1112,15 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } + if (zcopy) { + if (rs->rs_transport->t_type != RDS_TRANS_TCP) { + ret = -EOPNOTSUPP; + goto out; + } + num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX); + } /* size of rm including all sgs */ - ret = rds_rm_size(msg, payload_len); + ret = rds_rm_size(msg, num_sgs); if (ret < 0) goto out; @@ -1100,12 +1132,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* Attach data to the rm */ if (payload_len) { - rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); if (!rm->data.op_sg) { ret = -ENOMEM; goto out; } - ret = rds_message_copy_from_user(rm, &msg->msg_iter); + ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy); if (ret) goto out; } -- cgit v1.2.3-70-g09d2 From 1af85155813622767d223af6d4dff283ebeea7a7 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 15 Feb 2018 10:54:53 -0500 Subject: net: sched: act: fix code style This patch is used by subsequent patches. It fixes code style issues caught by checkpatch. Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/act_api.h | 5 +++-- net/sched/act_api.c | 12 ++++++------ net/sched/act_mirred.c | 6 +++--- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 6ed9692f20bd..32ef544f4ddc 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -87,12 +87,13 @@ struct tc_action_ops { struct tcf_result *); int (*dump)(struct sk_buff *, struct tc_action *, int, int); void (*cleanup)(struct tc_action *); - int (*lookup)(struct net *, struct tc_action **, u32); + int (*lookup)(struct net *net, struct tc_action **a, u32 index); int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int ovr, int bind); int (*walk)(struct net *, struct sk_buff *, - struct netlink_callback *, int, const struct tc_action_ops *); + struct netlink_callback *, int, + const struct tc_action_ops *); void (*stats_update)(struct tc_action *, u64, u32, u64); struct net_device *(*get_dev)(const struct tc_action *a); }; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 624995564e5a..a32e6c2edbf6 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -621,7 +621,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, goto err_out; err = -EINVAL; kind = tb[TCA_ACT_KIND]; - if (kind == NULL) + if (!kind) goto err_out; if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) goto err_out; @@ -822,7 +822,7 @@ static int tca_get_fill(struct sk_buff *skb, struct list_head *actions, t->tca__pad2 = 0; nest = nla_nest_start(skb, TCA_ACT_TAB); - if (nest == NULL) + if (!nest) goto out_nlmsg_trim; if (tcf_action_dump(skb, actions, bind, ref) < 0) @@ -934,7 +934,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, t->tca__pad2 = 0; nest = nla_nest_start(skb, TCA_ACT_TAB); - if (nest == NULL) + if (!nest) goto out_module_put; err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); @@ -1007,10 +1007,10 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, return ret; if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { - if (tb[1] != NULL) + if (tb[1]) return tca_action_flush(net, tb[1], n, portid); - else - return -EINVAL; + + return -EINVAL; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index e6ff88f72900..abcd5f12b913 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -80,12 +80,12 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, bool exists = false; int ret; - if (nla == NULL) + if (!nla) return -EINVAL; ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, NULL); if (ret < 0) return ret; - if (tb[TCA_MIRRED_PARMS] == NULL) + if (!tb[TCA_MIRRED_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_MIRRED_PARMS]); @@ -117,7 +117,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } if (!exists) { - if (dev == NULL) + if (!dev) return -EINVAL; ret = tcf_idr_create(tn, parm->index, est, a, &act_mirred_ops, bind, true); -- cgit v1.2.3-70-g09d2 From aea0d727899140820a631bac78f36e9d9ef15ef6 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 15 Feb 2018 10:54:54 -0500 Subject: net: sched: act: add extack to init This patch adds extack to tcf_action_init and tcf_action_init_1 functions. These are necessary to make individual extack handling in each act implementation. Based on work by David Ahern Cc: David Ahern Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/act_api.h | 5 +++-- net/sched/act_api.c | 17 +++++++++++------ net/sched/cls_api.c | 4 ++-- 3 files changed, 16 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 32ef544f4ddc..41d95930ffbc 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -163,10 +163,11 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res); int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions); + struct list_head *actions, struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind); + char *name, int ovr, int bind, + struct netlink_ext_ack *extack); int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index a32e6c2edbf6..662574646256 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -605,7 +605,8 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind) + char *name, int ovr, int bind, + struct netlink_ext_ack *extack) { struct tc_action *a; struct tc_action_ops *a_o; @@ -726,7 +727,7 @@ static void cleanup_a(struct list_head *actions, int ovr) int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions) + struct list_head *actions, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; @@ -738,7 +739,8 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, return err; for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind); + act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, + extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@ -1062,12 +1064,14 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, } static int tcf_action_add(struct net *net, struct nlattr *nla, - struct nlmsghdr *n, u32 portid, int ovr) + struct nlmsghdr *n, u32 portid, int ovr, + struct netlink_ext_ack *extack) { int ret = 0; LIST_HEAD(actions); - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions); + ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions, + extack); if (ret) return ret; @@ -1115,7 +1119,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, if (n->nlmsg_flags & NLM_F_REPLACE) ovr = 1; replay: - ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr); + ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr, + extack); if (ret == -EAGAIN) goto replay; break; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2bc1bc23d42e..f21610c5da1a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1434,7 +1434,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, "police", ovr, - TCA_ACT_BIND); + TCA_ACT_BIND, extack); if (IS_ERR(act)) return PTR_ERR(act); @@ -1447,7 +1447,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, NULL, ovr, TCA_ACT_BIND, - &actions); + &actions, extack); if (err) return err; list_for_each_entry(act, &actions, list) -- cgit v1.2.3-70-g09d2 From 84ae017a00776486390e2c0cceb4f717c3f809c1 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 15 Feb 2018 10:54:55 -0500 Subject: net: sched: act: handle generic action errors This patch adds extack support for generic act handling. The extack will be set deeper to each called function which is not part of netdev core api. Based on work by David Ahern Cc: David Ahern Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- net/sched/act_api.c | 93 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 61 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 662574646256..8e77ddd9f0ad 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -617,31 +617,40 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, int err; if (name == NULL) { - err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, NULL); + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack); if (err < 0) goto err_out; err = -EINVAL; kind = tb[TCA_ACT_KIND]; - if (!kind) + if (!kind) { + NL_SET_ERR_MSG(extack, "TC action kind must be specified"); goto err_out; - if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) + } + if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) { + NL_SET_ERR_MSG(extack, "TC action name too long"); goto err_out; + } if (tb[TCA_ACT_COOKIE]) { int cklen = nla_len(tb[TCA_ACT_COOKIE]); - if (cklen > TC_COOKIE_MAX_SIZE) + if (cklen > TC_COOKIE_MAX_SIZE) { + NL_SET_ERR_MSG(extack, "TC cookie size above the maximum"); goto err_out; + } cookie = nla_memdup_cookie(tb); if (!cookie) { + NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); err = -ENOMEM; goto err_out; } } } else { - err = -EINVAL; - if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) + if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) { + NL_SET_ERR_MSG(extack, "TC action name too long"); + err = -EINVAL; goto err_out; + } } a_o = tc_lookup_action_n(act_name); @@ -664,6 +673,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, goto err_mod; } #endif + NL_SET_ERR_MSG(extack, "Failed to load TC action module"); err = -ENOENT; goto err_out; } @@ -698,6 +708,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, list_add_tail(&a->list, &actions); tcf_action_destroy(&actions, bind); + NL_SET_ERR_MSG(extack, "Failed to init TC action chain"); return ERR_PTR(err); } } @@ -734,7 +745,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, int err; int i; - err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, NULL); + err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (err < 0) return err; @@ -842,7 +853,8 @@ out_nlmsg_trim: static int tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, - struct list_head *actions, int event) + struct list_head *actions, int event, + struct netlink_ext_ack *extack) { struct sk_buff *skb; @@ -851,6 +863,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { + NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; } @@ -859,7 +872,8 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, } static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, - struct nlmsghdr *n, u32 portid) + struct nlmsghdr *n, u32 portid, + struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX + 1]; const struct tc_action_ops *ops; @@ -867,20 +881,24 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, int index; int err; - err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, NULL); + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack); if (err < 0) goto err_out; err = -EINVAL; if (tb[TCA_ACT_INDEX] == NULL || - nla_len(tb[TCA_ACT_INDEX]) < sizeof(index)) + nla_len(tb[TCA_ACT_INDEX]) < sizeof(index)) { + NL_SET_ERR_MSG(extack, "Invalid TC action index value"); goto err_out; + } index = nla_get_u32(tb[TCA_ACT_INDEX]); err = -EINVAL; ops = tc_lookup_action(tb[TCA_ACT_KIND]); - if (!ops) /* could happen in batch of actions */ + if (!ops) { /* could happen in batch of actions */ + NL_SET_ERR_MSG(extack, "Specified TC action not found"); goto err_out; + } err = -ENOENT; if (ops->lookup(net, &a, index) == 0) goto err_mod; @@ -895,7 +913,8 @@ err_out: } static int tca_action_flush(struct net *net, struct nlattr *nla, - struct nlmsghdr *n, u32 portid) + struct nlmsghdr *n, u32 portid, + struct netlink_ext_ack *extack) { struct sk_buff *skb; unsigned char *b; @@ -909,35 +928,39 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, int err = -ENOMEM; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) { - pr_debug("tca_action_flush: failed skb alloc\n"); + if (!skb) return err; - } b = skb_tail_pointer(skb); - err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, NULL); + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack); if (err < 0) goto err_out; err = -EINVAL; kind = tb[TCA_ACT_KIND]; ops = tc_lookup_action(kind); - if (!ops) /*some idjot trying to flush unknown action */ + if (!ops) { /*some idjot trying to flush unknown action */ + NL_SET_ERR_MSG(extack, "Cannot flush unknown TC action"); goto err_out; + } nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); - if (!nlh) + if (!nlh) { + NL_SET_ERR_MSG(extack, "Failed to create TC action flush notification"); goto out_module_put; + } t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; nest = nla_nest_start(skb, TCA_ACT_TAB); - if (!nest) + if (!nest) { + NL_SET_ERR_MSG(extack, "Failed to add new netlink message"); goto out_module_put; + } err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); if (err <= 0) { @@ -954,6 +977,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, n->nlmsg_flags & NLM_F_ECHO); if (err > 0) return 0; + if (err < 0) + NL_SET_ERR_MSG(extack, "Failed to send TC action flush notification"); return err; @@ -966,7 +991,7 @@ err_out: static int tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, - u32 portid) + u32 portid, struct netlink_ext_ack *extack) { int ret; struct sk_buff *skb; @@ -977,6 +1002,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, 0, 1) <= 0) { + NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); return -EINVAL; } @@ -984,6 +1010,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, /* now do the delete */ ret = tcf_action_destroy(actions, 0); if (ret < 0) { + NL_SET_ERR_MSG(extack, "Failed to delete TC action"); kfree_skb(skb); return ret; } @@ -997,26 +1024,27 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, static int tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, - u32 portid, int event) + u32 portid, int event, struct netlink_ext_ack *extack) { int i, ret; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; LIST_HEAD(actions); - ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, NULL); + ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (ret < 0) return ret; if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { if (tb[1]) - return tca_action_flush(net, tb[1], n, portid); + return tca_action_flush(net, tb[1], n, portid, extack); + NL_SET_ERR_MSG(extack, "Invalid netlink attributes while flushing TC action"); return -EINVAL; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_get_1(net, tb[i], n, portid); + act = tcf_action_get_1(net, tb[i], n, portid, extack); if (IS_ERR(act)) { ret = PTR_ERR(act); goto err; @@ -1026,9 +1054,9 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, } if (event == RTM_GETACTION) - ret = tcf_get_notify(net, portid, n, &actions, event); + ret = tcf_get_notify(net, portid, n, &actions, event, extack); else { /* delete */ - ret = tcf_del_notify(net, n, &actions, portid); + ret = tcf_del_notify(net, n, &actions, portid, extack); if (ret) goto err; return ret; @@ -1041,7 +1069,7 @@ err: static int tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, - u32 portid) + u32 portid, struct netlink_ext_ack *extack) { struct sk_buff *skb; int err = 0; @@ -1052,6 +1080,7 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_NEWACTION, 0, 0) <= 0) { + NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while deleting TC action"); kfree_skb(skb); return -EINVAL; } @@ -1075,7 +1104,7 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, if (ret) return ret; - return tcf_add_notify(net, n, &actions, portid); + return tcf_add_notify(net, n, &actions, portid, extack); } static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; @@ -1103,7 +1132,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, return ret; if (tca[TCA_ACT_TAB] == NULL) { - pr_notice("tc_ctl_action: received NO action attribs\n"); + NL_SET_ERR_MSG(extack, "Netlink action attributes missing"); return -EINVAL; } @@ -1126,11 +1155,11 @@ replay: break; case RTM_DELACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, - portid, RTM_DELACTION); + portid, RTM_DELACTION, extack); break; case RTM_GETACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, - portid, RTM_GETACTION); + portid, RTM_GETACTION, extack); break; default: BUG(); -- cgit v1.2.3-70-g09d2 From 589dad6d71a72dd7912e5070c63f6bf1f561b5cf Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 15 Feb 2018 10:54:56 -0500 Subject: net: sched: act: add extack to init callback This patch adds extack support for act init callback api. This prepares to handle extack support inside each specific act implementation. Based on work by David Ahern Cc: David Ahern Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- net/sched/act_api.c | 5 +++-- net/sched/act_bpf.c | 2 +- net/sched/act_connmark.c | 3 ++- net/sched/act_csum.c | 2 +- net/sched/act_gact.c | 2 +- net/sched/act_ife.c | 2 +- net/sched/act_ipt.c | 4 ++-- net/sched/act_mirred.c | 2 +- net/sched/act_nat.c | 3 ++- net/sched/act_pedit.c | 2 +- net/sched/act_police.c | 3 ++- net/sched/act_sample.c | 2 +- net/sched/act_simple.c | 2 +- net/sched/act_skbedit.c | 2 +- net/sched/act_skbmod.c | 2 +- net/sched/act_tunnel_key.c | 2 +- net/sched/act_vlan.c | 2 +- 18 files changed, 24 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 41d95930ffbc..3717e0f2bb1b 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -90,7 +90,7 @@ struct tc_action_ops { int (*lookup)(struct net *net, struct tc_action **a, u32 index); int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int ovr, - int bind); + int bind, struct netlink_ext_ack *extack); int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, const struct tc_action_ops *); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 8e77ddd9f0ad..576a0c311e5e 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -680,9 +680,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, /* backward compatibility for policer */ if (name == NULL) - err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, + extack); else - err = a_o->init(net, nla, est, &a, ovr, bind); + err = a_o->init(net, nla, est, &a, ovr, bind, extack); if (err < 0) goto err_mod; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index b3f2c15affa7..b3ebfa9598e2 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -272,7 +272,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, - int replace, int bind) + int replace, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 2b15ba84e0c8..20e0215360b5 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -96,7 +96,8 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); struct nlattr *tb[TCA_CONNMARK_MAX + 1]; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index b7ba9b06b147..3b8c48bb2683 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -46,7 +46,7 @@ static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind) + int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); struct tcf_csum_params *params_old, *params_new; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index b56986d41c87..912f3398f1c1 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -56,7 +56,7 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 5954e992685a..e5127d400737 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -447,7 +447,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 06e380ae0928..6894cfa83863 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -193,7 +193,7 @@ err1: static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind) + int bind, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, bind); @@ -201,7 +201,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind) + int bind, struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, bind); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index abcd5f12b913..7ccd4c71179f 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -69,7 +69,7 @@ static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind) + int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 98c6a4b2f523..7e5ebd7f52a6 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -37,7 +37,8 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { }; static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, - struct tc_action **a, int ovr, int bind) + struct tc_action **a, int ovr, int bind, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 349beaffb29e..bb2c35ed6f10 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -132,7 +132,7 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb, static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 95d3c9097b25..6b6facbe251b 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -74,7 +74,8 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_act_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, + struct netlink_ext_ack *extack) { int ret = 0, err; struct nlattr *tb[TCA_POLICE_MAX + 1]; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 1ba0df238756..f4579ceba1f6 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -37,7 +37,7 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind) + int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); struct nlattr *tb[TCA_SAMPLE_MAX + 1]; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 425eac11f6da..b0346347c5f0 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -79,7 +79,7 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 5a3f691bb545..7651c9d2182d 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -66,7 +66,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index fa975262dbac..1266449aa8ea 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -84,7 +84,7 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); struct nlattr *tb[TCA_SKBMOD_MAX + 1]; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 0e23aac09ad6..dde01eca7ed0 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -70,7 +70,7 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index e1a1b3f3983a..6c387310b1b6 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -109,7 +109,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind) + int ovr, int bind, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; -- cgit v1.2.3-70-g09d2 From 331a9295de23a9428adb7f593d0701d393a2079e Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 15 Feb 2018 10:54:57 -0500 Subject: net: sched: act: add extack for lookup callback This patch adds extack support for act lookup callback api. This prepares to handle extack support inside each specific act implementation. Cc: David Ahern Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/act_api.h | 3 ++- net/sched/act_api.c | 2 +- net/sched/act_bpf.c | 3 ++- net/sched/act_connmark.c | 3 ++- net/sched/act_csum.c | 3 ++- net/sched/act_gact.c | 3 ++- net/sched/act_ife.c | 3 ++- net/sched/act_ipt.c | 6 ++++-- net/sched/act_mirred.c | 3 ++- net/sched/act_nat.c | 3 ++- net/sched/act_pedit.c | 3 ++- net/sched/act_police.c | 3 ++- net/sched/act_sample.c | 3 ++- net/sched/act_simple.c | 3 ++- net/sched/act_skbedit.c | 3 ++- net/sched/act_skbmod.c | 3 ++- net/sched/act_tunnel_key.c | 3 ++- net/sched/act_vlan.c | 3 ++- 18 files changed, 37 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 3717e0f2bb1b..0bd65db506ba 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -87,7 +87,8 @@ struct tc_action_ops { struct tcf_result *); int (*dump)(struct sk_buff *, struct tc_action *, int, int); void (*cleanup)(struct tc_action *); - int (*lookup)(struct net *net, struct tc_action **a, u32 index); + int (*lookup)(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack); int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int ovr, int bind, struct netlink_ext_ack *extack); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 576a0c311e5e..74ed1e288e57 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -901,7 +901,7 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, goto err_out; } err = -ENOENT; - if (ops->lookup(net, &a, index) == 0) + if (ops->lookup(net, &a, index, extack) == 0) goto err_mod; module_put(ops->owner); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index b3ebfa9598e2..d9654b863347 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -374,7 +374,8 @@ static int tcf_bpf_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 20e0215360b5..0504b7600fb6 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -184,7 +184,8 @@ static int tcf_connmark_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 3b8c48bb2683..bdd17b9ef034 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -638,7 +638,8 @@ static int tcf_csum_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 912f3398f1c1..e1e69e38f4b0 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -208,7 +208,8 @@ static int tcf_gact_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index e5127d400737..0b70fb0cc609 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -831,7 +831,8 @@ static int tcf_ife_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 6894cfa83863..f29af79a2d1f 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -310,7 +310,8 @@ static int tcf_ipt_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ipt_net_id); @@ -358,7 +359,8 @@ static int tcf_xt_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, xt_net_id); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 7ccd4c71179f..9980c6affb5e 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -272,7 +272,8 @@ static int tcf_mirred_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 7e5ebd7f52a6..6f6d7667ef9a 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -285,7 +285,8 @@ static int tcf_nat_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index bb2c35ed6f10..308b2680a6d9 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -426,7 +426,8 @@ static int tcf_pedit_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 6b6facbe251b..1292f880eab0 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -305,7 +305,8 @@ nla_put_failure: return -1; } -static int tcf_police_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_police_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, police_net_id); diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index f4579ceba1f6..22379b2cfd1a 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -209,7 +209,8 @@ static int tcf_sample_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index b0346347c5f0..3ebf71470977 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -177,7 +177,8 @@ static int tcf_simp_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 7651c9d2182d..ff1970e14016 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -215,7 +215,8 @@ static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 1266449aa8ea..110d7c1f823d 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -239,7 +239,8 @@ static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index dde01eca7ed0..65a19928f1a9 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -298,7 +298,8 @@ static int tunnel_key_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index) +static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 6c387310b1b6..03dcbbc5ffd2 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -274,7 +274,8 @@ static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, return tcf_generic_walker(tn, skb, cb, type, ops); } -static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index) +static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); -- cgit v1.2.3-70-g09d2 From 417801055b8cb4c052e989289ccf24a673178bbc Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 15 Feb 2018 10:54:58 -0500 Subject: net: sched: act: add extack for walk callback This patch adds extack support for act walker callback api. This prepares to handle extack support inside each specific act implementation. Cc: David Ahern Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/act_api.h | 3 ++- net/sched/act_api.c | 4 ++-- net/sched/act_bpf.c | 3 ++- net/sched/act_connmark.c | 3 ++- net/sched/act_csum.c | 3 ++- net/sched/act_gact.c | 3 ++- net/sched/act_ife.c | 3 ++- net/sched/act_ipt.c | 6 ++++-- net/sched/act_mirred.c | 3 ++- net/sched/act_nat.c | 3 ++- net/sched/act_pedit.c | 3 ++- net/sched/act_police.c | 3 ++- net/sched/act_sample.c | 3 ++- net/sched/act_simple.c | 3 ++- net/sched/act_skbedit.c | 3 ++- net/sched/act_skbmod.c | 3 ++- net/sched/act_tunnel_key.c | 3 ++- net/sched/act_vlan.c | 3 ++- 18 files changed, 38 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 0bd65db506ba..ab3529255377 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -94,7 +94,8 @@ struct tc_action_ops { int bind, struct netlink_ext_ack *extack); int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, - const struct tc_action_ops *); + const struct tc_action_ops *, + struct netlink_ext_ack *); void (*stats_update)(struct tc_action *, u64, u32, u64); struct net_device *(*get_dev)(const struct tc_action *a); }; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 74ed1e288e57..ab107997b259 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -963,7 +963,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, goto out_module_put; } - err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); + err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops, extack); if (err <= 0) { nla_nest_cancel(skb, nest); goto out_module_put; @@ -1255,7 +1255,7 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto out_module_put; - ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o); + ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o, NULL); if (ret < 0) goto out_module_put; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index d9654b863347..7e01e2c710c4 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -367,7 +367,8 @@ static void tcf_bpf_cleanup(struct tc_action *act) static int tcf_bpf_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 0504b7600fb6..cb722da0bb15 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -177,7 +177,8 @@ nla_put_failure: static int tcf_connmark_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index bdd17b9ef034..3e8efadb750f 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -631,7 +631,8 @@ static void tcf_csum_cleanup(struct tc_action *a) static int tcf_csum_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index e1e69e38f4b0..d96ebe4bb65a 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -201,7 +201,8 @@ nla_put_failure: static int tcf_gact_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 0b70fb0cc609..b777e381e0dd 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -824,7 +824,8 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, static int tcf_ife_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index f29af79a2d1f..f33a8cc5dee6 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -303,7 +303,8 @@ nla_put_failure: static int tcf_ipt_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ipt_net_id); @@ -352,7 +353,8 @@ static struct pernet_operations ipt_net_ops = { static int tcf_xt_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, xt_net_id); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 9980c6affb5e..3dcd295ea6a7 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -265,7 +265,8 @@ nla_put_failure: static int tcf_mirred_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 6f6d7667ef9a..67243cdc0588 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -278,7 +278,8 @@ nla_put_failure: static int tcf_nat_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 308b2680a6d9..6d6481f6bffa 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -419,7 +419,8 @@ nla_put_failure: static int tcf_pedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 1292f880eab0..ff803414a736 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -58,7 +58,8 @@ static struct tc_action_ops act_police_ops; static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, police_net_id); diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 22379b2cfd1a..7a2b6a33f239 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -202,7 +202,8 @@ nla_put_failure: static int tcf_sample_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 3ebf71470977..3f5474d20702 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -170,7 +170,8 @@ nla_put_failure: static int tcf_simp_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index ff1970e14016..d99b6f1f5181 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -208,7 +208,8 @@ nla_put_failure: static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 110d7c1f823d..369ea85d0f02 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -232,7 +232,8 @@ nla_put_failure: static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 65a19928f1a9..bced6fd00d43 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -291,7 +291,8 @@ nla_put_failure: static int tunnel_key_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 03dcbbc5ffd2..7cf409443d02 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -267,7 +267,8 @@ nla_put_failure: static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); -- cgit v1.2.3-70-g09d2 From b36201455aa0749e8708ef97ed9c1c9ece29a113 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 15 Feb 2018 10:54:59 -0500 Subject: net: sched: act: handle extack in tcf_generic_walker This patch adds extack handling for a common used TC act function "tcf_generic_walker()" to add an extack message on failures. The tcf_generic_walker() function can fail if get a invalid command different than DEL and GET. The naming "action" here is wrong, the correct naming would be command. Cc: David Ahern Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- include/net/act_api.h | 3 ++- net/sched/act_api.c | 6 ++++-- net/sched/act_bpf.c | 2 +- net/sched/act_connmark.c | 2 +- net/sched/act_csum.c | 2 +- net/sched/act_gact.c | 2 +- net/sched/act_ife.c | 2 +- net/sched/act_ipt.c | 4 ++-- net/sched/act_mirred.c | 2 +- net/sched/act_nat.c | 2 +- net/sched/act_pedit.c | 2 +- net/sched/act_police.c | 2 +- net/sched/act_sample.c | 2 +- net/sched/act_simple.c | 2 +- net/sched/act_skbedit.c | 2 +- net/sched/act_skbmod.c | 2 +- net/sched/act_tunnel_key.c | 2 +- net/sched/act_vlan.c | 2 +- 18 files changed, 23 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index ab3529255377..9c2f22695025 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -140,7 +140,8 @@ static inline void tc_action_net_exit(struct list_head *net_list, int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops); + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack); int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index); bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, int bind); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index ab107997b259..1f65d6ada9ff 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -202,7 +202,8 @@ nla_put_failure: int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, - const struct tc_action_ops *ops) + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) { struct tcf_idrinfo *idrinfo = tn->idrinfo; @@ -211,7 +212,8 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, } else if (type == RTM_GETACTION) { return tcf_dump_walker(idrinfo, skb, cb); } else { - WARN(1, "tcf_generic_walker: unknown action %d\n", type); + WARN(1, "tcf_generic_walker: unknown command %d\n", type); + NL_SET_ERR_MSG(extack, "tcf_generic_walker: unknown command"); return -EINVAL; } } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 7e01e2c710c4..cb3c5d403c88 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -372,7 +372,7 @@ static int tcf_bpf_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, bpf_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index, diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index cb722da0bb15..e4b880fa51fe 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -182,7 +182,7 @@ static int tcf_connmark_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, connmark_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index, diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 3e8efadb750f..d5c2e528d150 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -636,7 +636,7 @@ static int tcf_csum_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, csum_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index, diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index d96ebe4bb65a..f072bcf33760 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -206,7 +206,7 @@ static int tcf_gact_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, gact_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index, diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index b777e381e0dd..a5994cf0512b 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -829,7 +829,7 @@ static int tcf_ife_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, ife_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index, diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index f33a8cc5dee6..9784629090ad 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -308,7 +308,7 @@ static int tcf_ipt_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, ipt_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index, @@ -358,7 +358,7 @@ static int tcf_xt_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, xt_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 3dcd295ea6a7..05c2ebe92eca 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -270,7 +270,7 @@ static int tcf_mirred_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, mirred_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index, diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 67243cdc0588..4b5848b6c252 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -283,7 +283,7 @@ static int tcf_nat_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, nat_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index, diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 6d6481f6bffa..094303c27c5e 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -424,7 +424,7 @@ static int tcf_pedit_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, pedit_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index, diff --git a/net/sched/act_police.c b/net/sched/act_police.c index ff803414a736..ff55bd6c7db0 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -63,7 +63,7 @@ static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, police_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 7a2b6a33f239..9765145aaf40 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -207,7 +207,7 @@ static int tcf_sample_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, sample_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index, diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 3f5474d20702..8244e221fe4f 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -175,7 +175,7 @@ static int tcf_simp_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, simp_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index, diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index d99b6f1f5181..ddf69fc01bdf 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -213,7 +213,7 @@ static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, skbedit_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index, diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 369ea85d0f02..a406f191cb84 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -237,7 +237,7 @@ static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, skbmod_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index, diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index bced6fd00d43..41ff9d0e5c62 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -296,7 +296,7 @@ static int tunnel_key_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index, diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 7cf409443d02..71411a255f04 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -272,7 +272,7 @@ static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, { struct tc_action_net *tn = net_generic(net, vlan_net_id); - return tcf_generic_walker(tn, skb, cb, type, ops); + return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index, -- cgit v1.2.3-70-g09d2 From 1d4760c75d4f8c7c73f040f0261a09cf1481fdec Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 15 Feb 2018 10:55:00 -0500 Subject: net: sched: act: mirred: add extack support This patch adds extack support for TC mirred action. Cc: David Ahern Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 05c2ebe92eca..fd34015331ab 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -80,13 +80,17 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, bool exists = false; int ret; - if (!nla) + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed"); return -EINVAL; - ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, NULL); + } + ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, extack); if (ret < 0) return ret; - if (!tb[TCA_MIRRED_PARMS]) + if (!tb[TCA_MIRRED_PARMS]) { + NL_SET_ERR_MSG_MOD(extack, "Missing required mirred parameters"); return -EINVAL; + } parm = nla_data(tb[TCA_MIRRED_PARMS]); exists = tcf_idr_check(tn, parm->index, a, bind); @@ -102,6 +106,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } if (parm->ifindex) { @@ -117,8 +122,10 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } if (!exists) { - if (!dev) + if (!dev) { + NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; + } ret = tcf_idr_create(tn, parm->index, est, a, &act_mirred_ops, bind, true); if (ret) -- cgit v1.2.3-70-g09d2 From 1cbec07649ec8c267cdfb486ab4ee73949974ca4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 16 Feb 2018 11:03:03 -0800 Subject: net: Only honor ifindex in IP_PKTINFO if non-0 Only allow ifindex from IP_PKTINFO to override SO_BINDTODEVICE settings if the index is actually set in the message. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 008be04ac1cc..9dca0fb8c482 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -258,7 +258,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); if (!ipv6_addr_v4mapped(&src_info->ipi6_addr)) return -EINVAL; - ipc->oif = src_info->ipi6_ifindex; + if (src_info->ipi6_ifindex) + ipc->oif = src_info->ipi6_ifindex; ipc->addr = src_info->ipi6_addr.s6_addr32[3]; continue; } @@ -288,7 +289,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) return -EINVAL; info = (struct in_pktinfo *)CMSG_DATA(cmsg); - ipc->oif = info->ipi_ifindex; + if (info->ipi_ifindex) + ipc->oif = info->ipi_ifindex; ipc->addr = info->ipi_spec_dst.s_addr; break; } -- cgit v1.2.3-70-g09d2 From 7a9b3ec1e19f691f6e69429d851a4084b86e6219 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 31 Jan 2018 23:07:06 +0100 Subject: nl80211: remove unnecessary genlmsg_cancel() calls If we free the message immediately, there's no reason to trim it back to the previous size. Done with spatch: @@ identifier msg, hdr; @@ -if (hdr) - genlmsg_cancel(msg, hdr); ... when != msg; nlmsg_free(msg); @@ identifier msg, hdr; @@ -genlmsg_cancel(msg, hdr); ... when != msg; nlmsg_free(msg); Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index cc6ec5bab676..412ed8676306 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5847,7 +5847,6 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, return genlmsg_reply(msg, info); nla_put_failure: - genlmsg_cancel(msg, hdr); out: nlmsg_free(msg); return -ENOBUFS; @@ -6328,7 +6327,6 @@ static int nl80211_get_reg_do(struct sk_buff *skb, struct genl_info *info) nla_put_failure_rcu: rcu_read_unlock(); nla_put_failure: - genlmsg_cancel(msg, hdr); put_failure: nlmsg_free(msg); return -EMSGSIZE; @@ -13732,7 +13730,6 @@ void nl80211_common_reg_change_event(enum nl80211_commands cmd_id, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -13780,7 +13777,6 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -13868,7 +13864,6 @@ static void nl80211_send_mlme_timeout(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -13944,7 +13939,6 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -13984,7 +13978,6 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14014,7 +14007,6 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14051,7 +14043,6 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14084,7 +14075,6 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14125,7 +14115,6 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } EXPORT_SYMBOL(cfg80211_notify_new_peer_candidate); @@ -14164,7 +14153,6 @@ void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14219,7 +14207,6 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14265,7 +14252,6 @@ static void nl80211_send_remain_on_chan_event( return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14379,7 +14365,6 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } EXPORT_SYMBOL(cfg80211_conn_failed); @@ -14416,7 +14401,6 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd, return true; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); return true; } @@ -14500,7 +14484,6 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, return genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid); nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); return -ENOBUFS; } @@ -14544,7 +14527,6 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } EXPORT_SYMBOL(cfg80211_mgmt_tx_status); @@ -14753,7 +14735,6 @@ static void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14811,7 +14792,6 @@ nl80211_pmksa_candidate_notify(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14864,7 +14844,6 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -14946,7 +14925,6 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } @@ -15041,7 +15019,6 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } EXPORT_SYMBOL(cfg80211_probe_status); @@ -15086,8 +15063,6 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, nla_put_failure: spin_unlock_bh(&rdev->beacon_registrations_lock); - if (hdr) - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } EXPORT_SYMBOL(cfg80211_report_obss_beacon); @@ -15303,7 +15278,6 @@ void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer, return; nla_put_failure: - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } EXPORT_SYMBOL(cfg80211_tdls_oper_request); @@ -15448,8 +15422,6 @@ void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp) return; nla_put_failure: - if (hdr) - genlmsg_cancel(msg, hdr); nlmsg_free(msg); } EXPORT_SYMBOL(cfg80211_crit_proto_stopped); -- cgit v1.2.3-70-g09d2 From db8d93a7a355121d49777c059afbca23c53c8628 Mon Sep 17 00:00:00 2001 From: Srinivas Dasari Date: Fri, 2 Feb 2018 11:15:27 +0200 Subject: nl80211: Fix external_auth check for offloaded authentication Unfortunately removal of the ext_feature flag in the last revision of the patch ended up negating the comparison and prevented the command from being processed (either nl80211_external_auth() or rdev_external_auth() returns -EOPNOTSUPP). Fix this by adding back the lost '!'. Fixes: 40cbfa90218b ("cfg80211/nl80211: Optional authentication offload to userspace") Signed-off-by: Srinivas Dasari Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 412ed8676306..c6f256b29c73 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12484,7 +12484,7 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; struct cfg80211_external_auth_params params; - if (rdev->ops->external_auth) + if (!rdev->ops->external_auth) return -EOPNOTSUPP; if (!info->attrs[NL80211_ATTR_SSID]) -- cgit v1.2.3-70-g09d2 From 11b05ba34d89808ca99180d2656438a319670c56 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 2 Feb 2018 16:31:42 +0000 Subject: mac80211: remove redundant initialization to pointer 'hdr' The pointer hrd is being initialized with a value that is never read and re-assigned a little later, hence the initialization is redundant and can be removed. Cleans up clang warning: net/mac80211/tx.c:1924:24: warning: Value stored to 'hdr' during its initialization is never read Signed-off-by: Colin Ian King Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 25904af38839..5decd0fb247c 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1921,7 +1921,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_hdr *hdr; int headroom; bool may_encrypt; -- cgit v1.2.3-70-g09d2 From c4b50cd31d25c3d17886ffc47ca4a9a12c6dc9bf Mon Sep 17 00:00:00 2001 From: Venkateswara Naralasetty Date: Tue, 13 Feb 2018 11:03:06 +0530 Subject: cfg80211: send ack_signal to user in probe client response This patch provides support to get ack signal in probe client response and in station info from user. Signed-off-by: Venkateswara Naralasetty [squash in compilation fixes] Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/wil6210/cfg80211.c | 3 ++- include/net/cfg80211.h | 7 ++++++- include/uapi/linux/nl80211.h | 3 +++ net/mac80211/status.c | 2 +- net/wireless/nl80211.c | 8 ++++++-- 5 files changed, 18 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index 768f63f38341..b799a5384abb 100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -1599,7 +1599,8 @@ static void wil_probe_client_handle(struct wil6210_priv *wil, */ bool alive = (sta->status == wil_sta_connected); - cfg80211_probe_status(ndev, sta->addr, req->cookie, alive, GFP_KERNEL); + cfg80211_probe_status(ndev, sta->addr, req->cookie, alive, + 0, false, GFP_KERNEL); } static struct list_head *next_probe_client(struct wil6210_priv *wil) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7d49cd0cf92d..56e905cd4b07 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1147,6 +1147,7 @@ struct cfg80211_tid_stats { * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last * (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs. + * @ack_signal: signal strength (in dBm) of the last ACK frame. */ struct station_info { u64 filled; @@ -1191,6 +1192,7 @@ struct station_info { u64 rx_duration; u8 rx_beacon_signal_avg; struct cfg80211_tid_stats pertid[IEEE80211_NUM_TIDS + 1]; + s8 ack_signal; }; #if IS_ENABLED(CONFIG_CFG80211) @@ -5838,10 +5840,13 @@ bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, * @addr: the address of the peer * @cookie: the cookie filled in @probe_client previously * @acked: indicates whether probe was acked or not + * @ack_signal: signal strength (in dBm) of the ACK frame. + * @is_valid_ack_signal: indicates the ack_signal is valid or not. * @gfp: allocation flags */ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, - u64 cookie, bool acked, gfp_t gfp); + u64 cookie, bool acked, s32 ack_signal, + bool is_valid_ack_signal, gfp_t gfp); /** * cfg80211_report_obss_beacon - report beacon from other APs diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ca3d5a613fc0..c13c84304be3 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2626,6 +2626,7 @@ enum nl80211_attrs { NL80211_ATTR_EXTERNAL_AUTH_SUPPORT, NL80211_ATTR_NSS, + NL80211_ATTR_ACK_SIGNAL, /* add attributes here, update the policy in nl80211.c */ @@ -2947,6 +2948,7 @@ enum nl80211_sta_bss_param { * @NL80211_STA_INFO_RX_DURATION: aggregate PPDU duration for all frames * received from the station (u64, usec) * @NL80211_STA_INFO_PAD: attribute used for padding for 64-bit alignment + * @NL80211_STA_INFO_ACK_SIGNAL: signal strength of the last ACK frame(u8, dBm) * @__NL80211_STA_INFO_AFTER_LAST: internal * @NL80211_STA_INFO_MAX: highest possible station info attribute */ @@ -2985,6 +2987,7 @@ enum nl80211_sta_info { NL80211_STA_INFO_TID_STATS, NL80211_STA_INFO_RX_DURATION, NL80211_STA_INFO_PAD, + NL80211_STA_INFO_ACK_SIGNAL, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, diff --git a/net/mac80211/status.c b/net/mac80211/status.c index da7427a41529..d74d44e65bd7 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -486,7 +486,7 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, if (ieee80211_is_nullfunc(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control)) cfg80211_probe_status(sdata->dev, hdr->addr1, - cookie, acked, + cookie, acked, 0, false, GFP_ATOMIC); else cfg80211_mgmt_tx_status(&sdata->wdev, cookie, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c6f256b29c73..050ff61b06a3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4486,6 +4486,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc); PUT_SINFO_U64(BEACON_RX, rx_beacon); PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8); + PUT_SINFO(ACK_SIGNAL, ack_signal, u8); #undef PUT_SINFO #undef PUT_SINFO_U64 @@ -14984,7 +14985,8 @@ nla_put_failure: EXPORT_SYMBOL(cfg80211_sta_opmode_change_notify); void cfg80211_probe_status(struct net_device *dev, const u8 *addr, - u64 cookie, bool acked, gfp_t gfp) + u64 cookie, bool acked, s32 ack_signal, + bool is_valid_ack_signal, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -15009,7 +15011,9 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie, NL80211_ATTR_PAD) || - (acked && nla_put_flag(msg, NL80211_ATTR_ACK))) + (acked && nla_put_flag(msg, NL80211_ATTR_ACK)) || + (is_valid_ack_signal && nla_put_s32(msg, NL80211_ATTR_ACK_SIGNAL, + ack_signal))) goto nla_put_failure; genlmsg_end(msg, hdr); -- cgit v1.2.3-70-g09d2 From a78b26fffd2368fcd079802897f4c97f9baea833 Mon Sep 17 00:00:00 2001 From: Venkateswara Naralasetty Date: Tue, 13 Feb 2018 11:04:46 +0530 Subject: mac80211: Add tx ack signal support in sta info This allows users to get ack signal strength of last transmitted frame. Signed-off-by: Venkateswara Naralasetty Signed-off-by: Johannes Berg --- include/net/mac80211.h | 1 + net/mac80211/sta_info.c | 6 ++++++ net/mac80211/sta_info.h | 2 ++ net/mac80211/status.c | 13 +++++++++++-- 4 files changed, 20 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 906e90223066..854037b8163e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -934,6 +934,7 @@ struct ieee80211_tx_info { u8 ampdu_len; u8 antenna; u16 tx_time; + bool is_valid_ack_signal; void *status_driver_data[19 / sizeof(void *)]; } status; struct { diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 0c5627f8a104..0bc40c719a55 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2287,6 +2287,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT); sinfo->expected_throughput = thr; } + + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) && + sta->status_stats.ack_signal_filled) { + sinfo->ack_signal = sta->status_stats.last_ack_signal; + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); + } } u32 sta_get_expected_throughput(struct sta_info *sta) diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index cd53619435b6..f64eb86ca64b 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -548,6 +548,8 @@ struct sta_info { u64 msdu_retries[IEEE80211_NUM_TIDS + 1]; u64 msdu_failed[IEEE80211_NUM_TIDS + 1]; unsigned long last_ack; + s8 last_ack_signal; + bool ack_signal_filled; } status_stats; /* Updated from TX path only, no locking requirements */ diff --git a/net/mac80211/status.c b/net/mac80211/status.c index d74d44e65bd7..743e89c5926c 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -187,9 +187,16 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) struct ieee80211_mgmt *mgmt = (void *) skb->data; struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_tx_info *txinfo = IEEE80211_SKB_CB(skb); - if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { sta->status_stats.last_ack = jiffies; + if (txinfo->status.is_valid_ack_signal) { + sta->status_stats.last_ack_signal = + (s8)txinfo->status.ack_signal; + sta->status_stats.ack_signal_filled = true; + } + } if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *) skb->data; @@ -486,7 +493,9 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, if (ieee80211_is_nullfunc(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control)) cfg80211_probe_status(sdata->dev, hdr->addr1, - cookie, acked, 0, false, + cookie, acked, + info->status.ack_signal, + info->status.is_valid_ack_signal, GFP_ATOMIC); else cfg80211_mgmt_tx_status(&sdata->wdev, cookie, -- cgit v1.2.3-70-g09d2 From 753d525a08f2e18d50da9a909d48ecb7596683b3 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:48:04 +0300 Subject: net: Convert inet6_net_ops init method initializes sysctl defaults, allocates percpu arrays and creates /proc entries. exit method reverts the above. There are no pernet_operations, which are interested in the above entities of foreign net namespace, so inet6_net_ops are able to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c1e292db04db..dbbe04018813 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -857,6 +857,7 @@ static void __net_exit inet6_net_exit(struct net *net) static struct pernet_operations inet6_net_ops = { .init = inet6_net_init, .exit = inet6_net_exit, + .async = true, }; static const struct ipv6_stub ipv6_stub_impl = { -- cgit v1.2.3-70-g09d2 From 9c537ca1554e5d7db69a3b606801a2101e02edda Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:48:14 +0300 Subject: net: Convert cfg80211_pernet_ops This patch finishes converting pernet_operations registered in net/wireless directory. These pernet_operations have only exit method, which moves devices to init_net. This action is not pernet_operations-specific, and function cfg80211_switch_netns() may be called all time during the system life. All necessary protection against concurrent cfg80211_pernet_exit() is made by rtnl_lock(). So, cfg80211_pernet_ops is able to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/wireless/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index a6f3cac8c640..670aa229168a 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1340,6 +1340,7 @@ static void __net_exit cfg80211_pernet_exit(struct net *net) static struct pernet_operations cfg80211_pernet_ops = { .exit = cfg80211_pernet_exit, + .async = true, }; static int __init cfg80211_init(void) -- cgit v1.2.3-70-g09d2 From b01a59a4884ed2d95b8db4741e552a689c18989b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:48:37 +0300 Subject: net: Convert ip6mr_net_ops These pernet_operations create and destroy /proc entries, populate and depopulate net::rules_ops and multiroute table. All the structures are pernet, and they are not touched by foreign net pernet_operations. So, it's possible to mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/ip6mr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 9f6cace9c817..295eb5ecaee5 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1397,6 +1397,7 @@ static void __net_exit ip6mr_net_exit(struct net *net) static struct pernet_operations ip6mr_net_ops = { .init = ip6mr_net_init, .exit = ip6mr_net_exit, + .async = true, }; int __init ip6_mr_init(void) -- cgit v1.2.3-70-g09d2 From 1a2e93329dd413585798c122aac0607093b2d379 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:48:57 +0300 Subject: net: Convert icmpv6_sk_ops, ndisc_net_ops and igmp6_net_ops These pernet_operations create and destroy net::ipv6.icmp_sk socket, used to send ICMP or error reply. Nobody can dereference the socket to handle a packet before net is initialized, as there is no routing; nobody can do that in parallel with exit, as all of devices are moved to init_net or destroyed and there are no packets it-flight. So, it's possible to mark these pernet_operations as async. The same for ndisc_net_ops and for igmp6_net_ops. The last one also creates and destroys /proc entries. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 1 + net/ipv6/mcast.c | 1 + net/ipv6/ndisc.c | 1 + 3 files changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 6ae5dd3f4d0d..4fa4f1b150a4 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -997,6 +997,7 @@ static void __net_exit icmpv6_sk_exit(struct net *net) static struct pernet_operations icmpv6_sk_ops = { .init = icmpv6_sk_init, .exit = icmpv6_sk_exit, + .async = true, }; int __init icmpv6_init(void) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 9b9d2ff01b35..d9bb933dd5c4 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2997,6 +2997,7 @@ static void __net_exit igmp6_net_exit(struct net *net) static struct pernet_operations igmp6_net_ops = { .init = igmp6_net_init, .exit = igmp6_net_exit, + .async = true, }; int __init igmp6_init(void) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index f61a5b613b52..0a19ce3a6f7f 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1882,6 +1882,7 @@ static void __net_exit ndisc_net_exit(struct net *net) static struct pernet_operations ndisc_net_ops = { .init = ndisc_net_init, .exit = ndisc_net_exit, + .async = true, }; int __init ndisc_init(void) -- cgit v1.2.3-70-g09d2 From 509114112d0b71997dbe58d4f2976eeddf8eacc4 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:49:10 +0300 Subject: net: Convert raw6_net_ops, udplite6_net_ops, ipv6_proc_ops, if6_proc_net_ops and ip6_route_net_late_ops These pernet_operations create and destroy /proc entries and safely may be converted and safely may be mark as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 1 + net/ipv6/proc.c | 1 + net/ipv6/raw.c | 1 + net/ipv6/route.c | 2 ++ 4 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8c17f8d8d5d9..4facfe0b1888 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4257,6 +4257,7 @@ static void __net_exit if6_proc_net_exit(struct net *net) static struct pernet_operations if6_proc_net_ops = { .init = if6_proc_net_init, .exit = if6_proc_net_exit, + .async = true, }; int __init if6_proc_init(void) diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index b67814242f78..b8858c546f41 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -343,6 +343,7 @@ static void __net_exit ipv6_proc_exit_net(struct net *net) static struct pernet_operations ipv6_proc_ops = { .init = ipv6_proc_init_net, .exit = ipv6_proc_exit_net, + .async = true, }; int __init ipv6_misc_proc_init(void) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4c25339b1984..10a4ac4933b7 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1332,6 +1332,7 @@ static void __net_exit raw6_exit_net(struct net *net) static struct pernet_operations raw6_net_ops = { .init = raw6_init_net, .exit = raw6_exit_net, + .async = true, }; int __init raw6_proc_init(void) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f0baae26db8f..1be84ef23f43 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4972,6 +4972,7 @@ static void __net_exit ip6_route_net_exit_late(struct net *net) static struct pernet_operations ip6_route_net_ops = { .init = ip6_route_net_init, .exit = ip6_route_net_exit, + .async = true, }; static int __net_init ipv6_inetpeer_init(struct net *net) @@ -5002,6 +5003,7 @@ static struct pernet_operations ipv6_inetpeer_ops = { static struct pernet_operations ip6_route_net_late_ops = { .init = ip6_route_net_init_late, .exit = ip6_route_net_exit_late, + .async = true, }; static struct notifier_block ip6_route_dev_notifier = { -- cgit v1.2.3-70-g09d2 From 85ca51b2a23969dccb5abff31eb54560db8195ca Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:49:20 +0300 Subject: net: Convert ipv6_inetpeer_ops net->ipv6.peers is dereferenced in three places via inet_getpeer_v6(), and it's used to handle skb. All the users of inet_getpeer_v6() do not look like be able to be called from foreign net pernet_operations, so we may mark them as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/route.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1be84ef23f43..aa709b644945 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4998,6 +4998,7 @@ static void __net_exit ipv6_inetpeer_exit(struct net *net) static struct pernet_operations ipv6_inetpeer_ops = { .init = ipv6_inetpeer_init, .exit = ipv6_inetpeer_exit, + .async = true, }; static struct pernet_operations ip6_route_net_late_ops = { -- cgit v1.2.3-70-g09d2 From 7b7dd180b85b9c39c426cfaade73f7d0409c7f85 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:49:31 +0300 Subject: net: Convert fib6_rules_net_ops These pernet_operations register and unregister net::ipv6.fib6_rules_ops, which are used for routing. It looks like there are no pernet_operations, which send ipv6 packages to another net, so we are able to mark them as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/fib6_rules.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index b240f24a6e52..95a2c9e8699a 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -368,6 +368,7 @@ static void __net_exit fib6_rules_net_exit(struct net *net) static struct pernet_operations fib6_rules_net_ops = { .init = fib6_rules_net_init, .exit = fib6_rules_net_exit, + .async = true, }; int __init fib6_rules_init(void) -- cgit v1.2.3-70-g09d2 From fef65a2c6c3417fcf2fb0bebd8c27355d1b14a12 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:49:40 +0300 Subject: net: Convert tcpv6_net_ops These pernet_operations create and destroy net::ipv6.tcp_sk socket, which is used in tcp_v6_send_response() only. It looks like foreign pernet_operations don't want to set ipv6 connection inside destroyed net, so this socket may be created in destroyed in parallel with anything else. inet_twsk_purge() is also safe for that, as described in patch for tcp_sk_ops. So, it's possible to mark them as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 883df0ad5bfe..5425d7b100ee 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2007,6 +2007,7 @@ static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, .exit_batch = tcpv6_net_exit_batch, + .async = true, }; int __init tcpv6_init(void) -- cgit v1.2.3-70-g09d2 From 58708caef56b04da2c91d6aaba49c783f22055c2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:49:49 +0300 Subject: net: Convert ipv6_sysctl_net_ops These pernet_operations create and destroy sysctl tables. They are not touched by another net pernet_operations. So, it's possible to execute them in parallel with others. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/sysctl_net_ipv6.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index a789a8ac6a64..262f791f1b9b 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -251,6 +251,7 @@ static void __net_exit ipv6_sysctl_net_exit(struct net *net) static struct pernet_operations ipv6_sysctl_net_ops = { .init = ipv6_sysctl_net_init, .exit = ipv6_sysctl_net_exit, + .async = true, }; static struct ctl_table_header *ip6_header; -- cgit v1.2.3-70-g09d2 From ac34cb6c0c4d327d69e588d7a42bbc6428e2fdfd Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:49:59 +0300 Subject: net: Convert ping_v6_net_ops These pernet_operations only register and unregister /proc entries, so it's possible to mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/ping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index d12c55dad7d1..318c6e914234 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -240,6 +240,7 @@ static void __net_init ping_v6_proc_exit_net(struct net *net) static struct pernet_operations ping_v6_net_ops = { .init = ping_v6_proc_init_net, .exit = ping_v6_proc_exit_net, + .async = true, }; #endif -- cgit v1.2.3-70-g09d2 From a7852a76f414f69631ed3adc4b001c633829306d Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:50:09 +0300 Subject: net: Convert ip6_flowlabel_net_ops These pernet_operations create and destroy /proc entries. ip6_fl_purge() makes almost the same actions as timer ip6_fl_gc_timer does, and as it can be executed in parallel with ip6_fl_purge(), two parallel ip6_fl_purge() may be executed. So, we can mark it async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/ip6_flowlabel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 3dab664ff503..6ddf52282894 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -873,6 +873,7 @@ static void __net_exit ip6_flowlabel_net_exit(struct net *net) static struct pernet_operations ip6_flowlabel_net_ops = { .init = ip6_flowlabel_proc_init, .exit = ip6_flowlabel_net_exit, + .async = true, }; int ip6_flowlabel_init(void) -- cgit v1.2.3-70-g09d2 From b489141369f78ead6ed540cff29ac1974852cd7f Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:50:18 +0300 Subject: net: Convert xfrm6_net_ops These pernet_operations create sysctl tables and initialize net::xfrm.xfrm6_dst_ops used for routing. It doesn't look like another pernet_operations send ipv6 packets to foreign net namespaces, so it should be safe to mark the pernet_operations as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/xfrm6_policy.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 09fb44ee3b45..88cd0c90fa81 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -395,6 +395,7 @@ static void __net_exit xfrm6_net_exit(struct net *net) static struct pernet_operations xfrm6_net_ops = { .init = xfrm6_net_init, .exit = xfrm6_net_exit, + .async = true, }; int __init xfrm6_init(void) -- cgit v1.2.3-70-g09d2 From d16784d9fb2a91e96374df7cb689f52cef55371b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:50:28 +0300 Subject: net: Convert fib6_net_ops, ipv6_addr_label_ops and ip6_segments_ops These pernet_operations register and unregister tables and lists for packets forwarding. All of the entities are per-net. Init methods makes simple initializations, and since net is not visible for foreigners at the time it is working, it can't race with anything. Exit method is executed when there are only local devices, and there mustn't be packets in-flight. Also, it looks like no one pernet_operations want to send ipv6 packets to foreign net. The same reasons are for ipv6_addr_label_ops and ip6_segments_ops. So, we are able to mark all them as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/addrlabel.c | 1 + net/ipv6/ip6_fib.c | 1 + net/ipv6/seg6.c | 1 + 3 files changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 1d6ced37ad71..ba2e63633370 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -344,6 +344,7 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net) static struct pernet_operations ipv6_addr_label_ops = { .init = ip6addrlbl_net_init, .exit = ip6addrlbl_net_exit, + .async = true, }; int __init ipv6_addr_label_init(void) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 92b8d8c75eed..cab95cf3b39f 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2160,6 +2160,7 @@ static void fib6_net_exit(struct net *net) static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, + .async = true, }; int __init fib6_init(void) diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 7f5621d09571..c3f13c3bd8a9 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -395,6 +395,7 @@ static void __net_exit seg6_net_exit(struct net *net) static struct pernet_operations ip6_segments_ops = { .init = seg6_net_init, .exit = seg6_net_exit, + .async = true, }; static const struct genl_ops seg6_genl_ops[] = { -- cgit v1.2.3-70-g09d2 From 5fc094f5b8c9b8e5012b7ccf303be3b123563a58 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:50:37 +0300 Subject: net: Convert ip6_frags_ops Exit methods calls inet_frags_exit_net() with global ip6_frags as argument. So, after we make the pernet_operations async, a pair of exit methods may be called to iterate this hash table. Since there is inet_frag_worker(), which already may work in parallel with inet_frags_exit_net(), and it can make the same cleanup, that inet_frags_exit_net() does, it's safe. So we may mark these pernet_operations as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/reassembly.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index afbc000ad4f2..b5da69c83123 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -733,6 +733,7 @@ static void __net_exit ipv6_frags_exit_net(struct net *net) static struct pernet_operations ip6_frags_ops = { .init = ipv6_frags_init_net, .exit = ipv6_frags_exit_net, + .async = true, }; int __init ipv6_frag_init(void) -- cgit v1.2.3-70-g09d2 From 4d6b80762b9384db3281f792a71746b388f395be Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:50:45 +0300 Subject: net: Convert ip_tables_net_ops, udplite6_net_ops and xt_net_ops ip_tables_net_ops and udplite6_net_ops create and destroy /proc entries. xt_net_ops does nothing. So, we are able to mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_tables.c | 1 + net/ipv6/udplite.c | 1 + net/netfilter/x_tables.c | 1 + 3 files changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 9a71f3149507..39a7cf9160e6 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1911,6 +1911,7 @@ static void __net_exit ip_tables_net_exit(struct net *net) static struct pernet_operations ip_tables_net_ops = { .init = ip_tables_net_init, .exit = ip_tables_net_exit, + .async = true, }; static int __init ip_tables_init(void) diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 14ae32bb1f3d..f3839780dc31 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -123,6 +123,7 @@ static void __net_exit udplite6_proc_exit_net(struct net *net) static struct pernet_operations udplite6_net_ops = { .init = udplite6_proc_init_net, .exit = udplite6_proc_exit_net, + .async = true, }; int __init udplite6_proc_init(void) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 2f685ee1f9c8..a6a435d7c8f4 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1765,6 +1765,7 @@ static void __net_exit xt_net_exit(struct net *net) static struct pernet_operations xt_net_ops = { .init = xt_net_init, .exit = xt_net_exit, + .async = true, }; static int __init xt_init(void) -- cgit v1.2.3-70-g09d2 From da349fad8045cc44cd9b1752b0e2d943df62cabf Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 11:50:54 +0300 Subject: net: Convert iptable_filter_net_ops These pernet_operations register and unregister net::ipv4.iptable_filter table. Since there are no packets in-flight at the time of exit method is working, iptables rules should not be touched. Also, pernet_operations should not send ipv4 packets each other. So, it's safe to mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/netfilter/iptable_filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 9ac92ea7b93c..c1c136a93911 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -87,6 +87,7 @@ static void __net_exit iptable_filter_net_exit(struct net *net) static struct pernet_operations iptable_filter_net_ops = { .init = iptable_filter_net_init, .exit = iptable_filter_net_exit, + .async = true, }; static int __init iptable_filter_init(void) -- cgit v1.2.3-70-g09d2 From 96c252bf1c5c6d7e2dac3dea42f3f0a9c939d20e Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Mon, 19 Feb 2018 12:48:21 +0100 Subject: tipc: fix bug on error path in tipc_topsrv_kern_subscr() In commit cc1ea9ffadf7 ("tipc: eliminate struct tipc_subscriber") we re-introduced an old bug on the error path in the function tipc_topsrv_kern_subscr(). We now re-introduce the correction too. Reported-by: syzbot+f62e0f2a0ef578703946@syzkaller.appspotmail.com Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/topsrv.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 02013e00f287..25925be1cc08 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -580,9 +580,10 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, *conid = con->conid; con->sock = NULL; rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub); - if (rc < 0) - tipc_conn_close(con); - return !rc; + if (rc >= 0) + return true; + conn_put(con); + return false; } void tipc_topsrv_kern_unsubscr(struct net *net, int conid) -- cgit v1.2.3-70-g09d2 From 26736a08ee0fb89a4f09bfb2c9f0805028ff63aa Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 19 Feb 2018 19:02:24 +0100 Subject: tipc: don't call sock_release() in atomic context syzbot reported a scheduling while atomic issue at netns destruction time: BUG: sleeping function called from invalid context at net/core/sock.c:2769 in_atomic(): 1, irqs_disabled(): 0, pid: 85, name: kworker/u4:3 5 locks held by kworker/u4:3/85: #0: ((wq_completion)"%s""netns"){+.+.}, at: [<00000000c9792deb>] process_one_work+0xaaf/0x1af0 kernel/workqueue.c:2084 #1: (net_cleanup_work){+.+.}, at: [<00000000adc12e2a>] process_one_work+0xb01/0x1af0 kernel/workqueue.c:2088 #2: (net_sem){++++}, at: [<000000009ccb5669>] cleanup_net+0x23f/0xd20 net/core/net_namespace.c:494 #3: (net_mutex){+.+.}, at: [<00000000a92767d9>] cleanup_net+0xa7d/0xd20 net/core/net_namespace.c:496 #4: (&(&srv->idr_lock)->rlock){+...}, at: [<000000001343e568>] spin_lock_bh include/linux/spinlock.h:315 [inline] #4: (&(&srv->idr_lock)->rlock){+...}, at: [<000000001343e568>] tipc_topsrv_stop+0x231/0x610 net/tipc/topsrv.c:685 CPU: 0 PID: 85 Comm: kworker/u4:3 Not tainted 4.16.0-rc1+ #230 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: netns cleanup_net Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 ___might_sleep+0x2b2/0x470 kernel/sched/core.c:6128 __might_sleep+0x95/0x190 kernel/sched/core.c:6081 lock_sock_nested+0x37/0x110 net/core/sock.c:2769 lock_sock include/net/sock.h:1463 [inline] tipc_release+0x103/0xff0 net/tipc/socket.c:572 sock_release+0x8d/0x1e0 net/socket.c:594 tipc_topsrv_stop+0x3c0/0x610 net/tipc/topsrv.c:696 tipc_exit_net+0x15/0x40 net/tipc/core.c:96 ops_exit_list.isra.6+0xae/0x150 net/core/net_namespace.c:148 cleanup_net+0x6ba/0xd20 net/core/net_namespace.c:529 process_one_work+0xbbf/0x1af0 kernel/workqueue.c:2113 worker_thread+0x223/0x1990 kernel/workqueue.c:2247 kthread+0x33c/0x400 kernel/kthread.c:238 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:429 This is caused by tipc_topsrv_stop() releasing the listener socket with the idr lock held. This changeset addresses the issue moving the release operation outside such lock. Reported-and-tested-by: syzbot+749d9d87c294c00ca856@syzkaller.appspotmail.com Fixes: 0ef897be12b8 ("tipc: separate topology server listener socket from subcsriber sockets") Signed-off-by: Paolo Abeni Acked-by: ///jon Signed-off-by: David S. Miller --- net/tipc/topsrv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 25925be1cc08..c8e34ef22c30 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -694,9 +694,9 @@ void tipc_topsrv_stop(struct net *net) } __module_get(lsock->ops->owner); __module_get(lsock->sk->sk_prot_creator->owner); - sock_release(lsock); srv->listener = NULL; spin_unlock_bh(&srv->idr_lock); + sock_release(lsock); tipc_topsrv_work_stop(srv); idr_destroy(&srv->conn_idr); kfree(srv); -- cgit v1.2.3-70-g09d2 From 19efbd93e6fb05eab81856b4fc8d64211dd37088 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 12:58:38 +0300 Subject: net: Kill net_mutex We take net_mutex, when there are !async pernet_operations registered, and read locking of net_sem is not enough. But we may get rid of taking the mutex, and just change the logic to write lock net_sem in such cases. This obviously reduces the number of lock operations, we do. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 - include/net/net_namespace.h | 11 ++++++---- net/core/net_namespace.c | 53 +++++++++++++++++++++++++++------------------ 3 files changed, 39 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index e9ee9ad0a681..3573b4bf2fdf 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -35,7 +35,6 @@ extern int rtnl_trylock(void); extern int rtnl_is_locked(void); extern wait_queue_head_t netdev_unregistering_wq; -extern struct mutex net_mutex; extern struct rw_semaphore net_sem; #ifdef CONFIG_PROVE_LOCKING diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 9158ec1ad06f..115b01b92f4d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -60,8 +60,11 @@ struct net { struct list_head list; /* list of network namespaces */ struct list_head cleanup_list; /* namespaces on death row */ - struct list_head exit_list; /* Use only net_mutex */ - + struct list_head exit_list; /* To linked to call pernet exit + * methods on dead net (net_sem + * read locked), or to unregister + * pernet ops (net_sem wr locked). + */ struct user_namespace *user_ns; /* Owning user namespace */ struct ucounts *ucounts; spinlock_t nsid_lock; @@ -89,7 +92,7 @@ struct net { /* core fib_rules */ struct list_head rules_ops; - struct list_head fib_notifier_ops; /* protected by net_mutex */ + struct list_head fib_notifier_ops; /* protected by net_sem */ struct net_device *loopback_dev; /* The loopback */ struct netns_core core; @@ -316,7 +319,7 @@ struct pernet_operations { /* * Indicates above methods are allowed to be executed in parallel * with methods of any other pernet_operations, i.e. they are not - * need synchronization via net_mutex. + * need write locked net_sem. */ bool async; }; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index bcab9a938d6f..e89a516620dd 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -29,8 +29,6 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; -/* Used only if there are !async pernet_operations registered */ -DEFINE_MUTEX(net_mutex); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); @@ -407,6 +405,7 @@ struct net *copy_net_ns(unsigned long flags, { struct ucounts *ucounts; struct net *net; + unsigned write; int rv; if (!(flags & CLONE_NEWNET)) @@ -424,20 +423,26 @@ struct net *copy_net_ns(unsigned long flags, refcount_set(&net->passive, 1); net->ucounts = ucounts; get_user_ns(user_ns); - - rv = down_read_killable(&net_sem); +again: + write = READ_ONCE(nr_sync_pernet_ops); + if (write) + rv = down_write_killable(&net_sem); + else + rv = down_read_killable(&net_sem); if (rv < 0) goto put_userns; - if (nr_sync_pernet_ops) { - rv = mutex_lock_killable(&net_mutex); - if (rv < 0) - goto up_read; + + if (!write && unlikely(READ_ONCE(nr_sync_pernet_ops))) { + up_read(&net_sem); + goto again; } rv = setup_net(net, user_ns); - if (nr_sync_pernet_ops) - mutex_unlock(&net_mutex); -up_read: - up_read(&net_sem); + + if (write) + up_write(&net_sem); + else + up_read(&net_sem); + if (rv < 0) { put_userns: put_user_ns(user_ns); @@ -485,15 +490,23 @@ static void cleanup_net(struct work_struct *work) struct net *net, *tmp, *last; struct list_head net_kill_list; LIST_HEAD(net_exit_list); + unsigned write; /* Atomically snapshot the list of namespaces to cleanup */ spin_lock_irq(&cleanup_list_lock); list_replace_init(&cleanup_list, &net_kill_list); spin_unlock_irq(&cleanup_list_lock); +again: + write = READ_ONCE(nr_sync_pernet_ops); + if (write) + down_write(&net_sem); + else + down_read(&net_sem); - down_read(&net_sem); - if (nr_sync_pernet_ops) - mutex_lock(&net_mutex); + if (!write && unlikely(READ_ONCE(nr_sync_pernet_ops))) { + up_read(&net_sem); + goto again; + } /* Don't let anyone else find us. */ rtnl_lock(); @@ -528,14 +541,14 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list); - if (nr_sync_pernet_ops) - mutex_unlock(&net_mutex); - /* Free the net generic variables */ list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list); - up_read(&net_sem); + if (write) + up_write(&net_sem); + else + up_read(&net_sem); /* Ensure there are no outstanding rcu callbacks using this * network namespace. @@ -563,8 +576,6 @@ static void cleanup_net(struct work_struct *work) void net_ns_barrier(void) { down_write(&net_sem); - mutex_lock(&net_mutex); - mutex_unlock(&net_mutex); up_write(&net_sem); } EXPORT_SYMBOL(net_ns_barrier); -- cgit v1.2.3-70-g09d2 From 65b7b5b90fcd17b25ef43b0cd02bda47bf286675 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 12:58:45 +0300 Subject: net: Make cleanup_list and net::cleanup_list of llist type This simplifies cleanup queueing and makes cleanup lists to use llist primitives. Since llist has its own cmpxchg() ordering, cleanup_list_lock is not more need. Also, struct llist_node is smaller, than struct list_head, so we save some bytes in struct net with this patch. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/net/net_namespace.h | 3 ++- net/core/net_namespace.c | 20 ++++++-------------- 2 files changed, 8 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 115b01b92f4d..d4417495773a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -59,12 +59,13 @@ struct net { atomic64_t cookie_gen; struct list_head list; /* list of network namespaces */ - struct list_head cleanup_list; /* namespaces on death row */ struct list_head exit_list; /* To linked to call pernet exit * methods on dead net (net_sem * read locked), or to unregister * pernet ops (net_sem wr locked). */ + struct llist_node cleanup_list; /* namespaces on death row */ + struct user_namespace *user_ns; /* Owning user namespace */ struct ucounts *ucounts; spinlock_t nsid_lock; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e89a516620dd..abf8a46e94e2 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -481,21 +481,18 @@ static void unhash_nsid(struct net *net, struct net *last) spin_unlock_bh(&net->nsid_lock); } -static DEFINE_SPINLOCK(cleanup_list_lock); -static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ +static LLIST_HEAD(cleanup_list); static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; struct net *net, *tmp, *last; - struct list_head net_kill_list; + struct llist_node *net_kill_list; LIST_HEAD(net_exit_list); unsigned write; /* Atomically snapshot the list of namespaces to cleanup */ - spin_lock_irq(&cleanup_list_lock); - list_replace_init(&cleanup_list, &net_kill_list); - spin_unlock_irq(&cleanup_list_lock); + net_kill_list = llist_del_all(&cleanup_list); again: write = READ_ONCE(nr_sync_pernet_ops); if (write) @@ -510,7 +507,7 @@ again: /* Don't let anyone else find us. */ rtnl_lock(); - list_for_each_entry(net, &net_kill_list, cleanup_list) + llist_for_each_entry(net, net_kill_list, cleanup_list) list_del_rcu(&net->list); /* Cache last net. After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer @@ -525,7 +522,7 @@ again: last = list_last_entry(&net_namespace_list, struct net, list); rtnl_unlock(); - list_for_each_entry(net, &net_kill_list, cleanup_list) { + llist_for_each_entry(net, net_kill_list, cleanup_list) { unhash_nsid(net, last); list_add_tail(&net->exit_list, &net_exit_list); } @@ -585,12 +582,7 @@ static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) { /* Cleanup the network namespace in process context */ - unsigned long flags; - - spin_lock_irqsave(&cleanup_list_lock, flags); - list_add(&net->cleanup_list, &cleanup_list); - spin_unlock_irqrestore(&cleanup_list_lock, flags); - + llist_add(&net->cleanup_list, &cleanup_list); queue_work(netns_wq, &net_cleanup_work); } EXPORT_SYMBOL_GPL(__put_net); -- cgit v1.2.3-70-g09d2 From 8349efd903394422d1598d196b6be70c410cf8f5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 12:58:54 +0300 Subject: net: Queue net_cleanup_work only if there is first net added When llist_add() returns false, cleanup_net() hasn't made its llist_del_all(), while the work has already been scheduled by the first queuer. So, we may skip queue_work() in this case. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/core/net_namespace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index abf8a46e94e2..27a55236ad64 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -582,8 +582,8 @@ static DECLARE_WORK(net_cleanup_work, cleanup_net); void __put_net(struct net *net) { /* Cleanup the network namespace in process context */ - llist_add(&net->cleanup_list, &cleanup_list); - queue_work(netns_wq, &net_cleanup_work); + if (llist_add(&net->cleanup_list, &cleanup_list)) + queue_work(netns_wq, &net_cleanup_work); } EXPORT_SYMBOL_GPL(__put_net); -- cgit v1.2.3-70-g09d2 From cc944ead839a28b07cea4f90878f2b7e0cb71a02 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Tue, 20 Feb 2018 08:44:20 +0100 Subject: devlink: Move size validation to core Currently the size validation is done via a cb, which is unneeded. The size validation can be moved to core. The next patch will perform cleanup. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 18d385ed8237..88e846779269 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2338,6 +2338,32 @@ out: resource->size_valid = size_valid; } +static int +devlink_resource_validate_size(struct devlink_resource *resource, u64 size, + struct netlink_ext_ack *extack) +{ + u64 reminder; + int err = 0; + + if (size > resource->size_params->size_max) { + NL_SET_ERR_MSG_MOD(extack, "Size larger than maximum"); + err = -EINVAL; + } + + if (size < resource->size_params->size_min) { + NL_SET_ERR_MSG_MOD(extack, "Size smaller than minimum"); + err = -EINVAL; + } + + div64_u64_rem(size, resource->size_params->size_granularity, &reminder); + if (reminder) { + NL_SET_ERR_MSG_MOD(extack, "Wrong granularity"); + err = -EINVAL; + } + + return err; +} + static int devlink_nl_cmd_resource_set(struct sk_buff *skb, struct genl_info *info) { @@ -2356,12 +2382,8 @@ static int devlink_nl_cmd_resource_set(struct sk_buff *skb, if (!resource) return -EINVAL; - if (!resource->resource_ops->size_validate) - return -EINVAL; - size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]); - err = resource->resource_ops->size_validate(devlink, size, - info->extack); + err = devlink_resource_validate_size(resource, size, info->extack); if (err) return err; -- cgit v1.2.3-70-g09d2 From 3fef2b6290e85de93a7096381e77f9a1b7544278 Mon Sep 17 00:00:00 2001 From: Antonio Cardace Date: Mon, 19 Feb 2018 11:37:15 +0000 Subject: x25: use %*ph to print small buffer Use %*ph format to print small buffer as hex string. Suggested-by: Andy Shevchenko Signed-off-by: Antonio Cardace Signed-off-by: David S. Miller --- net/x25/x25_subr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c index db0b1315d577..9c214ec681ac 100644 --- a/net/x25/x25_subr.c +++ b/net/x25/x25_subr.c @@ -335,8 +335,7 @@ int x25_decode(struct sock *sk, struct sk_buff *skb, int *ns, int *nr, int *q, } } - pr_debug("invalid PLP frame %02X %02X %02X\n", - frame[0], frame[1], frame[2]); + pr_debug("invalid PLP frame %3ph\n", frame); return X25_ILLEGAL; } -- cgit v1.2.3-70-g09d2 From ccc007e4a746bb592d3e72106f00241f81d51410 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Thu, 15 Feb 2018 19:42:43 +0200 Subject: net: sched: add em_ipt ematch for calling xtables matches The commit a new tc ematch for using netfilter xtable matches. This allows early classification as well as mirroning/redirecting traffic based on logic implemented in netfilter extensions. Current supported use case is classification based on the incoming IPSec state used during decpsulation using the 'policy' iptables extension (xt_policy). The module dynamically fetches the netfilter match module and calls it using a fake xt_action_param structure based on validated userspace provided parameters. As the xt_policy match does not access skb->data, no skb modifications are needed on match. Signed-off-by: Eyal Birger Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 3 +- include/uapi/linux/tc_ematch/tc_em_ipt.h | 20 +++ net/sched/Kconfig | 12 ++ net/sched/Makefile | 1 + net/sched/em_ipt.c | 257 +++++++++++++++++++++++++++++++ 5 files changed, 292 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/tc_ematch/tc_em_ipt.h create mode 100644 net/sched/em_ipt.c (limited to 'net') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 46c506615f4a..7cafb26df555 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -555,7 +555,8 @@ enum { #define TCF_EM_VLAN 6 #define TCF_EM_CANID 7 #define TCF_EM_IPSET 8 -#define TCF_EM_MAX 8 +#define TCF_EM_IPT 9 +#define TCF_EM_MAX 9 enum { TCF_EM_PROG_TC diff --git a/include/uapi/linux/tc_ematch/tc_em_ipt.h b/include/uapi/linux/tc_ematch/tc_em_ipt.h new file mode 100644 index 000000000000..49a65530992c --- /dev/null +++ b/include/uapi/linux/tc_ematch/tc_em_ipt.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_TC_EM_IPT_H +#define __LINUX_TC_EM_IPT_H + +#include +#include + +enum { + TCA_EM_IPT_UNSPEC, + TCA_EM_IPT_HOOK, + TCA_EM_IPT_MATCH_NAME, + TCA_EM_IPT_MATCH_REVISION, + TCA_EM_IPT_NFPROTO, + TCA_EM_IPT_MATCH_DATA, + __TCA_EM_IPT_MAX +}; + +#define TCA_EM_IPT_MAX (__TCA_EM_IPT_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index f24a6ae6819a..a01169fb5325 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -658,6 +658,18 @@ config NET_EMATCH_IPSET To compile this code as a module, choose M here: the module will be called em_ipset. +config NET_EMATCH_IPT + tristate "IPtables Matches" + depends on NET_EMATCH && NETFILTER && NETFILTER_XTABLES + ---help--- + Say Y here to be able to classify packets based on iptables + matches. + Current supported match is "policy" which allows packet classification + based on IPsec policy that was used during decapsulation + + To compile this code as a module, choose M here: the + module will be called em_ipt. + config NET_CLS_ACT bool "Actions" select NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index 5b635447e3f8..8811d3804878 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -75,3 +75,4 @@ obj-$(CONFIG_NET_EMATCH_META) += em_meta.o obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o obj-$(CONFIG_NET_EMATCH_CANID) += em_canid.o obj-$(CONFIG_NET_EMATCH_IPSET) += em_ipset.o +obj-$(CONFIG_NET_EMATCH_IPT) += em_ipt.o diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c new file mode 100644 index 000000000000..a5f34e930eff --- /dev/null +++ b/net/sched/em_ipt.c @@ -0,0 +1,257 @@ +/* + * net/sched/em_ipt.c IPtables matches Ematch + * + * (c) 2018 Eyal Birger + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct em_ipt_match { + const struct xt_match *match; + u32 hook; + u8 match_data[0] __aligned(8); +}; + +struct em_ipt_xt_match { + char *match_name; + int (*validate_match_data)(struct nlattr **tb, u8 mrev); +}; + +static const struct nla_policy em_ipt_policy[TCA_EM_IPT_MAX + 1] = { + [TCA_EM_IPT_MATCH_NAME] = { .type = NLA_STRING, + .len = XT_EXTENSION_MAXNAMELEN }, + [TCA_EM_IPT_MATCH_REVISION] = { .type = NLA_U8 }, + [TCA_EM_IPT_HOOK] = { .type = NLA_U32 }, + [TCA_EM_IPT_NFPROTO] = { .type = NLA_U8 }, + [TCA_EM_IPT_MATCH_DATA] = { .type = NLA_UNSPEC }, +}; + +static int check_match(struct net *net, struct em_ipt_match *im, int mdata_len) +{ + struct xt_mtchk_param mtpar = {}; + union { + struct ipt_entry e4; + struct ip6t_entry e6; + } e = {}; + + mtpar.net = net; + mtpar.table = "filter"; + mtpar.hook_mask = 1 << im->hook; + mtpar.family = im->match->family; + mtpar.match = im->match; + mtpar.entryinfo = &e; + mtpar.matchinfo = (void *)im->match_data; + return xt_check_match(&mtpar, mdata_len, 0, 0); +} + +static int policy_validate_match_data(struct nlattr **tb, u8 mrev) +{ + if (mrev != 0) { + pr_err("only policy match revision 0 supported"); + return -EINVAL; + } + + if (nla_get_u32(tb[TCA_EM_IPT_HOOK]) != NF_INET_PRE_ROUTING) { + pr_err("policy can only be matched on NF_INET_PRE_ROUTING"); + return -EINVAL; + } + + return 0; +} + +static const struct em_ipt_xt_match em_ipt_xt_matches[] = { + { + .match_name = "policy", + .validate_match_data = policy_validate_match_data + }, + {} +}; + +static struct xt_match *get_xt_match(struct nlattr **tb) +{ + const struct em_ipt_xt_match *m; + struct nlattr *mname_attr; + u8 nfproto, mrev = 0; + int ret; + + mname_attr = tb[TCA_EM_IPT_MATCH_NAME]; + for (m = em_ipt_xt_matches; m->match_name; m++) { + if (!nla_strcmp(mname_attr, m->match_name)) + break; + } + + if (!m->match_name) { + pr_err("Unsupported xt match"); + return ERR_PTR(-EINVAL); + } + + if (tb[TCA_EM_IPT_MATCH_REVISION]) + mrev = nla_get_u8(tb[TCA_EM_IPT_MATCH_REVISION]); + + ret = m->validate_match_data(tb, mrev); + if (ret < 0) + return ERR_PTR(ret); + + nfproto = nla_get_u8(tb[TCA_EM_IPT_NFPROTO]); + return xt_request_find_match(nfproto, m->match_name, mrev); +} + +static int em_ipt_change(struct net *net, void *data, int data_len, + struct tcf_ematch *em) +{ + struct nlattr *tb[TCA_EM_IPT_MAX + 1]; + struct em_ipt_match *im = NULL; + struct xt_match *match; + int mdata_len, ret; + + ret = nla_parse(tb, TCA_EM_IPT_MAX, data, data_len, em_ipt_policy, + NULL); + if (ret < 0) + return ret; + + if (!tb[TCA_EM_IPT_HOOK] || !tb[TCA_EM_IPT_MATCH_NAME] || + !tb[TCA_EM_IPT_MATCH_DATA] || !tb[TCA_EM_IPT_NFPROTO]) + return -EINVAL; + + match = get_xt_match(tb); + if (IS_ERR(match)) { + pr_err("unable to load match\n"); + return PTR_ERR(match); + } + + mdata_len = XT_ALIGN(nla_len(tb[TCA_EM_IPT_MATCH_DATA])); + im = kzalloc(sizeof(*im) + mdata_len, GFP_KERNEL); + if (!im) { + ret = -ENOMEM; + goto err; + } + + im->match = match; + im->hook = nla_get_u32(tb[TCA_EM_IPT_HOOK]); + nla_memcpy(im->match_data, tb[TCA_EM_IPT_MATCH_DATA], mdata_len); + + ret = check_match(net, im, mdata_len); + if (ret) + goto err; + + em->datalen = sizeof(*im) + mdata_len; + em->data = (unsigned long)im; + return 0; + +err: + kfree(im); + module_put(match->me); + return ret; +} + +static void em_ipt_destroy(struct tcf_ematch *em) +{ + struct em_ipt_match *im = (void *)em->data; + + if (!im) + return; + + if (im->match->destroy) { + struct xt_mtdtor_param par = { + .net = em->net, + .match = im->match, + .matchinfo = im->match_data, + .family = im->match->family + }; + im->match->destroy(&par); + } + module_put(im->match->me); + kfree((void *)im); +} + +static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + const struct em_ipt_match *im = (const void *)em->data; + struct xt_action_param acpar = {}; + struct net_device *indev = NULL; + struct nf_hook_state state; + int ret; + + rcu_read_lock(); + + if (skb->skb_iif) + indev = dev_get_by_index_rcu(em->net, skb->skb_iif); + + nf_hook_state_init(&state, im->hook, im->match->family, + indev ?: skb->dev, skb->dev, NULL, em->net, NULL); + + acpar.match = im->match; + acpar.matchinfo = im->match_data; + acpar.state = &state; + + ret = im->match->match(skb, &acpar); + + rcu_read_unlock(); + return ret; +} + +static int em_ipt_dump(struct sk_buff *skb, struct tcf_ematch *em) +{ + struct em_ipt_match *im = (void *)em->data; + + if (nla_put_string(skb, TCA_EM_IPT_MATCH_NAME, im->match->name) < 0) + return -EMSGSIZE; + if (nla_put_u32(skb, TCA_EM_IPT_HOOK, im->hook) < 0) + return -EMSGSIZE; + if (nla_put_u8(skb, TCA_EM_IPT_MATCH_REVISION, im->match->revision) < 0) + return -EMSGSIZE; + if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->match->family) < 0) + return -EMSGSIZE; + if (nla_put(skb, TCA_EM_IPT_MATCH_DATA, + im->match->usersize ?: im->match->matchsize, + im->match_data) < 0) + return -EMSGSIZE; + + return 0; +} + +static struct tcf_ematch_ops em_ipt_ops = { + .kind = TCF_EM_IPT, + .change = em_ipt_change, + .destroy = em_ipt_destroy, + .match = em_ipt_match, + .dump = em_ipt_dump, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_ipt_ops.link) +}; + +static int __init init_em_ipt(void) +{ + return tcf_em_register(&em_ipt_ops); +} + +static void __exit exit_em_ipt(void) +{ + tcf_em_unregister(&em_ipt_ops); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eyal Birger "); +MODULE_DESCRIPTION("TC extended match for IPtables matches"); + +module_init(init_em_ipt); +module_exit(exit_em_ipt); + +MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPT); -- cgit v1.2.3-70-g09d2 From f905311356ecb9c88aceeb4fa63ee0a244fc2d72 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 19 Feb 2018 12:10:20 -0600 Subject: rds: send: mark expected switch fall-through in rds_rm_size In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Addresses-Coverity-ID: 1465362 ("Missing break in switch") Signed-off-by: Gustavo A. R. Silva Acked-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/send.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 028ab598ac1b..79d158b3def0 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -902,6 +902,8 @@ static int rds_rm_size(struct msghdr *msg, int num_sgs) case RDS_CMSG_ZCOPY_COOKIE: zcopy_cookie = true; + /* fall through */ + case RDS_CMSG_RDMA_DEST: case RDS_CMSG_RDMA_MAP: cmsg_groups |= 2; -- cgit v1.2.3-70-g09d2 From 0a6b2a1dc2a2105f178255fe495eb914b09cb37a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Feb 2018 11:56:47 -0800 Subject: tcp: switch to GSO being always on Oleksandr Natalenko reported performance issues with BBR without FQ packet scheduler that were root caused to lack of SG and GSO/TSO on his configuration. In this mode, TCP internal pacing has to setup a high resolution timer for each MSS sent. We could implement in TCP a strategy similar to the one adopted in commit fefa569a9d4b ("net_sched: sch_fq: account for schedule/timers drifts") or decide to finally switch TCP stack to a GSO only mode. This has many benefits : 1) Most TCP developments are done with TSO in mind. 2) Less high-resolution timers needs to be armed for TCP-pacing 3) GSO can benefit of xmit_more hint 4) Receiver GRO is more effective (as if TSO was used for real on sender) -> Lower ACK traffic 5) Write queues have less overhead (one skb holds about 64KB of payload) 6) SACK coalescing just works. 7) rtx rb-tree contains less packets, SACK is cheaper. This patch implements the minimum patch, but we can remove some legacy code as follow ups. Tested: On 40Gbit link, one netperf -t TCP_STREAM BBR+fq: sg on: 26 Gbits/sec sg off: 15.7 Gbits/sec (was 2.3 Gbit before patch) BBR+pfifo_fast: sg on: 24.2 Gbits/sec sg off: 14.9 Gbits/sec (was 0.66 Gbit before patch !!! ) BBR+fq_codel: sg on: 24.4 Gbits/sec sg off: 15 Gbits/sec (was 0.66 Gbit before patch !!! ) Signed-off-by: Eric Dumazet Reported-by: Oleksandr Natalenko Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/sock.c | 2 +- net/ipv4/tcp.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 3aa7b7d6e6c7..f0f576ff5603 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -417,6 +417,7 @@ struct sock { struct page_frag sk_frag; netdev_features_t sk_route_caps; netdev_features_t sk_route_nocaps; + netdev_features_t sk_route_forced_caps; int sk_gso_type; unsigned int sk_gso_max_size; gfp_t sk_allocation; diff --git a/net/core/sock.c b/net/core/sock.c index a1fa4a548f1b..507d8c6c4319 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1777,7 +1777,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) u32 max_segs = 1; sk_dst_set(sk, dst); - sk->sk_route_caps = dst->dev->features; + sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; sk->sk_route_caps &= ~sk->sk_route_nocaps; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 48636aee23c3..4b46a2ae46e3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -453,6 +453,7 @@ void tcp_init_sock(struct sock *sk) sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; sk_sockets_allocated_inc(sk); + sk->sk_route_forced_caps = NETIF_F_GSO; } EXPORT_SYMBOL(tcp_init_sock); -- cgit v1.2.3-70-g09d2 From 74d4a8f8d378ddbe7caf3331804c3d5a276a9b1a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Feb 2018 11:56:48 -0800 Subject: tcp: remove sk_can_gso() use After previous commit, sk_can_gso() is always true. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 34 ++++++++-------------------------- net/ipv4/tcp_input.c | 3 --- 2 files changed, 8 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4b46a2ae46e3..6f35c12af85a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -898,7 +898,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, struct tcp_sock *tp = tcp_sk(sk); u32 new_size_goal, size_goal; - if (!large_allowed || !sk_can_gso(sk)) + if (!large_allowed) return mss_now; /* Note : tcp_tso_autosize() will eventually split this later */ @@ -1103,27 +1103,11 @@ static int linear_payload_sz(bool first_skb) return 0; } -static int select_size(const struct sock *sk, bool sg, bool first_skb, bool zc) +static int select_size(bool first_skb, bool zc) { - const struct tcp_sock *tp = tcp_sk(sk); - int tmp = tp->mss_cache; - - if (sg) { - if (zc) - return 0; - - if (sk_can_gso(sk)) { - tmp = linear_payload_sz(first_skb); - } else { - int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); - - if (tmp >= pgbreak && - tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) - tmp = pgbreak; - } - } - - return tmp; + if (zc) + return 0; + return linear_payload_sz(first_skb); } void tcp_free_fastopen_req(struct tcp_sock *tp) @@ -1188,7 +1172,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; bool process_backlog = false; - bool sg, zc = false; + bool zc = false; long timeo; flags = msg->msg_flags; @@ -1269,8 +1253,6 @@ restart: if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; - sg = !!(sk->sk_route_caps & NETIF_F_SG); - while (msg_data_left(msg)) { int copy = 0; int max = size_goal; @@ -1298,7 +1280,7 @@ new_segment: goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); - linear = select_size(sk, sg, first_skb, zc); + linear = select_size(first_skb, zc); skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation, first_skb); if (!skb) @@ -1344,7 +1326,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i >= sysctl_max_skb_frags || !sg) { + if (i >= sysctl_max_skb_frags) { tcp_mark_push(tp, skb); goto new_segment; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a6b48f6253e3..06b9c4765f42 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1358,9 +1358,6 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, int len; int in_sack; - if (!sk_can_gso(sk)) - goto fallback; - /* Normally R but no L won't result in plain S */ if (!dup_sack && (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS) -- cgit v1.2.3-70-g09d2 From dead7cdb0daec58490891e59f4fae0c5c76fa5f3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Feb 2018 11:56:49 -0800 Subject: tcp: remove sk_check_csum_caps() Since TCP relies on GSO, we do not need this helper anymore. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 9 --------- net/ipv4/tcp.c | 11 +++-------- 2 files changed, 3 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index f0f576ff5603..b9624581d639 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1863,15 +1863,6 @@ static inline void sk_nocaps_add(struct sock *sk, netdev_features_t flags) sk->sk_route_caps &= ~flags; } -static inline bool sk_check_csum_caps(struct sock *sk) -{ - return (sk->sk_route_caps & NETIF_F_HW_CSUM) || - (sk->sk_family == PF_INET && - (sk->sk_route_caps & NETIF_F_IP_CSUM)) || - (sk->sk_family == PF_INET6 && - (sk->sk_route_caps & NETIF_F_IPV6_CSUM)); -} - static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, char *to, int copy, int offset) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6f35c12af85a..7c4140271887 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1063,8 +1063,7 @@ EXPORT_SYMBOL_GPL(do_tcp_sendpages); int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags) { - if (!(sk->sk_route_caps & NETIF_F_SG) || - !sk_check_csum_caps(sk)) + if (!(sk->sk_route_caps & NETIF_F_SG)) return sock_no_sendpage_locked(sk, page, offset, size, flags); tcp_rate_check_app_limited(sk); /* is sending application-limited? */ @@ -1190,7 +1189,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } - zc = sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG; + zc = sk->sk_route_caps & NETIF_F_SG; if (!zc) uarg->zerocopy = 0; } @@ -1287,11 +1286,7 @@ new_segment: goto wait_for_memory; process_backlog = true; - /* - * Check whether we can use HW checksum. - */ - if (sk_check_csum_caps(sk)) - skb->ip_summed = CHECKSUM_PARTIAL; + skb->ip_summed = CHECKSUM_PARTIAL; skb_entail(sk, skb); copy = size_goal; -- cgit v1.2.3-70-g09d2 From 65ec60973af97dab72094490b4d2e9b8e169456c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Feb 2018 11:56:50 -0800 Subject: tcp: tcp_sendmsg() only deals with CHECKSUM_PARTIAL We no longer have skbs with skb->ip_summed == CHECKSUM_NONE in TCP write queues. We can remove dead code in tcp_sendmsg(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7c4140271887..a33539798bf6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1254,14 +1254,10 @@ restart: while (msg_data_left(msg)) { int copy = 0; - int max = size_goal; skb = tcp_write_queue_tail(sk); - if (skb) { - if (skb->ip_summed == CHECKSUM_NONE) - max = mss_now; - copy = max - skb->len; - } + if (skb) + copy = size_goal - skb->len; if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; @@ -1290,7 +1286,6 @@ new_segment: skb_entail(sk, skb); copy = size_goal; - max = size_goal; /* All packets are restored as if they have * already been sent. skb_mstamp isn't set to @@ -1374,7 +1369,7 @@ new_segment: goto out; } - if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair)) + if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair)) continue; if (forced_push(tp)) { -- cgit v1.2.3-70-g09d2 From 4a64fd6ccf127973d1e2b2fc2f8024e550130617 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Feb 2018 11:56:51 -0800 Subject: tcp: remove dead code from tcp_set_skb_tso_segs() We no longer have skbs with skb->ip_summed == CHECKSUM_NONE in TCP write queues. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b2bca373f8be..0196923aec42 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1206,7 +1206,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Initialize TSO segments for a packet. */ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) { - if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) { + if (skb->len <= mss_now) { /* Avoid the costly divide in the normal * non-TSO case. */ -- cgit v1.2.3-70-g09d2 From 98be9b12096fb46773b4a509d3822fd17c82218e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Feb 2018 11:56:52 -0800 Subject: tcp: remove dead code after CHECKSUM_PARTIAL adoption Since all skbs in write/rtx queues have CHECKSUM_PARTIAL, we can remove dead code. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 13 +++---------- net/ipv4/tcp_output.c | 38 +++++--------------------------------- 2 files changed, 8 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f3e52bc98980..2c6aec2643e8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -561,16 +561,9 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr) { struct tcphdr *th = tcp_hdr(skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) { - th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); - } else { - th->check = tcp_v4_check(skb->len, saddr, daddr, - csum_partial(th, - th->doff << 2, - skb->csum)); - } + th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); } /* This routine computes an IPv4 TCP checksum. */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0196923aec42..8795d76f987c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1335,21 +1335,9 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; tcp_skb_fragment_eor(skb, buff); - if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { - /* Copy and checksum data tail into the new buffer. */ - buff->csum = csum_partial_copy_nocheck(skb->data + len, - skb_put(buff, nsize), - nsize, 0); - - skb_trim(skb, len); - - skb->csum = csum_block_sub(skb->csum, buff->csum, len); - } else { - skb->ip_summed = CHECKSUM_PARTIAL; - skb_split(skb, buff, len); - } + skb_split(skb, buff, len); - buff->ip_summed = skb->ip_summed; + buff->ip_summed = CHECKSUM_PARTIAL; buff->tstamp = skb->tstamp; tcp_fragment_tstamp(skb, buff); @@ -1901,7 +1889,7 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, tcp_skb_fragment_eor(skb, buff); - buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL; + buff->ip_summed = CHECKSUM_PARTIAL; skb_split(skb, buff, len); tcp_fragment_tstamp(skb, buff); @@ -2134,7 +2122,7 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; TCP_SKB_CB(nskb)->sacked = 0; nskb->csum = 0; - nskb->ip_summed = skb->ip_summed; + nskb->ip_summed = CHECKSUM_PARTIAL; tcp_insert_write_queue_before(nskb, skb, sk); tcp_highest_sack_replace(sk, skb, nskb); @@ -2142,14 +2130,7 @@ static int tcp_mtu_probe(struct sock *sk) len = 0; tcp_for_write_queue_from_safe(skb, next, sk) { copy = min_t(int, skb->len, probe_size - len); - if (nskb->ip_summed) { - skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); - } else { - __wsum csum = skb_copy_and_csum_bits(skb, 0, - skb_put(nskb, copy), - copy, 0); - nskb->csum = csum_block_add(nskb->csum, csum, len); - } + skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); if (skb->len <= copy) { /* We've eaten all the data from this skb. @@ -2166,9 +2147,6 @@ static int tcp_mtu_probe(struct sock *sk) ~(TCPHDR_FIN|TCPHDR_PSH); if (!skb_shinfo(skb)->nr_frags) { skb_pull(skb, copy); - if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->csum = csum_partial(skb->data, - skb->len, 0); } else { __pskb_trim_head(skb, copy); tcp_set_skb_tso_segs(skb, mss_now); @@ -2746,12 +2724,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) } tcp_highest_sack_replace(sk, next_skb, skb); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) - skb->ip_summed = CHECKSUM_PARTIAL; - - if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); - /* Update sequence range on original skb. */ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; -- cgit v1.2.3-70-g09d2 From cac56209a66ea3b0be67aa2966b2c628b944da1e Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Tue, 20 Feb 2018 08:55:58 -0500 Subject: net: Allow a rule to track originating protocol Allow a rule that is being added/deleted/modified or dumped to contain the originating protocol's id. The protocol is handled just like a routes originating protocol is. This is especially useful because there is starting to be a plethora of different user space programs adding rules. Allow the vrf device to specify that the kernel is the originator of the rule created for this device. Signed-off-by: Donald Sharp Signed-off-by: David S. Miller --- drivers/net/vrf.c | 1 + include/net/fib_rules.h | 3 ++- include/uapi/linux/fib_rules.h | 2 +- net/core/fib_rules.c | 7 ++++++- 4 files changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 239c78c53e58..951a4b42cb29 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1174,6 +1174,7 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) memset(frh, 0, sizeof(*frh)); frh->family = family; frh->action = FR_ACT_TO_TBL; + frh->proto = RTPROT_KERNEL; if (nla_put_u8(skb, FRA_L3MDEV, 1)) goto nla_put_failure; diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 648caf90ec07..b166ef07e6d4 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -26,7 +26,8 @@ struct fib_rule { u32 table; u8 action; u8 l3mdev; - /* 2 bytes hole, try to use */ + u8 proto; + /* 1 byte hole, try to use */ u32 target; __be64 tun_id; struct fib_rule __rcu *ctarget; diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 2b642bf9b5a0..925539172d5b 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -23,8 +23,8 @@ struct fib_rule_hdr { __u8 tos; __u8 table; + __u8 proto; __u8 res1; /* reserved */ - __u8 res2; /* reserved */ __u8 action; __u32 flags; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index cb071b8e8d17..88298f18cbae 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -51,6 +51,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, r->pref = pref; r->table = table; r->flags = flags; + r->proto = RTPROT_KERNEL; r->fr_net = ops->fro_net; r->uid_range = fib_kuid_range_unset; @@ -465,6 +466,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, } refcount_set(&rule->refcnt, 1); rule->fr_net = net; + rule->proto = frh->proto; rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) : fib_default_rule_pref(ops); @@ -664,6 +666,9 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } list_for_each_entry(rule, &ops->rules_list, list) { + if (frh->proto && (frh->proto != rule->proto)) + continue; + if (frh->action && (frh->action != rule->action)) continue; @@ -808,9 +813,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; - frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; + frh->proto = rule->proto; if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) -- cgit v1.2.3-70-g09d2 From 7299d6f7bfd1921c0cfb5e202155f1a5cfdb57d0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 19 Feb 2018 14:48:39 +0200 Subject: mac80211: support reporting A-MPDU EOF bit value/known Support getting the EOF bit value reported from hardware and writing it out to radiotap. Signed-off-by: Johannes Berg --- include/net/ieee80211_radiotap.h | 2 ++ include/net/mac80211.h | 5 +++++ net/mac80211/rx.c | 4 ++++ 3 files changed, 11 insertions(+) (limited to 'net') diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index d91f9e7f4d71..960236fb1681 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -149,6 +149,8 @@ enum ieee80211_radiotap_ampdu_flags { IEEE80211_RADIOTAP_AMPDU_IS_LAST = 0x0008, IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_ERR = 0x0010, IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_KNOWN = 0x0020, + IEEE80211_RADIOTAP_AMPDU_EOF = 0x0040, + IEEE80211_RADIOTAP_AMPDU_EOF_KNOWN = 0x0080, }; /* for IEEE80211_RADIOTAP_VHT */ diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 854037b8163e..649f073eb6df 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1099,6 +1099,9 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * the first subframe. * @RX_FLAG_ICV_STRIPPED: The ICV is stripped from this frame. CRC checking must * be done in the hardware. + * @RX_FLAG_AMPDU_EOF_BIT: Value of the EOF bit in the A-MPDU delimiter for this + * frame + * @RX_FLAG_AMPDU_EOF_BIT_KNOWN: The EOF value is known */ enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), @@ -1125,6 +1128,8 @@ enum mac80211_rx_flags { RX_FLAG_MIC_STRIPPED = BIT(21), RX_FLAG_ALLOW_SAME_PN = BIT(22), RX_FLAG_ICV_STRIPPED = BIT(23), + RX_FLAG_AMPDU_EOF_BIT = BIT(24), + RX_FLAG_AMPDU_EOF_BIT_KNOWN = BIT(25), }; /** diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index e755f93ad735..478a9c735edb 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -439,6 +439,10 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_ERR; if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN) flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_KNOWN; + if (status->flag & RX_FLAG_AMPDU_EOF_BIT_KNOWN) + flags |= IEEE80211_RADIOTAP_AMPDU_EOF_KNOWN; + if (status->flag & RX_FLAG_AMPDU_EOF_BIT) + flags |= IEEE80211_RADIOTAP_AMPDU_EOF; put_unaligned_le16(flags, pos); pos += 2; if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN) -- cgit v1.2.3-70-g09d2 From a1f2ba04cc92414b6b933289365eab878b0b2bf4 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Mon, 19 Feb 2018 14:48:40 +0200 Subject: mac80211: add get TID helper Extracting the TID from the QOS header is common enough to justify helper. Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 12 ++++++++++++ net/mac80211/iface.c | 3 +-- net/mac80211/michael.c | 2 +- net/mac80211/mlme.c | 2 +- net/mac80211/rc80211_minstrel_ht.c | 2 +- net/mac80211/rx.c | 6 ++---- net/mac80211/tx.c | 9 ++------- net/mac80211/wpa.c | 8 +++----- 8 files changed, 23 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e4cba332b705..8fe7e4306816 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -8,6 +8,7 @@ * Copyright (c) 2006, Michael Wu * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -2501,6 +2502,17 @@ static inline u8 *ieee80211_get_qos_ctl(struct ieee80211_hdr *hdr) return (u8 *)hdr + 24; } +/** + * ieee80211_get_tid - get qos TID + * @hdr: the frame + */ +static inline u8 ieee80211_get_tid(struct ieee80211_hdr *hdr) +{ + u8 *qc = ieee80211_get_qos_ctl(hdr); + + return qc[0] & IEEE80211_QOS_CTL_TID_MASK; +} + /** * ieee80211_get_SA - get pointer to SA * @hdr: the frame diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 5fe01f82df12..d13ba064951f 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1324,8 +1324,7 @@ static void ieee80211_iface_work(struct work_struct *work) mutex_lock(&local->sta_mtx); sta = sta_info_get_bss(sdata, mgmt->sa); if (sta) { - u16 tid = *ieee80211_get_qos_ctl(hdr) & - IEEE80211_QOS_CTL_TID_MASK; + u16 tid = ieee80211_get_tid(hdr); __ieee80211_stop_rx_ba_session( sta, tid, WLAN_BACK_RECIPIENT, diff --git a/net/mac80211/michael.c b/net/mac80211/michael.c index 408649bd4702..37e172701a63 100644 --- a/net/mac80211/michael.c +++ b/net/mac80211/michael.c @@ -35,7 +35,7 @@ static void michael_mic_hdr(struct michael_mic_ctx *mctx, const u8 *key, da = ieee80211_get_DA(hdr); sa = ieee80211_get_SA(hdr); if (ieee80211_is_data_qos(hdr->frame_control)) - tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + tid = ieee80211_get_tid(hdr); else tid = 0; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 39b660b9a908..010b127a3937 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2151,7 +2151,7 @@ static void ieee80211_sta_tx_wmm_ac_notify(struct ieee80211_sub_if_data *sdata, u16 tx_time) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - u16 tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + u16 tid = ieee80211_get_tid(hdr); int ac = ieee80211_ac_from_tid(tid); struct ieee80211_sta_tx_tspec *tx_tspec = &ifmgd->tx_tspec[ac]; unsigned long now = jiffies; diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 4a5bdad9f303..fb586b6e5d49 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -669,7 +669,7 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb) if (unlikely(skb->protocol == cpu_to_be16(ETH_P_PAE))) return; - tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + tid = ieee80211_get_tid(hdr); if (likely(sta->ampdu_mlme.tid_tx[tid])) return; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 478a9c735edb..3dc162ddc3a6 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1189,7 +1189,7 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, ack_policy = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_ACK_POLICY_MASK; - tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + tid = ieee80211_get_tid(hdr); tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); if (!tid_agg_rx) { @@ -1528,9 +1528,7 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx) ieee80211_has_pm(hdr->frame_control) && (ieee80211_is_data_qos(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control))) { - u8 tid; - - tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + u8 tid = ieee80211_get_tid(hdr); ieee80211_sta_uapsd_trigger(&rx->sta->sta, tid); } diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 5decd0fb247c..7643178ef132 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -797,7 +797,6 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data; - u8 *qc; int tid; /* @@ -844,9 +843,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx) return TX_CONTINUE; /* include per-STA, per-TID sequence counter */ - - qc = ieee80211_get_qos_ctl(hdr); - tid = *qc & IEEE80211_QOS_CTL_TID_MASK; + tid = ieee80211_get_tid(hdr); tx->sta->tx_stats.msdu[tid]++; hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid); @@ -1158,7 +1155,6 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, struct ieee80211_hdr *hdr; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int tid; - u8 *qc; memset(tx, 0, sizeof(*tx)); tx->skb = skb; @@ -1198,8 +1194,7 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata, !ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) { struct tid_ampdu_tx *tid_tx; - qc = ieee80211_get_qos_ctl(hdr); - tid = *qc & IEEE80211_QOS_CTL_TID_MASK; + tid = ieee80211_get_tid(hdr); tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]); if (tid_tx) { diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 785056cb76f6..58d0b258b684 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -340,7 +340,7 @@ static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad) a4_included = ieee80211_has_a4(hdr->frame_control); if (ieee80211_is_data_qos(hdr->frame_control)) - qos_tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; + qos_tid = ieee80211_get_tid(hdr); else qos_tid = 0; @@ -601,8 +601,7 @@ static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad) aad[23] = 0; if (ieee80211_is_data_qos(hdr->frame_control)) - qos_tid = *ieee80211_get_qos_ctl(hdr) & - IEEE80211_QOS_CTL_TID_MASK; + qos_tid = ieee80211_get_tid(hdr); else qos_tid = 0; @@ -867,8 +866,7 @@ ieee80211_crypto_cs_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; if (ieee80211_is_data_qos(hdr->frame_control)) - qos_tid = *ieee80211_get_qos_ctl(hdr) & - IEEE80211_QOS_CTL_TID_MASK; + qos_tid = ieee80211_get_tid(hdr); else qos_tid = 0; -- cgit v1.2.3-70-g09d2 From 94ba92713f8329c96e0a8e2880b3c1a785d1c95c Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 19 Feb 2018 14:48:41 +0200 Subject: mac80211: Call mgd_prep_tx before transmitting deauthentication In multi channel scenarios, when disassociating from the AP before a beacon was heard from the AP, it is not guaranteed that the virtual interface is granted air time for the transmission of the deauthentication frame. This in turn can lead to various issues as the AP might never get the deauthentication frame. To mitigate such possible issues, add a HW flag indicating that the driver requires mac80211 to call the mgd_prep_tx() driver callback to make sure that the virtual interface is granted immediate airtime to be able to transmit the frame, in case that no beacon was heard from the AP. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/mac80211.h | 13 +++++++++++++ net/mac80211/debugfs.c | 1 + net/mac80211/mlme.c | 16 +++++++++++++++- 3 files changed, 29 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 649f073eb6df..dc3e9d9c3527 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -6,6 +6,7 @@ * Copyright 2007-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -2069,6 +2070,14 @@ struct ieee80211_txq { * @IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA: Hardware supports buffer STA on * TDLS links. * + * @IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP: The driver requires the + * mgd_prepare_tx() callback to be called before transmission of a + * deauthentication frame in case the association was completed but no + * beacon was heard. This is required in multi-channel scenarios, where the + * virtual interface might not be given air time for the transmission of + * the frame, as it is not synced with the AP/P2P GO yet, and thus the + * deauthentication frame might not be transmitted. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2112,6 +2121,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_REPORTS_LOW_ACK, IEEE80211_HW_SUPPORTS_TX_FRAG, IEEE80211_HW_SUPPORTS_TDLS_BUFFER_STA, + IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS @@ -3356,6 +3366,9 @@ enum ieee80211_reconfig_type { * management frame prior to having successfully associated to allow the * driver to give it channel time for the transmission, to get a response * and to be able to synchronize with the GO. + * For drivers that set %IEEE80211_HW_DEAUTH_NEED_MGD_TX_PREP, mac80211 + * would also call this function before transmitting a deauthentication + * frame in case that no beacon was heard from the AP/P2P GO. * The callback will be called before each transmission and upon return * mac80211 will transmit the frame right away. * The callback is optional and can (should!) sleep. diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 1f466d12a6bc..a75653affbf7 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -212,6 +212,7 @@ static const char *hw_flag_names[] = { FLAG(REPORTS_LOW_ACK), FLAG(SUPPORTS_TX_FRAG), FLAG(SUPPORTS_TDLS_BUFFER_STA), + FLAG(DEAUTH_NEED_MGD_TX_PREP), #undef FLAG }; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 010b127a3937..0024eff9bb84 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -7,6 +7,7 @@ * Copyright 2007, Michael Wu * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -2008,9 +2009,22 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, ieee80211_flush_queues(local, sdata, true); /* deauthenticate/disassociate now */ - if (tx || frame_buf) + if (tx || frame_buf) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + /* + * In multi channel scenarios guarantee that the virtual + * interface is granted immediate airtime to transmit the + * deauthentication frame by calling mgd_prepare_tx, if the + * driver requested so. + */ + if (ieee80211_hw_check(&local->hw, DEAUTH_NEED_MGD_TX_PREP) && + !ifmgd->have_beacon) + drv_mgd_prepare_tx(sdata->local, sdata); + ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid, stype, reason, tx, frame_buf); + } /* flush out frame - make sure the deauth was actually sent */ if (tx) -- cgit v1.2.3-70-g09d2 From 79a5b9727a1cceacd49921b78425ebda91836bd6 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 22 Feb 2018 13:40:27 -0800 Subject: rds: rds_msg_zcopy should return error of null rm->data.op_mmp_znotifier if either or both of MSG_ZEROCOPY and SOCK_ZEROCOPY have not been specified, the rm->data.op_mmp_znotifier allocation will be skipped. In this case, it is invalid ot pass down a cmsghdr with RDS_CMSG_ZCOPY_COOKIE, so return EINVAL from rds_msg_zcopy for this case. Reported-by: syzbot+f893ae7bb2f6456dfbc3@syzkaller.appspotmail.com Fixes: 0cebaccef3ac ("rds: zerocopy Tx support.") Signed-off-by: Sowmini Varadhan Acked-by: Willem de Bruijn Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/send.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 79d158b3def0..acad04243b41 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -941,7 +941,8 @@ static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm, { u32 *cookie; - if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie))) + if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)) || + !rm->data.op_mmp_znotifier) return -EINVAL; cookie = CMSG_DATA(cmsg); rm->data.op_mmp_znotifier->z_cookie = *cookie; -- cgit v1.2.3-70-g09d2 From 1b71af6053af1bd2f849e9fda4f71c1e3f145dcf Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Fri, 23 Feb 2018 14:01:52 -0500 Subject: net: fib_rules: Add new attribute to set protocol For ages iproute2 has used `struct rtmsg` as the ancillary header for FIB rules and in the process set the protocol value to RTPROT_BOOT. Until ca56209a66 ("net: Allow a rule to track originating protocol") the kernel rules code ignored the protocol value sent from userspace and always returned 0 in notifications. To avoid incompatibility with existing iproute2, send the protocol as a new attribute. Fixes: cac56209a66 ("net: Allow a rule to track originating protocol") Signed-off-by: Donald Sharp Signed-off-by: David S. Miller --- drivers/net/vrf.c | 5 ++++- include/net/fib_rules.h | 3 ++- include/uapi/linux/fib_rules.h | 5 +++-- net/core/fib_rules.c | 15 +++++++++++---- 4 files changed, 20 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 951a4b42cb29..9ce0182223a0 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1145,6 +1145,7 @@ static inline size_t vrf_fib_rule_nl_size(void) sz = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)); sz += nla_total_size(sizeof(u8)); /* FRA_L3MDEV */ sz += nla_total_size(sizeof(u32)); /* FRA_PRIORITY */ + sz += nla_total_size(sizeof(u8)); /* FRA_PROTOCOL */ return sz; } @@ -1174,7 +1175,9 @@ static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) memset(frh, 0, sizeof(*frh)); frh->family = family; frh->action = FR_ACT_TO_TBL; - frh->proto = RTPROT_KERNEL; + + if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL)) + goto nla_put_failure; if (nla_put_u8(skb, FRA_L3MDEV, 1)) goto nla_put_failure; diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index b166ef07e6d4..b3d216249240 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -109,7 +109,8 @@ struct fib_rule_notifier_info { [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ [FRA_GOTO] = { .type = NLA_U32 }, \ [FRA_L3MDEV] = { .type = NLA_U8 }, \ - [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) } + [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, \ + [FRA_PROTOCOL] = { .type = NLA_U8 } static inline void fib_rule_get(struct fib_rule *rule) { diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 925539172d5b..77d90ae38114 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -23,8 +23,8 @@ struct fib_rule_hdr { __u8 tos; __u8 table; - __u8 proto; - __u8 res1; /* reserved */ + __u8 res1; /* reserved */ + __u8 res2; /* reserved */ __u8 action; __u32 flags; @@ -58,6 +58,7 @@ enum { FRA_PAD, FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ FRA_UID_RANGE, /* UID range */ + FRA_PROTOCOL, /* Originator of the rule */ __FRA_MAX }; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 88298f18cbae..a6aea805a0a2 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -466,11 +466,13 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, } refcount_set(&rule->refcnt, 1); rule->fr_net = net; - rule->proto = frh->proto; rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) : fib_default_rule_pref(ops); + rule->proto = tb[FRA_PROTOCOL] ? + nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC; + if (tb[FRA_IIFNAME]) { struct net_device *dev; @@ -666,7 +668,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } list_for_each_entry(rule, &ops->rules_list, list) { - if (frh->proto && (frh->proto != rule->proto)) + if (tb[FRA_PROTOCOL] && + (rule->proto != nla_get_u8(tb[FRA_PROTOCOL]))) continue; if (frh->action && (frh->action != rule->action)) @@ -786,7 +789,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ - + nla_total_size(sizeof(struct fib_kuid_range)); + + nla_total_size(sizeof(struct fib_kuid_range)) + + nla_total_size(1); /* FRA_PROTOCOL */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -813,9 +817,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen)) goto nla_put_failure; frh->res1 = 0; + frh->res2 = 0; frh->action = rule->action; frh->flags = rule->flags; - frh->proto = rule->proto; + + if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto)) + goto nla_put_failure; if (rule->action == FR_ACT_GOTO && rcu_access_pointer(rule->ctarget) == NULL) -- cgit v1.2.3-70-g09d2 From 7e81c55ee94338e4fe5bdd87e5fb15dbf040156f Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Tue, 27 Feb 2018 17:56:10 +0100 Subject: batman-adv: Start new development cycle Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index f7ba3f96d8f3..69bfedfad174 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -25,7 +25,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2018.0" +#define BATADV_SOURCE_VERSION "2018.1" #endif /* B.A.T.M.A.N. parameters */ -- cgit v1.2.3-70-g09d2 From 6b1aea8cf2c8618146edaf6b35775ab55f7cafe5 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 1 Jan 2018 00:00:00 +0100 Subject: batman-adv: Update copyright years for 2018 Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 2 +- include/uapi/linux/batman_adv.h | 2 +- net/batman-adv/Kconfig | 2 +- net/batman-adv/Makefile | 2 +- net/batman-adv/bat_algo.c | 2 +- net/batman-adv/bat_algo.h | 2 +- net/batman-adv/bat_iv_ogm.c | 2 +- net/batman-adv/bat_iv_ogm.h | 2 +- net/batman-adv/bat_v.c | 2 +- net/batman-adv/bat_v.h | 2 +- net/batman-adv/bat_v_elp.c | 2 +- net/batman-adv/bat_v_elp.h | 2 +- net/batman-adv/bat_v_ogm.c | 2 +- net/batman-adv/bat_v_ogm.h | 2 +- net/batman-adv/bitarray.c | 2 +- net/batman-adv/bitarray.h | 2 +- net/batman-adv/bridge_loop_avoidance.c | 2 +- net/batman-adv/bridge_loop_avoidance.h | 2 +- net/batman-adv/debugfs.c | 2 +- net/batman-adv/debugfs.h | 2 +- net/batman-adv/distributed-arp-table.c | 2 +- net/batman-adv/distributed-arp-table.h | 2 +- net/batman-adv/fragmentation.c | 2 +- net/batman-adv/fragmentation.h | 2 +- net/batman-adv/gateway_client.c | 2 +- net/batman-adv/gateway_client.h | 2 +- net/batman-adv/gateway_common.c | 2 +- net/batman-adv/gateway_common.h | 2 +- net/batman-adv/hard-interface.c | 2 +- net/batman-adv/hard-interface.h | 2 +- net/batman-adv/hash.c | 2 +- net/batman-adv/hash.h | 2 +- net/batman-adv/icmp_socket.c | 2 +- net/batman-adv/icmp_socket.h | 2 +- net/batman-adv/log.c | 2 +- net/batman-adv/log.h | 2 +- net/batman-adv/main.c | 2 +- net/batman-adv/main.h | 2 +- net/batman-adv/multicast.c | 2 +- net/batman-adv/multicast.h | 2 +- net/batman-adv/netlink.c | 2 +- net/batman-adv/netlink.h | 2 +- net/batman-adv/network-coding.c | 2 +- net/batman-adv/network-coding.h | 2 +- net/batman-adv/originator.c | 2 +- net/batman-adv/originator.h | 2 +- net/batman-adv/routing.c | 2 +- net/batman-adv/routing.h | 2 +- net/batman-adv/send.c | 2 +- net/batman-adv/send.h | 2 +- net/batman-adv/soft-interface.c | 2 +- net/batman-adv/soft-interface.h | 2 +- net/batman-adv/sysfs.c | 2 +- net/batman-adv/sysfs.h | 2 +- net/batman-adv/tp_meter.c | 2 +- net/batman-adv/tp_meter.h | 2 +- net/batman-adv/translation-table.c | 2 +- net/batman-adv/translation-table.h | 2 +- net/batman-adv/tvlv.c | 2 +- net/batman-adv/tvlv.h | 2 +- net/batman-adv/types.h | 2 +- 61 files changed, 61 insertions(+), 61 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 5cb360be2a11..daefd7283896 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index ae00c99cbed0..56ae28934070 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: MIT */ -/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2018 B.A.T.M.A.N. contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index c44f6515be5e..e4e2e02b7380 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 022f6e77307b..b97ba6fb8353 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 80c72c7d3cad..ea309ad06175 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 029221615ba3..534b790c3753 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing * diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 79e326383726..e21aa147607b 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index 9dc0dd5c83df..317cafd302cf 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 27e165ac9302..9c3a34b65b15 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h index a17ab68bbce8..ec4a2a569750 100644 --- a/net/batman-adv/bat_v.h +++ b/net/batman-adv/bat_v.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing * diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index a83478c46597..28687493599f 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 5e39d0588a48..e8c7b7fd290d 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index ba59b77c605d..2948b41b06d4 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index 6a4c14ccc3c6..ed36c5e79fde 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index bdc1ef06e05b..a296a4d851f5 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index ca9d0753dd6b..48f683289531 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index fad47853ad3c..8ff81346ff0c 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index b27571abcd2f..71f95a3e4d3f 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 21d1189957a7..4229b01ac7b5 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 90a08d35c501..37b069698b04 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 9703c791ffc5..19b15de455ab 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 12897eb46268..e24aa9601c52 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 22dde42fd80e..d815acc13c35 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Martin Hundebøll * diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index 138b22a1836a..944512e07782 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: * * Martin Hundebøll * diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 37fe9a644f22..c294f6fd43e0 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 981f58421a32..f0b86fcb2493 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index b3e156af2256..936c107f3199 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index afebd9c7edf4..80afb2793687 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 5f186bff284a..fd4a263dd6b7 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index de5e9a374ece..d1c0f6189301 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 04d964358c98..7b49e4001778 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 4ce1b6d3ad5c..9490a7ca2ba6 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index e91f29c7c638..7d5e9abb7a65 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 84cddd01eeab..958be22beda9 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index dc9fa37ddd14..52d8a4b848c0 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 35e02b2b9e72..35f4f397ed57 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d31c8266e244..69c0d85bceb3 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 69bfedfad174..d5d65999207e 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index cbdeb47ec3f6..6eaffe50335a 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2018 B.A.T.M.A.N. contributors: * * Linus Lüssing * diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 3ac06337ab71..6b8594e23da3 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2018 B.A.T.M.A.N. contributors: * * Linus Lüssing * diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index a823d3899bad..129af56b944d 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2018 B.A.T.M.A.N. contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 0e7e57b69b54..571d9a5ae7aa 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2016-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2018 B.A.T.M.A.N. contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index b48116bb24ef..c3578444f3cb 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index adaeafa4f71e..65c346812bc1 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 58a7d9274435..2a51a0cbb82a 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 8e543a3cdc6c..f3601ab0872e 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index b6891e8b741c..289df027ecdd 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index a1289bc5f115..db54c2d9b8bf 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 2a5ab6f1076d..4a35f5c2f52b 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 1e8c79093623..64cce07b8fe6 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 900c5ce21cd4..c95e2b2677fd 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 075c5b5b2ce1..daf87f07fadd 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index c1578fa0b952..f2eef43bd2ec 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index bbeee61221fa..c1e3fb69952d 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 8b576712d0c1..11520de96ccb 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli * diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index c8b8f2cb2c2b..68e600974759 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli * diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 7550a9ccd695..0225616d5771 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 8d9e3abec2c8..01b6c8eafaf9 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 5ffcb45ac6ff..a637458205d1 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index a74df33f446d..ef5867f49824 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index bb1578410e0c..4a3b8837e1b5 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2017 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * -- cgit v1.2.3-70-g09d2 From 08009a760213cf6125af9453a51203f4ae108ba1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 24 Feb 2018 21:20:33 +0300 Subject: net: make kmem caches as __ro_after_init All kmem caches aren't reallocated once set up. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/core/net_namespace.c | 2 +- net/core/skbuff.c | 4 ++-- net/ipv4/fib_trie.c | 5 +++-- net/ipv4/inetpeer.c | 3 ++- net/ipv4/ipmr.c | 3 ++- net/socket.c | 2 +- 6 files changed, 11 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 27a55236ad64..690e78c6af45 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -362,7 +362,7 @@ static void dec_net_namespaces(struct ucounts *ucounts) dec_ucount(ucounts, UCOUNT_NET_NAMESPACES); } -static struct kmem_cache *net_cachep; +static struct kmem_cache *net_cachep __ro_after_init; static struct workqueue_struct *netns_wq; static struct net *net_alloc(void) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1a7485a2cdfa..96d36b81a3a5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -77,8 +77,8 @@ #include #include -struct kmem_cache *skbuff_head_cache __read_mostly; -static struct kmem_cache *skbuff_fclone_cache __read_mostly; +struct kmem_cache *skbuff_head_cache __ro_after_init; +static struct kmem_cache *skbuff_fclone_cache __ro_after_init; int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5530cd6fdbc7..62243a8abf92 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -50,6 +50,7 @@ #define VERSION "0.409" +#include #include #include #include @@ -191,8 +192,8 @@ static size_t tnode_free_size; */ static const int sync_pages = 128; -static struct kmem_cache *fn_alias_kmem __read_mostly; -static struct kmem_cache *trie_leaf_kmem __read_mostly; +static struct kmem_cache *fn_alias_kmem __ro_after_init; +static struct kmem_cache *trie_leaf_kmem __ro_after_init; static inline struct tnode *tn_info(struct key_vector *kv) { diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 914d56928578..1f04bd91fc2e 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -6,6 +6,7 @@ * Authors: Andrey V. Savochkin */ +#include #include #include #include @@ -51,7 +52,7 @@ * daddr: unchangeable */ -static struct kmem_cache *peer_cachep __read_mostly; +static struct kmem_cache *peer_cachep __ro_after_init; void inet_peer_base_init(struct inet_peer_base *bp) { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 7c7ac9d32e77..591d1fc80a1f 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -96,7 +97,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock); * In this case data path is free of exclusive locks at all. */ -static struct kmem_cache *mrt_cachep __read_mostly; +static struct kmem_cache *mrt_cachep __ro_after_init; static struct mr_table *ipmr_new_table(struct net *net, u32 id); static void ipmr_free_table(struct mr_table *mrt); diff --git a/net/socket.c b/net/socket.c index ab58e57c09ca..645d32b4872c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -233,7 +233,7 @@ static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen, return __put_user(klen, ulen); } -static struct kmem_cache *sock_inode_cachep __read_mostly; +static struct kmem_cache *sock_inode_cachep __ro_after_init; static struct inode *sock_alloc_inode(struct super_block *sb) { -- cgit v1.2.3-70-g09d2 From f8c3d0dda4b09e05ad8781764cbe153815c1bf23 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 24 Feb 2018 21:21:38 +0300 Subject: xfrm: mark kmem_caches as __ro_after_init Kmem caches aren't relocated once set up. Signed-off-by: Alexey Dobriyan Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 3 ++- net/xfrm/xfrm_policy.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 1472c0857975..44fc54dc013c 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -31,7 +32,7 @@ struct xfrm_trans_cb { #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0])) -static struct kmem_cache *secpath_cachep __read_mostly; +static struct kmem_cache *secpath_cachep __ro_after_init; static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1]; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7a23078132cf..12bd415d349e 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -51,7 +51,7 @@ static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; -static struct kmem_cache *xfrm_dst_cache __read_mostly; +static struct kmem_cache *xfrm_dst_cache __ro_after_init; static __read_mostly seqcount_t xfrm_policy_hash_generation; static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); -- cgit v1.2.3-70-g09d2 From 5211fcfb8110f77ff9f389e476563345817f61a5 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 26 Feb 2018 14:28:19 -0800 Subject: esp: check the NETIF_F_HW_ESP_TX_CSUM bit before segmenting If I understand correctly, we should not be asking for a checksum offload on an ipsec packet if the netdev isn't advertising NETIF_F_HW_ESP_TX_CSUM. In that case, we should clear the NETIF_F_CSUM_MASK bits. Signed-off-by: Shannon Nelson Signed-off-by: Steffen Klassert --- net/ipv4/esp4_offload.c | 2 ++ net/ipv6/esp6_offload.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index da5635fc52c2..7cf755ef9efb 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -138,6 +138,8 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || (x->xso.dev != skb->dev)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); + else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) + esp_features = features & ~NETIF_F_CSUM_MASK; xo->flags |= XFRM_GSO_SEGMENT; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 3fd1ec775dc2..27f59b61f70f 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -165,6 +165,8 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || (x->xso.dev != skb->dev)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); + else if (!(features & NETIF_F_HW_ESP_TX_CSUM)) + esp_features = features & ~NETIF_F_CSUM_MASK; xo->flags |= XFRM_GSO_SEGMENT; -- cgit v1.2.3-70-g09d2 From 59cae5b9d6d39db2804ed5f0330eb5507d40c34c Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 23 Feb 2018 10:06:04 +0100 Subject: mac80211: support AP 4-addr mode fast-rx Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3dc162ddc3a6..89dcf9f762ce 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3783,6 +3783,15 @@ void ieee80211_check_fast_rx(struct sta_info *sta) !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) && (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta); + + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && + sdata->u.vlan.sta) { + fastrx.expected_ds_bits |= + cpu_to_le16(IEEE80211_FCTL_FROMDS); + fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr4); + fastrx.internal_forward = 0; + } + break; default: goto clear; -- cgit v1.2.3-70-g09d2 From 1d870162418a826905161d2276c912986d3b9d9a Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 23 Feb 2018 10:06:05 +0100 Subject: mac80211: support fast-rx with incompatible PS capabilities when PS is disabled When powersave is disabled for the interface, we can do fast-rx anyway. Signed-off-by: Felix Fietkau [fixed indentation on one line] Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 1 + net/mac80211/rx.c | 17 +++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f4195a0f0279..fd68f6fb02d7 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2685,6 +2685,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); + ieee80211_check_fast_rx_iface(sdata); return 0; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 89dcf9f762ce..1d417960d376 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3750,12 +3750,7 @@ void ieee80211_check_fast_rx(struct sta_info *sta) /* 4-addr is harder to deal with, later maybe */ if (sdata->u.mgd.use_4addr) goto clear; - /* software powersave is a huge mess, avoid all of it */ - if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) - goto clear; - if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) && - !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) - goto clear; + if (sta->sta.tdls) { fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1); fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2); @@ -3767,6 +3762,16 @@ void ieee80211_check_fast_rx(struct sta_info *sta) fastrx.expected_ds_bits = cpu_to_le16(IEEE80211_FCTL_FROMDS); } + + if (!sdata->u.mgd.powersave) + break; + + /* software powersave is a huge mess, avoid all of it */ + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) + goto clear; + if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) && + !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) + goto clear; break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_AP: -- cgit v1.2.3-70-g09d2 From 9251a4736ee360d3850644ddaaa1a61dcec96238 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 23 Feb 2018 13:55:50 +0100 Subject: mac80211: support station 4-addr mode fast-rx Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 1d417960d376..2783c5cd7de7 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3747,10 +3747,6 @@ void ieee80211_check_fast_rx(struct sta_info *sta) switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: - /* 4-addr is harder to deal with, later maybe */ - if (sdata->u.mgd.use_4addr) - goto clear; - if (sta->sta.tdls) { fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1); fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2); @@ -3763,6 +3759,13 @@ void ieee80211_check_fast_rx(struct sta_info *sta) cpu_to_le16(IEEE80211_FCTL_FROMDS); } + if (sdata->u.mgd.use_4addr && !sta->sta.tdls) { + fastrx.expected_ds_bits |= + cpu_to_le16(IEEE80211_FCTL_TODS); + fastrx.da_offs = offsetof(struct ieee80211_hdr, addr3); + fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr4); + } + if (!sdata->u.mgd.powersave) break; -- cgit v1.2.3-70-g09d2 From 21b7022f13fb038b3e204a892c7cc42749754f7f Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 19 Feb 2018 14:48:44 +0200 Subject: mac80211: agg-rx: Accept ADDBA request update if timeout did not change As there is no support for updating an existing ADDBA session with a peer, we decline the request (while keeping the session active). However, in case that the timeout did not change, there is no need to decline the request, so modify the code to reply with status success in such a case (this is useful for interoperability with APs that send an ADDBA request update without changing the timeout value). Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 1f3188d03840..e83c19d4c292 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -298,13 +298,23 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) { if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) { + struct tid_ampdu_rx *tid_rx; + ht_dbg_ratelimited(sta->sdata, "updated AddBA Req from %pM on tid %u\n", sta->sta.addr, tid); /* We have no API to update the timeout value in the - * driver so reject the timeout update. + * driver so reject the timeout update if the timeout + * changed. If if did not change, i.e., no real update, + * just reply with success. */ - status = WLAN_STATUS_REQUEST_DECLINED; + rcu_read_lock(); + tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); + if (tid_rx && tid_rx->timeout == timeout) + status = WLAN_STATUS_SUCCESS; + else + status = WLAN_STATUS_REQUEST_DECLINED; + rcu_read_unlock(); goto end; } -- cgit v1.2.3-70-g09d2 From 84d0a394808621619a4b319c6eed16e3b389e214 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 5 Jan 2018 09:54:32 +0100 Subject: batman-adv: Fix indentation of batadv_seq_before Also multiline macros should have their statements start on a tabstop. This was detected by checkpatch.pl after commit a134f8de9f40 ("checkpatch: improve the TABSTOP test to include declarations"). Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index d5d65999207e..057a28a9fe88 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -331,11 +331,13 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, * * Return: true when x is a predecessor of y, false otherwise */ -#define batadv_seq_before(x, y) ({typeof(x)_d1 = (x); \ - typeof(y)_d2 = (y); \ - typeof(x)_dummy = (_d1 - _d2); \ - (void)(&_d1 == &_d2); \ - _dummy > batadv_smallest_signed_int(_dummy); }) +#define batadv_seq_before(x, y) ({ \ + typeof(x)_d1 = (x); \ + typeof(y)_d2 = (y); \ + typeof(x)_dummy = (_d1 - _d2); \ + (void)(&_d1 == &_d2); \ + _dummy > batadv_smallest_signed_int(_dummy); \ +}) /** * batadv_seq_after() - Checks if a sequence number x is a successor of y -- cgit v1.2.3-70-g09d2 From d7625f9f72dc148b4f25d9bc5014b710e1024b15 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 20 Feb 2018 12:08:10 +0100 Subject: batman-adv: Avoid relation operator comparison with bool commit 785ea1144182 ("batman-adv: Distributed ARP Table - create DHT helper functions") introduced a return check of batadv_compare_eth which uses a boolean return value since commit 16af73458aca ("batman-adv: main, batadv_compare_eth return bool"). A relational (<, >, <= or >=) operator is not the right one for such a check. Reported-by: David Binderman Signed-off-by: Sven Eckelmann Acked-by: Antonio Quartulli Signed-off-by: Simon Wunderlich --- net/batman-adv/distributed-arp-table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 19b15de455ab..4469dcc1558f 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -495,7 +495,7 @@ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res, * the one with the lowest address */ if (tmp_max == max && max_orig_node && - batadv_compare_eth(candidate->orig, max_orig_node->orig) > 0) + batadv_compare_eth(candidate->orig, max_orig_node->orig)) goto out; ret = true; -- cgit v1.2.3-70-g09d2 From 24bba078eca099b5bd25e17e97b485f013589f8c Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 27 Feb 2018 13:03:07 +0100 Subject: mac80211: support A-MSDU in fast-rx Only works if the IV was stripped from packets. Create a smaller variant of ieee80211_rx_h_amsdu, which bypasses checks already done within the fast-rx context. In order to do so, update cfg80211's ieee80211_data_to_8023_exthdr() to take the offset between header and snap. Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 ++- net/mac80211/rx.c | 124 +++++++++++++++++++++++++++++-------------------- net/wireless/util.c | 5 +- 3 files changed, 80 insertions(+), 55 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 56e905cd4b07..fc40843baed3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4410,10 +4410,12 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr); * of it being pushed into the SKB * @addr: the device MAC address * @iftype: the virtual interface type + * @data_offset: offset of payload after the 802.11 header * Return: 0 on success. Non-zero on error. */ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, - const u8 *addr, enum nl80211_iftype iftype); + const u8 *addr, enum nl80211_iftype iftype, + u8 data_offset); /** * ieee80211_data_to_8023 - convert an 802.11 data frame to 802.3 @@ -4425,7 +4427,7 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, static inline int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, enum nl80211_iftype iftype) { - return ieee80211_data_to_8023_exthdr(skb, NULL, addr, iftype); + return ieee80211_data_to_8023_exthdr(skb, NULL, addr, iftype, 0); } /** diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 2783c5cd7de7..de7d10732fd5 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2353,39 +2353,17 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) } static ieee80211_rx_result debug_noinline -ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) +__ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx, u8 data_offset) { struct net_device *dev = rx->sdata->dev; struct sk_buff *skb = rx->skb; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; __le16 fc = hdr->frame_control; struct sk_buff_head frame_list; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); struct ethhdr ethhdr; const u8 *check_da = ethhdr.h_dest, *check_sa = ethhdr.h_source; - if (unlikely(!ieee80211_is_data(fc))) - return RX_CONTINUE; - - if (unlikely(!ieee80211_is_data_present(fc))) - return RX_DROP_MONITOR; - - if (!(status->rx_flags & IEEE80211_RX_AMSDU)) - return RX_CONTINUE; - if (unlikely(ieee80211_has_a4(hdr->frame_control))) { - switch (rx->sdata->vif.type) { - case NL80211_IFTYPE_AP_VLAN: - if (!rx->sdata->u.vlan.sta) - return RX_DROP_UNUSABLE; - break; - case NL80211_IFTYPE_STATION: - if (!rx->sdata->u.mgd.use_4addr) - return RX_DROP_UNUSABLE; - break; - default: - return RX_DROP_UNUSABLE; - } check_da = NULL; check_sa = NULL; } else switch (rx->sdata->vif.type) { @@ -2405,15 +2383,13 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) break; } - if (is_multicast_ether_addr(hdr->addr1)) - return RX_DROP_UNUSABLE; - skb->dev = dev; __skb_queue_head_init(&frame_list); if (ieee80211_data_to_8023_exthdr(skb, ðhdr, rx->sdata->vif.addr, - rx->sdata->vif.type)) + rx->sdata->vif.type, + data_offset)) return RX_DROP_UNUSABLE; ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr, @@ -2435,6 +2411,44 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) return RX_QUEUED; } +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) +{ + struct sk_buff *skb = rx->skb; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + __le16 fc = hdr->frame_control; + + if (!(status->rx_flags & IEEE80211_RX_AMSDU)) + return RX_CONTINUE; + + if (unlikely(!ieee80211_is_data(fc))) + return RX_CONTINUE; + + if (unlikely(!ieee80211_is_data_present(fc))) + return RX_DROP_MONITOR; + + if (unlikely(ieee80211_has_a4(hdr->frame_control))) { + switch (rx->sdata->vif.type) { + case NL80211_IFTYPE_AP_VLAN: + if (!rx->sdata->u.vlan.sta) + return RX_DROP_UNUSABLE; + break; + case NL80211_IFTYPE_STATION: + if (!rx->sdata->u.mgd.use_4addr) + return RX_DROP_UNUSABLE; + break; + default: + return RX_DROP_UNUSABLE; + } + } + + if (is_multicast_ether_addr(hdr->addr1)) + return RX_DROP_UNUSABLE; + + return __ieee80211_rx_h_amsdu(rx, 0); +} + #ifdef CONFIG_MAC80211_MESH static ieee80211_rx_result ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) @@ -3898,7 +3912,8 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct sta_info *sta = rx->sta; int orig_len = skb->len; - int snap_offs = ieee80211_hdrlen(hdr->frame_control); + int hdrlen = ieee80211_hdrlen(hdr->frame_control); + int snap_offs = hdrlen; struct { u8 snap[sizeof(rfc1042_header)]; __be16 proto; @@ -3929,10 +3944,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, (status->flag & FAST_RX_CRYPT_FLAGS) != FAST_RX_CRYPT_FLAGS) return false; - /* we don't deal with A-MSDU deaggregation here */ - if (status->rx_flags & IEEE80211_RX_AMSDU) - return false; - if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return false; @@ -3964,21 +3975,24 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, snap_offs += IEEE80211_CCMP_HDR_LEN; } - if (!pskb_may_pull(skb, snap_offs + sizeof(*payload))) - goto drop; - payload = (void *)(skb->data + snap_offs); + if (!(status->rx_flags & IEEE80211_RX_AMSDU)) { + if (!pskb_may_pull(skb, snap_offs + sizeof(*payload))) + goto drop; - if (!ether_addr_equal(payload->snap, fast_rx->rfc1042_hdr)) - return false; + payload = (void *)(skb->data + snap_offs); - /* Don't handle these here since they require special code. - * Accept AARP and IPX even though they should come with a - * bridge-tunnel header - but if we get them this way then - * there's little point in discarding them. - */ - if (unlikely(payload->proto == cpu_to_be16(ETH_P_TDLS) || - payload->proto == fast_rx->control_port_protocol)) - return false; + if (!ether_addr_equal(payload->snap, fast_rx->rfc1042_hdr)) + return false; + + /* Don't handle these here since they require special code. + * Accept AARP and IPX even though they should come with a + * bridge-tunnel header - but if we get them this way then + * there's little point in discarding them. + */ + if (unlikely(payload->proto == cpu_to_be16(ETH_P_TDLS) || + payload->proto == fast_rx->control_port_protocol)) + return false; + } /* after this point, don't punt to the slowpath! */ @@ -3992,12 +4006,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, } /* statistics part of ieee80211_rx_h_sta_process() */ - stats->last_rx = jiffies; - stats->last_rate = sta_stats_encode_rate(status); - - stats->fragments++; - stats->packets++; - if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { stats->last_signal = status->signal; if (!fast_rx->uses_rss) @@ -4026,6 +4034,20 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, if (rx->key && !ieee80211_has_protected(hdr->frame_control)) goto drop; + if (status->rx_flags & IEEE80211_RX_AMSDU) { + if (__ieee80211_rx_h_amsdu(rx, snap_offs - hdrlen) != + RX_QUEUED) + goto drop; + + return true; + } + + stats->last_rx = jiffies; + stats->last_rate = sta_stats_encode_rate(status); + + stats->fragments++; + stats->packets++; + /* do the header conversion - first grab the addresses */ ether_addr_copy(addrs.da, skb->data + fast_rx->da_offs); ether_addr_copy(addrs.sa, skb->data + fast_rx->sa_offs); diff --git a/net/wireless/util.c b/net/wireless/util.c index c69160694b6c..d112e9a89364 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -420,7 +420,8 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr) EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen); int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, - const u8 *addr, enum nl80211_iftype iftype) + const u8 *addr, enum nl80211_iftype iftype, + u8 data_offset) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct { @@ -434,7 +435,7 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return -1; - hdrlen = ieee80211_hdrlen(hdr->frame_control); + hdrlen = ieee80211_hdrlen(hdr->frame_control) + data_offset; if (skb->len < hdrlen + 8) return -1; -- cgit v1.2.3-70-g09d2 From c80afa026a7f6eb01f5431760cd894526153d4a8 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 15:59:19 +0300 Subject: net: Convert /proc creating and destroying pernet_operations These pernet_operations just create and destroy /proc entries, and they can safely marked as async: pppoe_net_ops vlan_net_ops canbcm_pernet_ops kcm_net_ops pfkey_net_ops pppol2tp_net_ops phonet_net_ops Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- drivers/net/ppp/pppoe.c | 1 + net/8021q/vlan.c | 1 + net/can/bcm.c | 1 + net/kcm/kcmproc.c | 1 + net/key/af_key.c | 1 + net/l2tp/l2tp_ppp.c | 1 + net/phonet/pn_dev.c | 1 + 7 files changed, 7 insertions(+) (limited to 'net') diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index bd89d1c559ce..c10e6181a2f0 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -1161,6 +1161,7 @@ static struct pernet_operations pppoe_net_ops = { .exit = pppoe_exit_net, .id = &pppoe_net_id, .size = sizeof(struct pppoe_net), + .async = true, }; static int __init pppoe_init(void) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index bad01b14a4ad..bd0ed39f65fb 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -729,6 +729,7 @@ static struct pernet_operations vlan_net_ops = { .exit = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct vlan_net), + .async = true, }; static int __init vlan_proto_init(void) diff --git a/net/can/bcm.c b/net/can/bcm.c index ac5e5e34fee3..26730d39e048 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1717,6 +1717,7 @@ static void canbcm_pernet_exit(struct net *net) static struct pernet_operations canbcm_pernet_ops __read_mostly = { .init = canbcm_pernet_init, .exit = canbcm_pernet_exit, + .async = true, }; static int __init bcm_module_init(void) diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c index 9d5649e4e8b7..2c1c8b3e4452 100644 --- a/net/kcm/kcmproc.c +++ b/net/kcm/kcmproc.c @@ -433,6 +433,7 @@ static void kcm_proc_exit_net(struct net *net) static struct pernet_operations kcm_net_ops = { .init = kcm_proc_init_net, .exit = kcm_proc_exit_net, + .async = true, }; int __init kcm_proc_init(void) diff --git a/net/key/af_key.c b/net/key/af_key.c index 7e2e7188e7f4..3ac08ab26207 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3863,6 +3863,7 @@ static struct pernet_operations pfkey_net_ops = { .exit = pfkey_net_exit, .id = &pfkey_net_id, .size = sizeof(struct netns_pfkey), + .async = true, }; static void __exit ipsec_pfkey_exit(void) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 99a03c72db4f..0c4f49a6a0cb 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1770,6 +1770,7 @@ static struct pernet_operations pppol2tp_net_ops = { .init = pppol2tp_init_net, .exit = pppol2tp_exit_net, .id = &pppol2tp_net_id, + .async = true, }; /***************************************************************************** diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 77787512fc32..9454e8393793 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -342,6 +342,7 @@ static struct pernet_operations phonet_net_ops = { .exit = phonet_exit_net, .id = &phonet_net_id, .size = sizeof(struct phonet_net), + .async = true, }; /* Initialize Phonet devices list */ -- cgit v1.2.3-70-g09d2 From 47d63a01797be8de142beeb0090704501701eafa Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 15:59:28 +0300 Subject: net: Convert hashlimit_net_ops and recent_net_ops These pernet_operations just create and destroy /proc entries. Also, new /proc entries also may come after new nf rules are added, but this is not possible, when net isn't alive. So, they are safe to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/xt_hashlimit.c | 1 + net/netfilter/xt_recent.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 66f5aca62a08..db2fe0911740 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -1345,6 +1345,7 @@ static struct pernet_operations hashlimit_net_ops = { .exit = hashlimit_net_exit, .id = &hashlimit_net_id, .size = sizeof(struct hashlimit_net), + .async = true, }; static int __init hashlimit_mt_init(void) diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 6d232d18faff..19efdb757944 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -687,6 +687,7 @@ static struct pernet_operations recent_net_ops = { .exit = recent_net_exit, .id = &recent_net_id, .size = sizeof(struct recent_net), + .async = true, }; static struct xt_match recent_mt_reg[] __read_mostly = { -- cgit v1.2.3-70-g09d2 From f0aad8e340eace8a13736277a0e8c040a879740c Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 15:59:37 +0300 Subject: net: Convert synproxy_net_ops These pernet_operations create and destroy /proc entries and allocate extents to template ct, which depend on global nf_ct_ext_types[] array. So, we are able to mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/nf_synproxy_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 92139a087260..64b875e452ca 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -398,6 +398,7 @@ static struct pernet_operations synproxy_net_ops = { .exit = synproxy_net_exit, .id = &synproxy_net_id, .size = sizeof(struct synproxy_net), + .async = true, }; static int __init synproxy_core_init(void) -- cgit v1.2.3-70-g09d2 From 02df428ca291f20285f04c25a7efd664a5d83551 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 15:59:56 +0300 Subject: net: Convert simple pernet_operations These pernet_operations make pretty simple actions like variable initialization on init, debug checks on exit, and so on, and they obviously are able to be executed in parallel with any others: vrf_net_ops lockd_net_ops grace_net_ops xfrm6_tunnel_net_ops kcm_net_ops tcf_net_ops Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- drivers/net/vrf.c | 1 + fs/lockd/svc.c | 1 + fs/nfs_common/grace.c | 1 + net/ipv6/xfrm6_tunnel.c | 1 + net/kcm/kcmsock.c | 1 + net/sched/cls_api.c | 1 + 6 files changed, 6 insertions(+) (limited to 'net') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 9ce0182223a0..e459e601c57f 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1434,6 +1434,7 @@ static struct pernet_operations vrf_net_ops __net_initdata = { .init = vrf_netns_init, .id = &vrf_net_id, .size = sizeof(bool), + .async = true, }; static int __init vrf_init_module(void) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 9c36d614bf89..2dee4e03ff1c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -709,6 +709,7 @@ static struct pernet_operations lockd_net_ops = { .exit = lockd_exit_net, .id = &lockd_net_id, .size = sizeof(struct lockd_net), + .async = true, }; diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 5be08f02a76b..8c743a405df6 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -118,6 +118,7 @@ static struct pernet_operations grace_net_ops = { .exit = grace_exit_net, .id = &grace_net_id, .size = sizeof(struct list_head), + .async = true, }; static int __init diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index f85f0d7480ac..a9673619e0e9 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -353,6 +353,7 @@ static struct pernet_operations xfrm6_tunnel_net_ops = { .exit = xfrm6_tunnel_net_exit, .id = &xfrm6_tunnel_net_id, .size = sizeof(struct xfrm6_tunnel_net), + .async = true, }; static int __init xfrm6_tunnel_init(void) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 435594648dac..a6cd0712e063 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -2015,6 +2015,7 @@ static struct pernet_operations kcm_net_ops = { .exit = kcm_exit_net, .id = &kcm_net_id, .size = sizeof(struct kcm_net), + .async = true, }; static int __init kcm_init(void) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 9d1a8bbf8152..19f9f421d5b7 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1618,6 +1618,7 @@ static struct pernet_operations tcf_net_ops = { .exit = tcf_net_exit, .id = &tcf_net_id, .size = sizeof(struct tcf_net), + .async = true, }; static int __init tc_filter_init(void) -- cgit v1.2.3-70-g09d2 From 5fcc85843d94e40ac1633d18d9651f69d9f16770 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 16:00:22 +0300 Subject: net: Convert sysctl creating and destroying pernet_operations These pernet_operations create and destroy sysctl tables, and they are able to be executed in parallel with any others: ip_vs_lblc_ops ip_vs_lblcr_ops Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_lblc.c | 1 + net/netfilter/ipvs/ip_vs_lblcr.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index d625179de485..6a340c94c4b8 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -604,6 +604,7 @@ static void __net_exit __ip_vs_lblc_exit(struct net *net) { } static struct pernet_operations ip_vs_lblc_ops = { .init = __ip_vs_lblc_init, .exit = __ip_vs_lblc_exit, + .async = true, }; static int __init ip_vs_lblc_init(void) diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 84c57b62a588..0627881128da 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -789,6 +789,7 @@ static void __net_exit __ip_vs_lblcr_exit(struct net *net) { } static struct pernet_operations ip_vs_lblcr_ops = { .init = __ip_vs_lblcr_init, .exit = __ip_vs_lblcr_exit, + .async = true, }; static int __init ip_vs_lblcr_init(void) -- cgit v1.2.3-70-g09d2 From 685ecfb19888963f61c6085c17c254dbf665e9da Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 16:00:31 +0300 Subject: net: Convert tc_action_net_init() and tc_action_net_exit() based pernet_operations These pernet_operations are from net/sched directory, and they call only tc_action_net_init() and tc_action_net_exit(): bpf_net_ops connmark_net_ops csum_net_ops gact_net_ops ife_net_ops ipt_net_ops xt_net_ops mirred_net_ops nat_net_ops pedit_net_ops police_net_ops sample_net_ops simp_net_ops skbedit_net_ops skbmod_net_ops tunnel_key_net_ops vlan_net_ops 1)tc_action_net_init() just allocates and initializes per-net memory. 2)There should not be in-flight packets at the time of tc_action_net_exit() call, or another pernet_operations send packets to dying net (except netlink). So, it seems they can be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/sched/act_bpf.c | 1 + net/sched/act_connmark.c | 1 + net/sched/act_csum.c | 1 + net/sched/act_gact.c | 1 + net/sched/act_ife.c | 1 + net/sched/act_ipt.c | 2 ++ net/sched/act_mirred.c | 1 + net/sched/act_nat.c | 1 + net/sched/act_pedit.c | 1 + net/sched/act_police.c | 1 + net/sched/act_sample.c | 1 + net/sched/act_simple.c | 1 + net/sched/act_skbedit.c | 1 + net/sched/act_skbmod.c | 1 + net/sched/act_tunnel_key.c | 1 + net/sched/act_vlan.c | 1 + 16 files changed, 17 insertions(+) (limited to 'net') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index cb3c5d403c88..da72e0cf2b1f 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -413,6 +413,7 @@ static struct pernet_operations bpf_net_ops = { .exit_batch = bpf_exit_net, .id = &bpf_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; static int __init bpf_init_module(void) diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e4b880fa51fe..371e5e4ab3e2 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -222,6 +222,7 @@ static struct pernet_operations connmark_net_ops = { .exit_batch = connmark_exit_net, .id = &connmark_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; static int __init connmark_init_module(void) diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index d5c2e528d150..1fb1f1f6a555 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -677,6 +677,7 @@ static struct pernet_operations csum_net_ops = { .exit_batch = csum_exit_net, .id = &csum_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; MODULE_DESCRIPTION("Checksum updating actions"); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index f072bcf33760..74563254e676 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -247,6 +247,7 @@ static struct pernet_operations gact_net_ops = { .exit_batch = gact_exit_net, .id = &gact_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index a5994cf0512b..555b1caeff72 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -870,6 +870,7 @@ static struct pernet_operations ife_net_ops = { .exit_batch = ife_exit_net, .id = &ife_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; static int __init ife_init_module(void) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 9784629090ad..10866717f88e 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -349,6 +349,7 @@ static struct pernet_operations ipt_net_ops = { .exit_batch = ipt_exit_net, .id = &ipt_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; static int tcf_xt_walker(struct net *net, struct sk_buff *skb, @@ -399,6 +400,7 @@ static struct pernet_operations xt_net_ops = { .exit_batch = xt_exit_net, .id = &xt_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2002-13)"); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index fd34015331ab..64c86579c3d9 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -353,6 +353,7 @@ static struct pernet_operations mirred_net_ops = { .exit_batch = mirred_exit_net, .id = &mirred_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2002)"); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 4b5848b6c252..b1bc757f6491 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -323,6 +323,7 @@ static struct pernet_operations nat_net_ops = { .exit_batch = nat_exit_net, .id = &nat_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; MODULE_DESCRIPTION("Stateless NAT actions"); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 094303c27c5e..5e8cc8f63acd 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -465,6 +465,7 @@ static struct pernet_operations pedit_net_ops = { .exit_batch = pedit_exit_net, .id = &pedit_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index ff55bd6c7db0..51fe4fe343f7 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -347,6 +347,7 @@ static struct pernet_operations police_net_ops = { .exit_batch = police_exit_net, .id = &police_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; static int __init police_init_module(void) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 9765145aaf40..238dfd27e995 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -248,6 +248,7 @@ static struct pernet_operations sample_net_ops = { .exit_batch = sample_exit_net, .id = &sample_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; static int __init sample_init_module(void) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 8244e221fe4f..91816d73f3f3 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -216,6 +216,7 @@ static struct pernet_operations simp_net_ops = { .exit_batch = simp_exit_net, .id = &simp_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2005)"); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index ddf69fc01bdf..7971510fe61b 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -253,6 +253,7 @@ static struct pernet_operations skbedit_net_ops = { .exit_batch = skbedit_exit_net, .id = &skbedit_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; MODULE_AUTHOR("Alexander Duyck, "); diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index a406f191cb84..febec75f4f7a 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -278,6 +278,7 @@ static struct pernet_operations skbmod_net_ops = { .exit_batch = skbmod_exit_net, .id = &skbmod_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim, "); diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 41ff9d0e5c62..9169b7e78ada 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -337,6 +337,7 @@ static struct pernet_operations tunnel_key_net_ops = { .exit_batch = tunnel_key_exit_net, .id = &tunnel_key_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; static int __init tunnel_key_init_module(void) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 71411a255f04..c2ee7fd51cc9 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -313,6 +313,7 @@ static struct pernet_operations vlan_net_ops = { .exit_batch = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct tc_action_net), + .async = true, }; static int __init vlan_init_module(void) -- cgit v1.2.3-70-g09d2 From 3cec5fb3476e07833fea9a1e2d5f6c629078b4ae Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 16:01:43 +0300 Subject: net: Convert br_net_ops These pernet_operations are similar to bond_net_ops. Exit method unregisters all net bridge devices, and it looks like another pernet_operations are not interested in foreign net bridge list. So, it's possible to mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/bridge/br.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index 6bf06e756df2..7770481a6506 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -188,6 +188,7 @@ static void __net_exit br_net_exit(struct net *net) static struct pernet_operations br_net_ops = { .exit = br_net_exit, + .async = true, }; static const struct stp_proto br_stp_proto = { -- cgit v1.2.3-70-g09d2 From 31502104b301c0dcacd5df8e92fba9dcf20dfccb Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 16:01:52 +0300 Subject: net: Convert ipgre_net_ops, ipgre_tap_net_ops, erspan_net_ops, vti_net_ops and ipip_net_ops These pernet_operations are similar to bond_net_ops. Exit methods unregisters all net ipgre/ipgre_tap/erspan/vti/ipip devices, and it looks like another pernet_operations are not interested in foreign net ipgre/ipgre_tap/erspan/vti/ipip list. Init method also does not intersect with something pernet-specific. So, it's possible to mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 3 +++ net/ipv4/ip_vti.c | 1 + net/ipv4/ipip.c | 1 + 3 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 45d97e9b2759..e496afa47709 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1044,6 +1044,7 @@ static struct pernet_operations ipgre_net_ops = { .exit_batch = ipgre_exit_batch_net, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), + .async = true, }; static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], @@ -1623,6 +1624,7 @@ static struct pernet_operations ipgre_tap_net_ops = { .exit_batch = ipgre_tap_exit_batch_net, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), + .async = true, }; static int __net_init erspan_init_net(struct net *net) @@ -1641,6 +1643,7 @@ static struct pernet_operations erspan_net_ops = { .exit_batch = erspan_exit_batch_net, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), + .async = true, }; static int __init ipgre_init(void) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 51b1669334fe..b10bf563afd9 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -454,6 +454,7 @@ static struct pernet_operations vti_net_ops = { .exit_batch = vti_exit_batch_net, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), + .async = true, }; static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index c891235b4966..9c5a4d164f09 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -669,6 +669,7 @@ static struct pernet_operations ipip_net_ops = { .exit_batch = ipip_exit_batch_net, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), + .async = true, }; static int __init ipip_init(void) -- cgit v1.2.3-70-g09d2 From 5c155c50244a0c6e1a7778ae7b4d2753e5e1f617 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 16:02:03 +0300 Subject: net: Convert ip6gre_net_ops These pernet_operations are similar to bond_net_ops. Exit method unregisters all net ip6gre devices, and it looks like another pernet_operations are not interested in foreign net ip6gre list or net_generic()->tunnels_wc. Init method registers net device. So, it's possible to mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3c353125546d..3026662a6413 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1517,6 +1517,7 @@ static struct pernet_operations ip6gre_net_ops = { .exit_batch = ip6gre_exit_batch_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), + .async = true, }; static int ip6gre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], -- cgit v1.2.3-70-g09d2 From 66997ba0834ac1b3f22873d349614405e48c69d7 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 16:02:11 +0300 Subject: net: Convert ip6_tnl_net_ops These pernet_operations are similar to ip6gre_net_ops. Exit method unregisters all net ip6_tnl tunnels, and it looks like another pernet_operations are not interested in foreign net tunnels list. So, it's possible to mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 4b15fe928278..869e2e6750f7 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2250,6 +2250,7 @@ static struct pernet_operations ip6_tnl_net_ops = { .exit_batch = ip6_tnl_exit_batch_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), + .async = true, }; /** -- cgit v1.2.3-70-g09d2 From 5ecc29550add41a0f077b07840501d7a3abfc9db Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 16:02:19 +0300 Subject: net: Convert vti6_net_ops These pernet_operations are similar to ip6_tnl_net_ops. Exit method unregisters all net vti6 tunnels, and it looks like another pernet_operations are not interested in foreign net vti6 list. Init method registers netdevice. So, it's possible to mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/ip6_vti.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index fa3ae1cb50d3..c617ea17faa8 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -1148,6 +1148,7 @@ static struct pernet_operations vti6_net_ops = { .exit_batch = vti6_exit_batch_net, .id = &vti6_net_id, .size = sizeof(struct vti6_net), + .async = true, }; static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { -- cgit v1.2.3-70-g09d2 From 989d9812b7cabbda2e82f47e52dbc48ed4e40ce5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 16:02:27 +0300 Subject: net: Convert sit_net_ops These pernet_operations are similar to ip6_tnl_net_ops. Exit method unregisters all net sit devices, and it looks like another pernet_operations are not interested in foreign net sit list. Init method registers netdevice. So, it's possible to mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/sit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 3a1775a62973..182db078f01e 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1878,6 +1878,7 @@ static struct pernet_operations sit_net_ops = { .exit_batch = sit_exit_batch_net, .id = &sit_net_id, .size = sizeof(struct sit_net), + .async = true, }; static void __exit sit_cleanup(void) -- cgit v1.2.3-70-g09d2 From f17c9bf07f9cde430e5c566a5383875ffba104cf Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 16:02:37 +0300 Subject: net: Convert cfg802154_pernet_ops These pernet_operations have only exit method, which moves devices from cfg802154_rdev_list to init_net. This may occur in any time from nl802154_wpan_phy_netns(), so we are nice with rtnl_lock() synchronization. Signed-off-by: Kirill Tkhai Acked-by: Stefan Schmidt Signed-off-by: David S. Miller --- net/ieee802154/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index cb7176cd4cd6..9104943c15ba 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -345,6 +345,7 @@ static void __net_exit cfg802154_pernet_exit(struct net *net) static struct pernet_operations cfg802154_pernet_ops = { .exit = cfg802154_pernet_exit, + .async = true, }; static int __init wpan_phy_class_init(void) -- cgit v1.2.3-70-g09d2 From 7ca9e67febb1441baa0481fbd46745288338afe6 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 16:02:56 +0300 Subject: net: Convert brnf_net_ops These pernet_operations only unregister nf hooks. So, they are able to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/bridge/br_netfilter_hooks.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 27f1d4f2114a..484f54150525 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -967,6 +967,7 @@ static struct pernet_operations brnf_net_ops __read_mostly = { .exit = brnf_exit_net, .id = &brnf_net_id, .size = sizeof(struct brnf_net), + .async = true, }; static struct notifier_block brnf_notifier __read_mostly = { -- cgit v1.2.3-70-g09d2 From f95978b7ad0972610b14c65f13f69b3093ff9101 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 16:03:05 +0300 Subject: net: Convert clusterip_net_ops These pernet_operations register and unregister nf hooks, and populate and destroy /proc entry. So, they are able to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 4b02ab39ebc5..08b3e48f44fc 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -840,6 +840,7 @@ static struct pernet_operations clusterip_net_ops = { .exit = clusterip_net_exit, .id = &clusterip_net_id, .size = sizeof(struct clusterip_net), + .async = true, }; static int __init clusterip_tg_init(void) -- cgit v1.2.3-70-g09d2 From e5b2ae93b523304461b2e4ddb59be47f4d4ed1b0 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 16:03:15 +0300 Subject: net: Convert defrag4_net_ops These pernet_operations only unregister nf hooks. So, they are able to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_defrag_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index a0d3ad60a411..57244b62a4fc 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -118,6 +118,7 @@ static void __net_exit defrag4_net_exit(struct net *net) static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, + .async = true, }; static int __init nf_defrag_init(void) -- cgit v1.2.3-70-g09d2 From afd7b3eb13463693250e82d19f8605312bb5c04d Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 16:03:36 +0300 Subject: net: Convert ila_net_ops These pernet_operations register and unregister nf hooks. Also they populate and depopulate ila_net_id-pointed hash table. The table is changed by hooks during skb processing and via netlink request. It looks impossible for another net pernet_operations to force the table reading or writing, so, they are able to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/ila/ila_xlat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 44c39c5f0638..e438699f000f 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -613,6 +613,7 @@ static struct pernet_operations ila_net_ops = { .exit = ila_exit_net, .id = &ila_net_id, .size = sizeof(struct ila_net), + .async = true, }; static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) -- cgit v1.2.3-70-g09d2 From 9532ce17f7d59d1129165949076491c06f89d1b0 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Feb 2018 16:03:45 +0300 Subject: net: Convert defrag6_net_ops These pernet_operations only unregister nf hooks. So, they are able to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index c87b48359e8f..32f98bc06900 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -103,6 +103,7 @@ static void __net_exit defrag6_net_exit(struct net *net) static struct pernet_operations defrag6_net_ops = { .exit = defrag6_net_exit, + .async = true, }; static int __init nf_defrag_init(void) -- cgit v1.2.3-70-g09d2 From 401910db4cd425899832a093539222b6174f92a2 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Tue, 27 Feb 2018 09:52:43 -0800 Subject: rds: deliver zerocopy completion notification with data This commit is an optimization over commit 01883eda72bd ("rds: support for zcopy completion notification") for PF_RDS sockets. RDS applications are predominantly request-response transactions, so it is more efficient to reduce the number of system calls and have zerocopy completion notification delivered as ancillary data on the POLLIN channel. Cookies are passed up as ancillary data (at level SOL_RDS) in a struct rds_zcopy_cookies when the returned value of recvmsg() is greater than, or equal to, 0. A max of RDS_MAX_ZCOOKIES may be passed with each message. This commit removes support for zerocopy completion notification on MSG_ERRQUEUE for PF_RDS sockets. Signed-off-by: Sowmini Varadhan Acked-by: Willem de Bruijn Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- include/uapi/linux/errqueue.h | 2 -- include/uapi/linux/rds.h | 7 +++++++ net/rds/af_rds.c | 7 +++++-- net/rds/message.c | 38 ++++++++++++++++---------------------- net/rds/rds.h | 2 ++ net/rds/recv.c | 31 ++++++++++++++++++++++++++++++- 6 files changed, 60 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index 28812eda4209..dc64cfaf13da 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -20,13 +20,11 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 #define SO_EE_ORIGIN_ZEROCOPY 5 -#define SO_EE_ORIGIN_ZCOOKIE 6 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) #define SO_EE_CODE_ZEROCOPY_COPIED 1 -#define SO_EE_ORIGIN_MAX_ZCOOKIES 8 /** * struct scm_timestamping - timestamps exposed through cmsg diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index 12e3bca32cad..a66b213de3d7 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -104,6 +104,7 @@ #define RDS_CMSG_MASKED_ATOMIC_CSWP 9 #define RDS_CMSG_RXPATH_LATENCY 11 #define RDS_CMSG_ZCOPY_COOKIE 12 +#define RDS_CMSG_ZCOPY_COMPLETION 13 #define RDS_INFO_FIRST 10000 #define RDS_INFO_COUNTERS 10000 @@ -317,6 +318,12 @@ struct rds_rdma_notify { #define RDS_RDMA_DROPPED 3 #define RDS_RDMA_OTHER_ERROR 4 +#define RDS_MAX_ZCOOKIES 8 +struct rds_zcopy_cookies { + __u32 num; + __u32 cookies[RDS_MAX_ZCOOKIES]; +}; + /* * Common set of flags for all RDMA related structs */ diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index a937f18896ae..f7126108a811 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -77,6 +77,7 @@ static int rds_release(struct socket *sock) rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); + __skb_queue_purge(&rs->rs_zcookie_queue); spin_lock_bh(&rds_sock_lock); list_del_init(&rs->rs_item); @@ -144,7 +145,7 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr, * - to signal that a previously congested destination may have become * uncongested * - A notification has been queued to the socket (this can be a congestion - * update, or a RDMA completion). + * update, or a RDMA completion, or a MSG_ZEROCOPY completion). * * EPOLLOUT is asserted if there is room on the send queue. This does not mean * however, that the next sendmsg() call will succeed. If the application tries @@ -178,7 +179,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, spin_unlock(&rs->rs_lock); } if (!list_empty(&rs->rs_recv_queue) || - !list_empty(&rs->rs_notify_queue)) + !list_empty(&rs->rs_notify_queue) || + !skb_queue_empty(&rs->rs_zcookie_queue)) mask |= (EPOLLIN | EPOLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); @@ -513,6 +515,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) INIT_LIST_HEAD(&rs->rs_recv_queue); INIT_LIST_HEAD(&rs->rs_notify_queue); INIT_LIST_HEAD(&rs->rs_cong_list); + skb_queue_head_init(&rs->rs_zcookie_queue); spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; rs->rs_rx_traces = 0; diff --git a/net/rds/message.c b/net/rds/message.c index 651834513481..116cf87ccb89 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -58,32 +58,26 @@ EXPORT_SYMBOL_GPL(rds_message_addref); static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie) { - struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); - int ncookies; - u32 *ptr; + struct rds_zcopy_cookies *ck = (struct rds_zcopy_cookies *)skb->cb; + int ncookies = ck->num; - if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE) + if (ncookies == RDS_MAX_ZCOOKIES) return false; - ncookies = serr->ee.ee_data; - if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES) - return false; - ptr = skb_put(skb, sizeof(u32)); - *ptr = cookie; - serr->ee.ee_data = ++ncookies; + ck->cookies[ncookies] = cookie; + ck->num = ++ncookies; return true; } static void rds_rm_zerocopy_callback(struct rds_sock *rs, struct rds_znotifier *znotif) { - struct sock *sk = rds_rs_to_sk(rs); struct sk_buff *skb, *tail; - struct sock_exterr_skb *serr; unsigned long flags; struct sk_buff_head *q; u32 cookie = znotif->z_cookie; + struct rds_zcopy_cookies *ck; - q = &sk->sk_error_queue; + q = &rs->rs_zcookie_queue; spin_lock_irqsave(&q->lock, flags); tail = skb_peek_tail(q); @@ -91,22 +85,19 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, spin_unlock_irqrestore(&q->lock, flags); mm_unaccount_pinned_pages(&znotif->z_mmp); consume_skb(rds_skb_from_znotifier(znotif)); - sk->sk_error_report(sk); + /* caller invokes rds_wake_sk_sleep() */ return; } skb = rds_skb_from_znotifier(znotif); - serr = SKB_EXT_ERR(skb); - memset(&serr->ee, 0, sizeof(serr->ee)); - serr->ee.ee_errno = 0; - serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE; - serr->ee.ee_info = 0; + ck = (struct rds_zcopy_cookies *)skb->cb; + memset(ck, 0, sizeof(*ck)); WARN_ON(!skb_zcookie_add(skb, cookie)); __skb_queue_tail(q, skb); spin_unlock_irqrestore(&q->lock, flags); - sk->sk_error_report(sk); + /* caller invokes rds_wake_sk_sleep() */ mm_unaccount_pinned_pages(&znotif->z_mmp); } @@ -129,6 +120,7 @@ static void rds_message_purge(struct rds_message *rm) if (rm->data.op_mmp_znotifier) { zcopy = true; rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier); + rds_wake_sk_sleep(rs); rm->data.op_mmp_znotifier = NULL; } sock_put(rds_rs_to_sk(rs)); @@ -362,10 +354,12 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, int total_copied = 0; struct sk_buff *skb; - skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32), - GFP_KERNEL); + skb = alloc_skb(0, GFP_KERNEL); if (!skb) return -ENOMEM; + BUILD_BUG_ON(sizeof(skb->cb) < + max_t(int, sizeof(struct rds_znotifier), + sizeof(struct rds_zcopy_cookies))); rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb); if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp, length)) { diff --git a/net/rds/rds.h b/net/rds/rds.h index 31cd38852050..33b16353d8f3 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -603,6 +603,8 @@ struct rds_sock { /* Socket receive path trace points*/ u8 rs_rx_traces; u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; + + struct sk_buff_head rs_zcookie_queue; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) diff --git a/net/rds/recv.c b/net/rds/recv.c index b080961464df..d50747725221 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -577,6 +577,32 @@ out: return ret; } +static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg) +{ + struct sk_buff *skb; + struct sk_buff_head *q = &rs->rs_zcookie_queue; + struct rds_zcopy_cookies *done; + + if (!msg->msg_control) + return false; + + if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) || + msg->msg_controllen < CMSG_SPACE(sizeof(*done))) + return false; + + skb = skb_dequeue(q); + if (!skb) + return false; + done = (struct rds_zcopy_cookies *)skb->cb; + if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done), + done)) { + skb_queue_head(q, skb); + return false; + } + consume_skb(skb); + return true; +} + int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags) { @@ -611,7 +637,9 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (!rds_next_incoming(rs, &inc)) { if (nonblock) { - ret = -EAGAIN; + bool reaped = rds_recvmsg_zcookie(rs, msg); + + ret = reaped ? 0 : -EAGAIN; break; } @@ -660,6 +688,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, ret = -EFAULT; goto out; } + rds_recvmsg_zcookie(rs, msg); rds_stats_inc(s_recv_delivered); -- cgit v1.2.3-70-g09d2 From d1b2a6c4bed99fc7e8a11e7abcff19293d1974f5 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 27 Feb 2018 14:53:37 +0100 Subject: net: GRE: Add is_gretap_dev, is_ip6gretap_dev Determining whether a device is a GRE device is easily done by inspecting struct net_device.type. However, for the tap variants, the type is just ARPHRD_ETHER. Therefore introduce two predicate functions that use netdev_ops to tell the tap devices. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/gre.h | 3 +++ net/ipv4/ip_gre.c | 6 ++++++ net/ipv6/ip6_gre.c | 6 ++++++ 3 files changed, 15 insertions(+) (limited to 'net') diff --git a/include/net/gre.h b/include/net/gre.h index f90585decbce..797142eee9cd 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -37,6 +37,9 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err, __be16 proto, int nhs); +bool is_gretap_dev(const struct net_device *dev); +bool is_ip6gretap_dev(const struct net_device *dev); + static inline int gre_calc_hlen(__be16 o_flags) { int addend = 4; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index e496afa47709..0fe1d69b5df4 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1323,6 +1323,12 @@ static void ipgre_tap_setup(struct net_device *dev) ip_tunnel_setup(dev, gre_tap_net_id); } +bool is_gretap_dev(const struct net_device *dev) +{ + return dev->netdev_ops == &gre_tap_netdev_ops; +} +EXPORT_SYMBOL_GPL(is_gretap_dev); + static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3026662a6413..4f150a394387 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1785,6 +1785,12 @@ static void ip6gre_tap_setup(struct net_device *dev) netif_keep_dst(dev); } +bool is_ip6gretap_dev(const struct net_device *dev) +{ + return dev->netdev_ops == &ip6gre_tap_netdev_ops; +} +EXPORT_SYMBOL_GPL(is_ip6gretap_dev); + static bool ip6gre_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *ipencap) { -- cgit v1.2.3-70-g09d2 From b0066da52ea53bae2b4ceed3f47d488df27dab66 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 27 Feb 2018 14:53:38 +0100 Subject: ip_tunnel: Rename & publish init_tunnel_flow Initializing struct flowi4 is useful for drivers that need to emulate routing decisions made by a tunnel interface. Publish the function (appropriately renamed) so that the drivers in question don't need to cut'n'paste it around. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 16 ++++++++++++++++ net/ipv4/ip_tunnel.c | 40 ++++++++++++---------------------------- 2 files changed, 28 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 1f16773cfd76..cbe5addb9293 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -254,6 +254,22 @@ static inline __be32 tunnel_id_to_key32(__be64 tun_id) #ifdef CONFIG_INET +static inline void ip_tunnel_init_flow(struct flowi4 *fl4, + int proto, + __be32 daddr, __be32 saddr, + __be32 key, __u8 tos, int oif, + __u32 mark) +{ + memset(fl4, 0, sizeof(*fl4)); + fl4->flowi4_oif = oif; + fl4->daddr = daddr; + fl4->saddr = saddr; + fl4->flowi4_tos = tos; + fl4->flowi4_proto = proto; + fl4->fl4_gre_key = key; + fl4->flowi4_mark = mark; +} + int ip_tunnel_init(struct net_device *dev); void ip_tunnel_uninit(struct net_device *dev); void ip_tunnel_dellink(struct net_device *dev, struct list_head *head); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index d786a8441bce..b2117d89bc83 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -290,22 +290,6 @@ failed: return ERR_PTR(err); } -static inline void init_tunnel_flow(struct flowi4 *fl4, - int proto, - __be32 daddr, __be32 saddr, - __be32 key, __u8 tos, int oif, - __u32 mark) -{ - memset(fl4, 0, sizeof(*fl4)); - fl4->flowi4_oif = oif; - fl4->daddr = daddr; - fl4->saddr = saddr; - fl4->flowi4_tos = tos; - fl4->flowi4_proto = proto; - fl4->fl4_gre_key = key; - fl4->flowi4_mark = mark; -} - static int ip_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; @@ -322,10 +306,10 @@ static int ip_tunnel_bind_dev(struct net_device *dev) struct flowi4 fl4; struct rtable *rt; - init_tunnel_flow(&fl4, iph->protocol, iph->daddr, - iph->saddr, tunnel->parms.o_key, - RT_TOS(iph->tos), tunnel->parms.link, - tunnel->fwmark); + ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, + iph->saddr, tunnel->parms.o_key, + RT_TOS(iph->tos), tunnel->parms.link, + tunnel->fwmark); rt = ip_route_output_key(tunnel->net, &fl4); if (!IS_ERR(rt)) { @@ -581,8 +565,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) else if (skb->protocol == htons(ETH_P_IPV6)) tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } - init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, - RT_TOS(tos), tunnel->parms.link, tunnel->fwmark); + ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, + RT_TOS(tos), tunnel->parms.link, tunnel->fwmark); if (tunnel->encap.type != TUNNEL_ENCAP_NONE) goto tx_error; rt = ip_route_output_key(tunnel->net, &fl4); @@ -711,14 +695,14 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } if (tunnel->fwmark) { - init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, - tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, - tunnel->fwmark); + ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, + tunnel->parms.o_key, RT_TOS(tos), + tunnel->parms.link, tunnel->fwmark); } else { - init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, - tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, - skb->mark); + ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, + tunnel->parms.o_key, RT_TOS(tos), + tunnel->parms.link, skb->mark); } if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) -- cgit v1.2.3-70-g09d2 From 82695b30ffeeab665f41416c6f5015dea3147bd5 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 27 Feb 2018 15:48:21 -0800 Subject: inet: whitespace cleanup Ran simple script to find/remove trailing whitespace and blank lines at EOF because that kind of stuff git whines about and editors leave behind. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/ethoc.h | 1 - include/net/flow.h | 2 +- include/net/inet_connection_sock.h | 10 +++++----- include/net/ip.h | 12 ++++++------ include/net/ip_fib.h | 2 +- include/net/ipv6.h | 10 +++++----- include/net/xfrm.h | 14 +++++++------- net/ipv4/fib_semantics.c | 2 +- net/ipv4/proc.c | 1 - net/ipv4/tunnel4.c | 2 +- net/ipv4/xfrm4_policy.c | 1 - net/ipv6/anycast.c | 1 - net/ipv6/exthdrs_core.c | 1 - net/ipv6/ipv6_sockglue.c | 1 - net/ipv6/proc.c | 1 - net/ipv6/xfrm6_state.c | 1 - 16 files changed, 27 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/include/net/ethoc.h b/include/net/ethoc.h index bb7f467da7fc..29ba069a1d93 100644 --- a/include/net/ethoc.h +++ b/include/net/ethoc.h @@ -21,4 +21,3 @@ struct ethoc_platform_data { }; #endif /* !LINUX_NET_ETHOC_H */ - diff --git a/include/net/flow.h b/include/net/flow.h index f1624fd5b1d0..64e7ee9cb980 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -125,7 +125,7 @@ static inline void flowi4_update_output(struct flowi4 *fl4, int oif, __u8 tos, fl4->daddr = daddr; fl4->saddr = saddr; } - + struct flowi6 { struct flowi_common __fl_common; diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c1a93ce35e62..b68fea022a82 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -49,9 +49,9 @@ struct inet_connection_sock_af_ops { u16 net_header_len; u16 net_frag_header_len; u16 sockaddr_len; - int (*setsockopt)(struct sock *sk, int level, int optname, + int (*setsockopt)(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); - int (*getsockopt)(struct sock *sk, int level, int optname, + int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); #ifdef CONFIG_COMPAT int (*compat_setsockopt)(struct sock *sk, @@ -67,7 +67,7 @@ struct inet_connection_sock_af_ops { /** inet_connection_sock - INET connection oriented sock * - * @icsk_accept_queue: FIFO of established children + * @icsk_accept_queue: FIFO of established children * @icsk_bind_hash: Bind node * @icsk_timeout: Timeout * @icsk_retransmit_timer: Resend (no ack) @@ -122,7 +122,7 @@ struct inet_connection_sock { unsigned long timeout; /* Currently scheduled timeout */ __u32 lrcvtime; /* timestamp of last received data packet */ __u16 last_seg_size; /* Size of last incoming segment */ - __u16 rcv_mss; /* MSS used for delayed ACK decisions */ + __u16 rcv_mss; /* MSS used for delayed ACK decisions */ } icsk_ack; struct { int enabled; @@ -201,7 +201,7 @@ extern const char inet_csk_timer_bug_msg[]; static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) { struct inet_connection_sock *icsk = inet_csk(sk); - + if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { icsk->icsk_pending = 0; #ifdef INET_CSK_CLEAR_TIMERS diff --git a/include/net/ip.h b/include/net/ip.h index 746abff9ce51..fe63ba95d12b 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -186,15 +186,15 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); void ip4_datagram_release_cb(struct sock *sk); struct ip_reply_arg { - struct kvec iov[1]; + struct kvec iov[1]; int flags; __wsum csum; int csumoffset; /* u16 offset of csum in iov[0].iov_base */ - /* -1 if not needed */ + /* -1 if not needed */ int bound_dev_if; u8 tos; kuid_t uid; -}; +}; #define IP_REPLY_ARG_NOSRCCHECK 1 @@ -577,13 +577,13 @@ int ip_frag_mem(struct net *net); /* * Functions provided by ip_forward.c */ - + int ip_forward(struct sk_buff *skb); - + /* * Functions provided by ip_options.c */ - + void ip_options_build(struct sk_buff *skb, struct ip_options *opt, __be32 daddr, struct rtable *rt, int is_frag); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index f80524396c06..15e19c5c6f26 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -157,7 +157,7 @@ struct fib_result_nl { unsigned char nh_sel; unsigned char type; unsigned char scope; - int err; + int err; }; #ifdef CONFIG_IP_ROUTE_MULTIPATH diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7a98cd583c73..cabd3cdd4015 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -105,8 +105,8 @@ #define IPV6_ADDR_ANY 0x0000U -#define IPV6_ADDR_UNICAST 0x0001U -#define IPV6_ADDR_MULTICAST 0x0002U +#define IPV6_ADDR_UNICAST 0x0001U +#define IPV6_ADDR_MULTICAST 0x0002U #define IPV6_ADDR_LOOPBACK 0x0010U #define IPV6_ADDR_LINKLOCAL 0x0020U @@ -447,7 +447,7 @@ ipv6_masked_addr_cmp(const struct in6_addr *a1, const struct in6_addr *m, #endif } -static inline void ipv6_addr_prefix(struct in6_addr *pfx, +static inline void ipv6_addr_prefix(struct in6_addr *pfx, const struct in6_addr *addr, int plen) { @@ -496,7 +496,7 @@ static inline void __ipv6_addr_set_half(__be32 *addr, addr[1] = wl; } -static inline void ipv6_addr_set(struct in6_addr *addr, +static inline void ipv6_addr_set(struct in6_addr *addr, __be32 w1, __be32 w2, __be32 w3, __be32 w4) { @@ -732,7 +732,7 @@ static inline int __ipv6_addr_diff32(const void *token1, const void *token2, int } /* - * we should *never* get to this point since that + * we should *never* get to this point since that * would mean the addrs are equal * * However, we do get to it 8) And exacly, when diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 7d2077665c0b..aa027ba1d032 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1267,12 +1267,12 @@ static inline void xfrm_sk_free_policy(struct sock *sk) static inline void xfrm_sk_free_policy(struct sock *sk) {} static inline int xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { return 0; } -static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; } -static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; } +static inline int xfrm6_route_forward(struct sk_buff *skb) { return 1; } +static inline int xfrm4_route_forward(struct sk_buff *skb) { return 1; } static inline int xfrm6_policy_check(struct sock *sk, int dir, struct sk_buff *skb) -{ - return 1; -} +{ + return 1; +} static inline int xfrm4_policy_check(struct sock *sk, int dir, struct sk_buff *skb) { return 1; @@ -1356,7 +1356,7 @@ __xfrm6_state_addr_check(const struct xfrm_state *x, { if (ipv6_addr_equal((struct in6_addr *)daddr, (struct in6_addr *)&x->id.daddr) && (ipv6_addr_equal((struct in6_addr *)saddr, (struct in6_addr *)&x->props.saddr) || - ipv6_addr_any((struct in6_addr *)saddr) || + ipv6_addr_any((struct in6_addr *)saddr) || ipv6_addr_any((struct in6_addr *)&x->props.saddr))) return 1; return 0; @@ -1666,7 +1666,7 @@ int xfrm_user_policy(struct sock *sk, int optname, static inline int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen) { return -ENOPROTOOPT; -} +} static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) { diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index cd46d7666598..f31e6575ab91 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -171,7 +171,7 @@ static void free_nh_exceptions(struct fib_nh *nh) fnhe = rcu_dereference_protected(hash[i].chain, 1); while (fnhe) { struct fib_nh_exception *next; - + next = rcu_dereference_protected(fnhe->fnhe_next, 1); rt_fibinfo_free(&fnhe->fnhe_rth_input); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index fdabc70283b6..d97e83b2dd33 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -556,4 +556,3 @@ int __init ip_misc_proc_init(void) { return register_pernet_subsys(&ip_proc_ops); } - diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index ec35eaa5c029..c0630013c1ae 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -90,7 +90,7 @@ EXPORT_SYMBOL(xfrm4_tunnel_deregister); for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ - + static int tunnel4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 796ac4115485..0c752dc3f93b 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -379,4 +379,3 @@ void __init xfrm4_init(void) xfrm4_protocol_init(); register_pernet_subsys(&xfrm4_net_ops); } - diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 8e085cc05aeb..d7d0abc7fd0e 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -552,4 +552,3 @@ void ac6_proc_exit(struct net *net) remove_proc_entry("anycast6", net->proc_net); } #endif - diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 11025f8d124b..b643f5ce6c80 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -279,4 +279,3 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, return nexthdr; } EXPORT_SYMBOL(ipv6_find_hdr); - diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 24535169663d..4d780c7f0130 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1415,4 +1415,3 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, } EXPORT_SYMBOL(compat_ipv6_getsockopt); #endif - diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index b8858c546f41..1678cf037688 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -355,4 +355,3 @@ void ipv6_misc_proc_exit(void) { unregister_pernet_subsys(&ipv6_proc_ops); } - diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index b15075a5c227..16f434791763 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -196,4 +196,3 @@ void xfrm6_state_fini(void) { xfrm_state_unregister_afinfo(&xfrm6_state_afinfo); } - -- cgit v1.2.3-70-g09d2 From fd5ac14a1aae63ef95b22cc2eb23e3a25b3436be Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 28 Feb 2018 10:45:03 +0100 Subject: net: sch: Don't warn on missmatching qlen and backlog for offloaded qdiscs Offloaded qdiscs are allowed to expose only parts of their statistics. It means that if backlog is being exposed and qlen is not, it might trigger a warning in qdisc_tree_reduce_backlog. Do not warn in case the qdisc that was removed was an offloaded one. Signed-off-by: Nogah Frankel Reviewed-by: Yuval Mintz Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/sch_api.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 27e672c12492..68f9d942bed4 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -739,6 +739,7 @@ static u32 qdisc_alloc_handle(struct net_device *dev) void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n, unsigned int len) { + bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; @@ -760,8 +761,12 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n, * If child was empty even before update then backlog * counter is screwed and we skip notification because * parent class is already passive. + * + * If the original child was offloaded then it is allowed + * to be seem as empty, so the parent is notified anyway. */ - notify = !sch->q.qlen && !WARN_ON_ONCE(!n); + notify = !sch->q.qlen && !WARN_ON_ONCE(!n && + !qdisc_is_offloaded); /* TODO: perform the search on a per txq basis */ sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { -- cgit v1.2.3-70-g09d2 From 98ceb7b6d64552f995973be1a0ee9af0bf85fb3d Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 28 Feb 2018 10:45:05 +0100 Subject: mlxsw: spectrum: qdiscs: prio: Delete child qdiscs when removing bands When the number the bands of sch_prio is decreased, child qdiscs on the deleted bands would get deleted as well. This change and deletions are being done under sch_tree_lock of the sch_prio qdisc. Part of the destruction of qdisc is unoffloading it, if it is offloaded. Un-offloading can't be done inside this lock. Move the offload command to be done before reducing the number of bands, so unoffloading of the qdiscs that are about to be deleted could be done outside of the lock. Signed-off-by: Nogah Frankel Reviewed-by: Yuval Mintz Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c | 7 ++++++- net/sched/sch_prio.c | 13 ++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c index bed7495d35d6..a2a3ac09c3bc 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c @@ -547,7 +547,12 @@ mlxsw_sp_qdisc_prio_replace(struct mlxsw_sp_port *mlxsw_sp_port, child_qdisc->stats_base.backlog = backlog; } } - + for (; band < IEEE_8021QAZ_MAX_TCS; band++) { + tclass = MLXSW_SP_PRIO_BAND_TO_TCLASS(band); + child_qdisc = &mlxsw_sp_port->tclass_qdiscs[tclass]; + child_qdisc->prio_bitmap = 0; + mlxsw_sp_qdisc_destroy(mlxsw_sp_port, child_qdisc); + } return 0; } diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index efbf51f35778..ba2d6d17d95a 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -142,9 +142,8 @@ prio_reset(struct Qdisc *sch) sch->q.qlen = 0; } -static int prio_offload(struct Qdisc *sch, bool enable) +static int prio_offload(struct Qdisc *sch, struct tc_prio_qopt *qopt) { - struct prio_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_prio_qopt_offload opt = { .handle = sch->handle, @@ -154,10 +153,10 @@ static int prio_offload(struct Qdisc *sch, bool enable) if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; - if (enable) { + if (qopt) { opt.command = TC_PRIO_REPLACE; - opt.replace_params.bands = q->bands; - memcpy(&opt.replace_params.priomap, q->prio2band, + opt.replace_params.bands = qopt->bands; + memcpy(&opt.replace_params.priomap, qopt->priomap, TC_PRIO_MAX + 1); opt.replace_params.qstats = &sch->qstats; } else { @@ -174,7 +173,7 @@ prio_destroy(struct Qdisc *sch) struct prio_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); - prio_offload(sch, false); + prio_offload(sch, NULL); for (prio = 0; prio < q->bands; prio++) qdisc_destroy(q->queues[prio]); } @@ -211,6 +210,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, } } + prio_offload(sch, qopt); sch_tree_lock(sch); q->bands = qopt->bands; memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); @@ -230,7 +230,6 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, } sch_tree_unlock(sch); - prio_offload(sch, true); return 0; } -- cgit v1.2.3-70-g09d2 From b9c7a7acc749f3d0667a2ab44ea38110d5a1f286 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Wed, 28 Feb 2018 10:45:06 +0100 Subject: net: sch: prio: Add offload ability for grafting a child Offload sch_prio graft command for capable drivers. Warn in case of a failure, unless the graft was done as part of a destroy operation (the new qdisc is a noop) or if all the qdiscs (the parent, the old child, and the new one) are not offloaded. Signed-off-by: Nogah Frankel Reviewed-by: Yuval Mintz Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 8 ++++++++ net/sched/sch_prio.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'net') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 87406252f0a3..e828d31be5da 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -806,6 +806,7 @@ enum tc_prio_command { TC_PRIO_REPLACE, TC_PRIO_DESTROY, TC_PRIO_STATS, + TC_PRIO_GRAFT, }; struct tc_prio_qopt_offload_params { @@ -818,6 +819,11 @@ struct tc_prio_qopt_offload_params { struct gnet_stats_queue *qstats; }; +struct tc_prio_qopt_offload_graft_params { + u8 band; + u32 child_handle; +}; + struct tc_prio_qopt_offload { enum tc_prio_command command; u32 handle; @@ -825,6 +831,8 @@ struct tc_prio_qopt_offload { union { struct tc_prio_qopt_offload_params replace_params; struct tc_qopt_offload_stats stats; + struct tc_prio_qopt_offload_graft_params graft_params; }; }; + #endif diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index ba2d6d17d95a..222e53d3d27a 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -308,12 +308,44 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); + struct tc_prio_qopt_offload graft_offload; + struct net_device *dev = qdisc_dev(sch); unsigned long band = arg - 1; + bool any_qdisc_is_offloaded; + int err; if (new == NULL) new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->queues[band]); + + if (!tc_can_offload(dev)) + return 0; + + graft_offload.handle = sch->handle; + graft_offload.parent = sch->parent; + graft_offload.graft_params.band = band; + graft_offload.graft_params.child_handle = new->handle; + graft_offload.command = TC_PRIO_GRAFT; + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO, + &graft_offload); + + /* Don't report error if the graft is part of destroy operation. */ + if (err && new != &noop_qdisc) { + /* Don't report error if the parent, the old child and the new + * one are not offloaded. + */ + any_qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; + any_qdisc_is_offloaded |= new->flags & TCQ_F_OFFLOADED; + if (*old) + any_qdisc_is_offloaded |= (*old)->flags & + TCQ_F_OFFLOADED; + + if (any_qdisc_is_offloaded) + NL_SET_ERR_MSG(extack, "Offloading graft operation failed."); + } + return 0; } -- cgit v1.2.3-70-g09d2 From bfff4862653bb96001ab57c1edd6d03f48e5f035 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 28 Feb 2018 22:40:16 -0500 Subject: net: fib_rules: support for match on ip_proto, sport and dport uapi for ip_proto, sport and dport range match in fib rules. Signed-off-by: Roopa Prabhu Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/fib_rules.h | 36 ++++++++++++++++- include/uapi/linux/fib_rules.h | 8 ++++ net/core/fib_rules.c | 92 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 133 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index b3d216249240..6dd0a00653ae 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -27,7 +27,7 @@ struct fib_rule { u8 action; u8 l3mdev; u8 proto; - /* 1 byte hole, try to use */ + u8 ip_proto; u32 target; __be64 tun_id; struct fib_rule __rcu *ctarget; @@ -40,6 +40,8 @@ struct fib_rule { char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; struct fib_kuid_range uid_range; + struct fib_rule_port_range sport_range; + struct fib_rule_port_range dport_range; struct rcu_head rcu; }; @@ -144,6 +146,38 @@ static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla) return frh->table; } +static inline bool fib_rule_port_range_set(const struct fib_rule_port_range *range) +{ + return range->start != 0 && range->end != 0; +} + +static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a, + __be16 port) +{ + return ntohs(port) >= a->start && + ntohs(port) <= a->end; +} + +static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a) +{ + return a->start != 0 && a->end != 0 && a->end < 0xffff && + a->start <= a->end; +} + +static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a, + struct fib_rule_port_range *b) +{ + return a->start == b->start && + a->end == b->end; +} + +static inline bool fib_rule_requires_fldissect(struct fib_rule *rule) +{ + return rule->ip_proto || + fib_rule_port_range_set(&rule->sport_range) || + fib_rule_port_range_set(&rule->dport_range); +} + struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *, struct net *); void fib_rules_unregister(struct fib_rules_ops *); diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 77d90ae38114..232df14e1287 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -35,6 +35,11 @@ struct fib_rule_uid_range { __u32 end; }; +struct fib_rule_port_range { + __u16 start; + __u16 end; +}; + enum { FRA_UNSPEC, FRA_DST, /* destination address */ @@ -59,6 +64,9 @@ enum { FRA_L3MDEV, /* iif or oif is l3mdev goto its table */ FRA_UID_RANGE, /* UID range */ FRA_PROTOCOL, /* Originator of the rule */ + FRA_IP_PROTO, /* ip proto */ + FRA_SPORT_RANGE, /* sport */ + FRA_DPORT_RANGE, /* dport */ __FRA_MAX }; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index a6aea805a0a2..f6f04fc0f629 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -33,6 +33,10 @@ bool fib_rule_matchall(const struct fib_rule *rule) if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) return false; + if (fib_rule_port_range_set(&rule->sport_range)) + return false; + if (fib_rule_port_range_set(&rule->dport_range)) + return false; return true; } EXPORT_SYMBOL_GPL(fib_rule_matchall); @@ -221,6 +225,26 @@ static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range) return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out); } +static int nla_get_port_range(struct nlattr *pattr, + struct fib_rule_port_range *port_range) +{ + const struct fib_rule_port_range *pr = nla_data(pattr); + + if (!fib_rule_port_range_valid(pr)) + return -EINVAL; + + port_range->start = pr->start; + port_range->end = pr->end; + + return 0; +} + +static int nla_put_port_range(struct sk_buff *skb, int attrtype, + struct fib_rule_port_range *range) +{ + return nla_put(skb, attrtype, sizeof(*range), range); +} + static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) @@ -425,6 +449,17 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, !uid_eq(r->uid_range.end, rule->uid_range.end)) continue; + if (r->ip_proto != rule->ip_proto) + continue; + + if (!fib_rule_port_range_compare(&r->sport_range, + &rule->sport_range)) + continue; + + if (!fib_rule_port_range_compare(&r->dport_range, + &rule->dport_range)) + continue; + if (!ops->compare(r, frh, tb)) continue; return 1; @@ -569,6 +604,23 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, rule->uid_range = fib_kuid_range_unset; } + if (tb[FRA_IP_PROTO]) + rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]); + + if (tb[FRA_SPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_SPORT_RANGE], + &rule->sport_range); + if (err) + goto errout_free; + } + + if (tb[FRA_DPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_DPORT_RANGE], + &rule->dport_range); + if (err) + goto errout_free; + } + if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; @@ -634,6 +686,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); + struct fib_rule_port_range sprange = {0, 0}; + struct fib_rule_port_range dprange = {0, 0}; struct fib_rules_ops *ops = NULL; struct fib_rule *rule, *r; struct nlattr *tb[FRA_MAX+1]; @@ -667,6 +721,20 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, range = fib_kuid_range_unset; } + if (tb[FRA_SPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_SPORT_RANGE], + &sprange); + if (err) + goto errout; + } + + if (tb[FRA_DPORT_RANGE]) { + err = nla_get_port_range(tb[FRA_DPORT_RANGE], + &dprange); + if (err) + goto errout; + } + list_for_each_entry(rule, &ops->rules_list, list) { if (tb[FRA_PROTOCOL] && (rule->proto != nla_get_u8(tb[FRA_PROTOCOL]))) @@ -712,6 +780,18 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, !uid_eq(rule->uid_range.end, range.end))) continue; + if (tb[FRA_IP_PROTO] && + (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO]))) + continue; + + if (fib_rule_port_range_set(&sprange) && + !fib_rule_port_range_compare(&rule->sport_range, &sprange)) + continue; + + if (fib_rule_port_range_set(&dprange) && + !fib_rule_port_range_compare(&rule->dport_range, &dprange)) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -790,7 +870,10 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_FWMASK */ + nla_total_size_64bit(8) /* FRA_TUN_ID */ + nla_total_size(sizeof(struct fib_kuid_range)) - + nla_total_size(1); /* FRA_PROTOCOL */ + + nla_total_size(1) /* FRA_PROTOCOL */ + + nla_total_size(1) /* FRA_IP_PROTO */ + + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ + + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -855,7 +938,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->l3mdev && nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) || (uid_range_set(&rule->uid_range) && - nla_put_uid_range(skb, &rule->uid_range))) + nla_put_uid_range(skb, &rule->uid_range)) || + (fib_rule_port_range_set(&rule->sport_range) && + nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || + (fib_rule_port_range_set(&rule->dport_range) && + nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || + (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { -- cgit v1.2.3-70-g09d2 From 4a2d73a4fb36ed4d73967bd3892cfab014a52ab2 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 28 Feb 2018 22:41:06 -0500 Subject: ipv4: fib_rules: support match on sport, dport and ip proto support to match on src port, dst port and ip protocol. Signed-off-by: Roopa Prabhu Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/fib_rules.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 35d646a62ad4..16083b82954b 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -182,6 +182,17 @@ static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) if (r->tos && (r->tos != fl4->flowi4_tos)) return 0; + if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto)) + return 0; + + if (fib_rule_port_range_set(&rule->sport_range) && + !fib_rule_port_inrange(&rule->sport_range, fl4->fl4_sport)) + return 0; + + if (fib_rule_port_range_set(&rule->dport_range) && + !fib_rule_port_inrange(&rule->dport_range, fl4->fl4_dport)) + return 0; + return 1; } -- cgit v1.2.3-70-g09d2 From bb0ad1987e963e47a02cc53b8275ffe2b38d4b70 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 28 Feb 2018 22:41:37 -0500 Subject: ipv6: fib6_rules: support for match on sport, dport and ip proto support to match on src port, dst port and ip protocol. Signed-off-by: Roopa Prabhu Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv6/fib6_rules.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 95a2c9e8699a..bcd1f22ac7b1 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -223,6 +223,17 @@ static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) if (r->tclass && r->tclass != ip6_tclass(fl6->flowlabel)) return 0; + if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto)) + return 0; + + if (fib_rule_port_range_set(&rule->sport_range) && + !fib_rule_port_inrange(&rule->sport_range, fl6->fl6_sport)) + return 0; + + if (fib_rule_port_range_set(&rule->dport_range) && + !fib_rule_port_inrange(&rule->dport_range, fl6->fl6_dport)) + return 0; + return 1; } -- cgit v1.2.3-70-g09d2 From e37b1e978bec5334dc379d8c2423d063af207430 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 28 Feb 2018 22:42:41 -0500 Subject: ipv6: route: dissect flow in input path if fib rules need it Dissect flow in fwd path if fib rules require it. Controlled by a flag to avoid penatly for the common case. Flag is set when fib rules with sport, dport and proto match that require flow dissect are installed. Also passes the dissected hash keys to the multipath hash function when applicable to avoid dissecting the flow again. icmp packets will continue to use inner header for hash calculations (Thanks to Nikolay Aleksandrov for some review here). Signed-off-by: Roopa Prabhu Acked-by: Paolo Abeni Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/ip_fib.h | 27 ++++++++++++++++++++++++++- include/net/netns/ipv4.h | 1 + net/ipv4/fib_rules.c | 8 ++++++++ net/ipv4/fib_semantics.c | 2 +- net/ipv4/route.c | 43 +++++++++++++++++++++++++++++-------------- 5 files changed, 65 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 15e19c5c6f26..8812582a94d5 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -293,6 +293,13 @@ static inline unsigned int fib4_rules_seq_read(struct net *net) return 0; } +static inline bool fib4_rules_early_flow_dissect(struct net *net, + struct sk_buff *skb, + struct flowi4 *fl4, + struct flow_keys *flkeys) +{ + return false; +} #else /* CONFIG_IP_MULTIPLE_TABLES */ int __net_init fib4_rules_init(struct net *net); void __net_exit fib4_rules_exit(struct net *net); @@ -341,6 +348,24 @@ bool fib4_rule_default(const struct fib_rule *rule); int fib4_rules_dump(struct net *net, struct notifier_block *nb); unsigned int fib4_rules_seq_read(struct net *net); +static inline bool fib4_rules_early_flow_dissect(struct net *net, + struct sk_buff *skb, + struct flowi4 *fl4, + struct flow_keys *flkeys) +{ + unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; + + if (!net->ipv4.fib_rules_require_fldissect) + return false; + + skb_flow_dissect_flow_keys(skb, flkeys, flag); + fl4->fl4_sport = flkeys->ports.src; + fl4->fl4_dport = flkeys->ports.dst; + fl4->flowi4_proto = flkeys->basic.ip_proto; + + return true; +} + #endif /* CONFIG_IP_MULTIPLE_TABLES */ /* Exported by fib_frontend.c */ @@ -371,7 +396,7 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags); #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, - const struct sk_buff *skb); + const struct sk_buff *skb, struct flow_keys *flkeys); #endif void fib_select_multipath(struct fib_result *res, int hash); void fib_select_path(struct net *net, struct fib_result *res, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 44668c29701a..3a970e429ab6 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -52,6 +52,7 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; bool fib_has_custom_rules; + unsigned int fib_rules_require_fldissect; struct fib_table __rcu *fib_main; struct fib_table __rcu *fib_default; #endif diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 16083b82954b..737d11bc8838 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -255,6 +255,9 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, } #endif + if (fib_rule_requires_fldissect(rule)) + net->ipv4.fib_rules_require_fldissect++; + rule4->src_len = frh->src_len; rule4->srcmask = inet_make_mask(rule4->src_len); rule4->dst_len = frh->dst_len; @@ -283,6 +286,10 @@ static int fib4_rule_delete(struct fib_rule *rule) net->ipv4.fib_num_tclassid_users--; #endif net->ipv4.fib_has_custom_rules = true; + + if (net->ipv4.fib_rules_require_fldissect && + fib_rule_requires_fldissect(rule)) + net->ipv4.fib_rules_require_fldissect--; errout: return err; } @@ -400,6 +407,7 @@ int __net_init fib4_rules_init(struct net *net) goto fail; net->ipv4.rules_ops = ops; net->ipv4.fib_has_custom_rules = false; + net->ipv4.fib_rules_require_fldissect = 0; return 0; fail: diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f31e6575ab91..181b0d8d589c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1770,7 +1770,7 @@ void fib_select_path(struct net *net, struct fib_result *res, #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi->fib_nhs > 1) { - int h = fib_multipath_hash(res->fi, fl4, skb); + int h = fib_multipath_hash(res->fi, fl4, skb, NULL); fib_select_multipath(res, h); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 26eefa2eaa44..3bb686dac273 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1783,7 +1783,7 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb, /* if skb is set it will be used and fl4 can be NULL */ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, - const struct sk_buff *skb) + const struct sk_buff *skb, struct flow_keys *flkeys) { struct net *net = fi->fib_net; struct flow_keys hash_keys; @@ -1810,14 +1810,23 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; memset(&hash_keys, 0, sizeof(hash_keys)); - skb_flow_dissect_flow_keys(skb, &keys, flag); - hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; - hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; - hash_keys.ports.src = keys.ports.src; - hash_keys.ports.dst = keys.ports.dst; - hash_keys.basic.ip_proto = keys.basic.ip_proto; + if (flkeys) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; + hash_keys.ports.src = flkeys->ports.src; + hash_keys.ports.dst = flkeys->ports.dst; + hash_keys.basic.ip_proto = flkeys->basic.ip_proto; + } else { + skb_flow_dissect_flow_keys(skb, &keys, flag); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + hash_keys.ports.src = keys.ports.src; + hash_keys.ports.dst = keys.ports.dst; + hash_keys.basic.ip_proto = keys.basic.ip_proto; + } } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; @@ -1838,11 +1847,12 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos) + __be32 daddr, __be32 saddr, u32 tos, + struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) { - int h = fib_multipath_hash(res->fi, NULL, skb); + int h = fib_multipath_hash(res->fi, NULL, skb, hkeys); fib_select_multipath(res, h); } @@ -1868,13 +1878,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct fib_result *res) { struct in_device *in_dev = __in_dev_get_rcu(dev); + struct flow_keys *flkeys = NULL, _flkeys; + struct net *net = dev_net(dev); struct ip_tunnel_info *tun_info; - struct flowi4 fl4; + int err = -EINVAL; unsigned int flags = 0; u32 itag = 0; struct rtable *rth; - int err = -EINVAL; - struct net *net = dev_net(dev); + struct flowi4 fl4; bool do_cache; /* IP on this device is disabled. */ @@ -1933,6 +1944,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); + + if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) + flkeys = &_flkeys; + err = fib_lookup(net, &fl4, res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) @@ -1958,7 +1973,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res->type != RTN_UNICAST) goto martian_destination; - err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos); + err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys); out: return err; brd_input: -- cgit v1.2.3-70-g09d2 From 5e5d6fed374155ba1a7a5ca5f12fbec2285d06a2 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 28 Feb 2018 22:43:22 -0500 Subject: ipv6: route: dissect flow in input path if fib rules need it Dissect flow in fwd path if fib rules require it. Controlled by a flag to avoid penatly for the common case. Flag is set when fib rules with sport, dport and proto match that require flow dissect are installed. Also passes the dissected hash keys to the multipath hash function when applicable to avoid dissecting the flow again. icmp packets will continue to use inner header for hash calculations. Signed-off-by: Roopa Prabhu Acked-by: Paolo Abeni Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 25 +++++++++++++++++++++++++ include/net/ip6_route.h | 4 +++- include/net/netns/ipv6.h | 3 ++- net/ipv6/fib6_rules.c | 16 ++++++++++++++++ net/ipv6/icmp.c | 2 +- net/ipv6/route.c | 34 +++++++++++++++++++++++++--------- 6 files changed, 72 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 34ec321d6a03..8d906a35b534 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -415,6 +415,24 @@ void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); int fib6_rules_dump(struct net *net, struct notifier_block *nb); unsigned int fib6_rules_seq_read(struct net *net); + +static inline bool fib6_rules_early_flow_dissect(struct net *net, + struct sk_buff *skb, + struct flowi6 *fl6, + struct flow_keys *flkeys) +{ + unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; + + if (!net->ipv6.fib6_rules_require_fldissect) + return false; + + skb_flow_dissect_flow_keys(skb, flkeys, flag); + fl6->fl6_sport = flkeys->ports.src; + fl6->fl6_dport = flkeys->ports.dst; + fl6->flowi6_proto = flkeys->basic.ip_proto; + + return true; +} #else static inline int fib6_rules_init(void) { @@ -436,5 +454,12 @@ static inline unsigned int fib6_rules_seq_read(struct net *net) { return 0; } +static inline bool fib6_rules_early_flow_dissect(struct net *net, + struct sk_buff *skb, + struct flowi6 *fl6, + struct flow_keys *flkeys) +{ + return false; +} #endif #endif diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 27d23a65f3cd..da2bde5fda8f 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -127,7 +127,8 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, int flags); -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb); +u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, + struct flow_keys *hkeys); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); @@ -266,4 +267,5 @@ static inline bool rt6_duplicate_nexthop(struct rt6_info *a, struct rt6_info *b) ipv6_addr_equal(&a->rt6i_gateway, &b->rt6i_gateway) && !lwtunnel_cmp_encap(a->dst.lwtstate, b->dst.lwtstate); } + #endif diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 987cc4569cb8..2b9194229a56 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -71,7 +71,8 @@ struct netns_ipv6 { unsigned int ip6_rt_gc_expire; unsigned long ip6_rt_last_gc; #ifdef CONFIG_IPV6_MULTIPLE_TABLES - bool fib6_has_custom_rules; + unsigned int fib6_rules_require_fldissect; + bool fib6_has_custom_rules; struct rt6_info *ip6_prohibit_entry; struct rt6_info *ip6_blk_hole_entry; struct fib6_table *fib6_local_tbl; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index bcd1f22ac7b1..04e5f523e50f 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -269,12 +269,26 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule6->dst.plen = frh->dst_len; rule6->tclass = frh->tos; + if (fib_rule_requires_fldissect(rule)) + net->ipv6.fib6_rules_require_fldissect++; + net->ipv6.fib6_has_custom_rules = true; err = 0; errout: return err; } +static int fib6_rule_delete(struct fib_rule *rule) +{ + struct net *net = rule->fr_net; + + if (net->ipv6.fib6_rules_require_fldissect && + fib_rule_requires_fldissect(rule)) + net->ipv6.fib6_rules_require_fldissect--; + + return 0; +} + static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { @@ -334,6 +348,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .match = fib6_rule_match, .suppress = fib6_rule_suppress, .configure = fib6_rule_configure, + .delete = fib6_rule_delete, .compare = fib6_rule_compare, .fill = fib6_rule_fill, .nlmsg_payload = fib6_rule_nlmsg_payload, @@ -361,6 +376,7 @@ static int __net_init fib6_rules_net_init(struct net *net) goto out_fib6_rules_ops; net->ipv6.fib6_rules_ops = ops; + net->ipv6.fib6_rules_require_fldissect = 0; out: return err; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 4fa4f1b150a4..b0778d323b6e 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -522,7 +522,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); - fl6.mp_hash = rt6_multipath_hash(&fl6, skb); + fl6.mp_hash = rt6_multipath_hash(&fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index aa709b644945..e2bb40824c85 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -460,7 +460,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, * case it will always be non-zero. Otherwise now is the time to do it. */ if (!fl6->mp_hash) - fl6->mp_hash = rt6_multipath_hash(fl6, NULL); + fl6->mp_hash = rt6_multipath_hash(fl6, NULL, NULL); if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) return match; @@ -1786,10 +1786,12 @@ struct dst_entry *ip6_route_input_lookup(struct net *net, EXPORT_SYMBOL_GPL(ip6_route_input_lookup); static void ip6_multipath_l3_keys(const struct sk_buff *skb, - struct flow_keys *keys) + struct flow_keys *keys, + struct flow_keys *flkeys) { const struct ipv6hdr *outer_iph = ipv6_hdr(skb); const struct ipv6hdr *key_iph = outer_iph; + struct flow_keys *_flkeys = flkeys; const struct ipv6hdr *inner_iph; const struct icmp6hdr *icmph; struct ipv6hdr _inner_iph; @@ -1811,22 +1813,31 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb, goto out; key_iph = inner_iph; + _flkeys = NULL; out: memset(keys, 0, sizeof(*keys)); keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; - keys->addrs.v6addrs.src = key_iph->saddr; - keys->addrs.v6addrs.dst = key_iph->daddr; - keys->tags.flow_label = ip6_flowinfo(key_iph); - keys->basic.ip_proto = key_iph->nexthdr; + if (_flkeys) { + keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; + keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; + keys->tags.flow_label = _flkeys->tags.flow_label; + keys->basic.ip_proto = _flkeys->basic.ip_proto; + } else { + keys->addrs.v6addrs.src = key_iph->saddr; + keys->addrs.v6addrs.dst = key_iph->daddr; + keys->tags.flow_label = ip6_flowinfo(key_iph); + keys->basic.ip_proto = key_iph->nexthdr; + } } /* if skb is set it will be used and fl6 can be NULL */ -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb) +u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, + struct flow_keys *flkeys) { struct flow_keys hash_keys; if (skb) { - ip6_multipath_l3_keys(skb, &hash_keys); + ip6_multipath_l3_keys(skb, &hash_keys, flkeys); return flow_hash_from_keys(&hash_keys) >> 1; } @@ -1847,12 +1858,17 @@ void ip6_route_input(struct sk_buff *skb) .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, }; + struct flow_keys *flkeys = NULL, _flkeys; tun_info = skb_tunnel_info(skb); if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; + + if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) + flkeys = &_flkeys; + if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) - fl6.mp_hash = rt6_multipath_hash(&fl6, skb); + fl6.mp_hash = rt6_multipath_hash(&fl6, skb, flkeys); skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); } -- cgit v1.2.3-70-g09d2 From 6853f21f764b04e58df5e44629fec1fb8f3cbf2e Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:29 +0200 Subject: ipmr,ipmr6: Define a uniform vif_device The two implementations have almost identical structures - vif_device and mif_device. As a step toward uniforming the mr_tables, eliminate the mif_device and relocate the vif_device definition into a new common header file. Also, introduce a common initializing function for setting most of the vif_device fields in a new common source file. This requires modifying the ipv{4,6] Kconfig and ipv4 makefile as we're introducing a new common config option - CONFIG_IP_MROUTE_COMMON. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 13 +----------- include/linux/mroute6.h | 11 +--------- include/linux/mroute_base.h | 52 +++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/Kconfig | 5 +++++ net/ipv4/Makefile | 1 + net/ipv4/ipmr.c | 32 +++++++++++++--------------- net/ipv4/ipmr_base.c | 28 ++++++++++++++++++++++++ net/ipv6/Kconfig | 1 + net/ipv6/ip6mr.c | 37 ++++++++++++-------------------- 9 files changed, 117 insertions(+), 63 deletions(-) create mode 100644 include/linux/mroute_base.h create mode 100644 net/ipv4/ipmr_base.c (limited to 'net') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 5396521a776a..b8aadffe6237 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef CONFIG_IP_MROUTE static inline int ip_mroute_opt(int opt) @@ -56,18 +57,6 @@ static inline bool ipmr_rule_default(const struct fib_rule *rule) } #endif -struct vif_device { - struct net_device *dev; /* Device we are using */ - struct netdev_phys_item_id dev_parent_id; /* Device parent ID */ - unsigned long bytes_in,bytes_out; - unsigned long pkt_in,pkt_out; /* Statistics */ - unsigned long rate_limit; /* Traffic shaping (NI) */ - unsigned char threshold; /* TTL threshold */ - unsigned short flags; /* Control flags */ - __be32 local,remote; /* Addresses(remote for tunnels)*/ - int link; /* Physical interface index */ -}; - struct vif_entry_notifier_info { struct fib_notifier_info info; struct net_device *dev; diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index 3014c52bfd86..e5e5b8282551 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -7,6 +7,7 @@ #include /* for struct sk_buff_head */ #include #include +#include #ifdef CONFIG_IPV6_MROUTE static inline int ip6_mroute_opt(int opt) @@ -62,16 +63,6 @@ static inline void ip6_mr_cleanup(void) } #endif -struct mif_device { - struct net_device *dev; /* Device we are using */ - unsigned long bytes_in,bytes_out; - unsigned long pkt_in,pkt_out; /* Statistics */ - unsigned long rate_limit; /* Traffic shaping (NI) */ - unsigned char threshold; /* TTL threshold */ - unsigned short flags; /* Control flags */ - int link; /* Physical interface index */ -}; - #define VIFF_STATIC 0x8000 struct mfc6_cache { diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h new file mode 100644 index 000000000000..0de651e15f27 --- /dev/null +++ b/include/linux/mroute_base.h @@ -0,0 +1,52 @@ +#ifndef __LINUX_MROUTE_BASE_H +#define __LINUX_MROUTE_BASE_H + +#include + +/** + * struct vif_device - interface representor for multicast routing + * @dev: network device being used + * @bytes_in: statistic; bytes ingressing + * @bytes_out: statistic; bytes egresing + * @pkt_in: statistic; packets ingressing + * @pkt_out: statistic; packets egressing + * @rate_limit: Traffic shaping (NI) + * @threshold: TTL threshold + * @flags: Control flags + * @link: Physical interface index + * @dev_parent_id: device parent id + * @local: Local address + * @remote: Remote address for tunnels + */ +struct vif_device { + struct net_device *dev; + unsigned long bytes_in, bytes_out; + unsigned long pkt_in, pkt_out; + unsigned long rate_limit; + unsigned char threshold; + unsigned short flags; + int link; + + /* Currently only used by ipmr */ + struct netdev_phys_item_id dev_parent_id; + __be32 local, remote; +}; + +#ifdef CONFIG_IP_MROUTE_COMMON +void vif_device_init(struct vif_device *v, + struct net_device *dev, + unsigned long rate_limit, + unsigned char threshold, + unsigned short flags, + unsigned short get_iflink_mask); +#else +static inline void vif_device_init(struct vif_device *v, + struct net_device *dev, + unsigned long rate_limit, + unsigned char threshold, + unsigned short flags, + unsigned short get_iflink_mask) +{ +} +#endif +#endif diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index f48fe6fc7e8c..80dad301361d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -212,9 +212,14 @@ config NET_IPGRE_BROADCAST Network), but can be distributed all over the Internet. If you want to do that, say Y here and to "IP multicast routing" below. +config IP_MROUTE_COMMON + bool + depends on IP_MROUTE || IPV6_MROUTE + config IP_MROUTE bool "IP: multicast routing" depends on IP_MULTICAST + select IP_MROUTE_COMMON help This is used if you want your machine to act as a router for IP packets that have several destination addresses. It is needed on the diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 47a0a6649a9d..a07b7dd06def 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o +obj-$(CONFIG_IP_MROUTE_COMMON) += ipmr_base.o obj-$(CONFIG_NET_IPIP) += ipip.o gre-y := gre_demux.o obj-$(CONFIG_NET_FOU) += fou.o diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 591d1fc80a1f..1370edad64bf 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -945,6 +945,10 @@ static int vif_add(struct net *net, struct mr_table *mrt, ip_rt_multicast_event(in_dev); /* Fill in the VIF structures */ + vif_device_init(v, dev, vifc->vifc_rate_limit, + vifc->vifc_threshold, + vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0), + (VIFF_TUNNEL | VIFF_REGISTER)); attr.orig_dev = dev; if (!switchdev_port_attr_get(dev, &attr)) { @@ -953,20 +957,9 @@ static int vif_add(struct net *net, struct mr_table *mrt, } else { v->dev_parent_id.id_len = 0; } - v->rate_limit = vifc->vifc_rate_limit; + v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; - v->flags = vifc->vifc_flags; - if (!mrtsock) - v->flags |= VIFF_STATIC; - v->threshold = vifc->vifc_threshold; - v->bytes_in = 0; - v->bytes_out = 0; - v->pkt_in = 0; - v->pkt_out = 0; - v->link = dev->ifindex; - if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER)) - v->link = dev_get_iflink(dev); /* And finish update writing critical data */ write_lock_bh(&mrt_lock); @@ -2316,7 +2309,8 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, } if (VIF_EXISTS(mrt, c->mfc_parent) && - nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) + nla_put_u32(skb, RTA_IIF, + mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) return -EMSGSIZE; if (c->mfc_flags & MFC_OFFLOAD) @@ -2327,6 +2321,8 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { + struct vif_device *vif; + if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) { nla_nest_cancel(skb, mp_attr); return -EMSGSIZE; @@ -2334,7 +2330,8 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; - nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; + vif = &mrt->vif_table[ct]; + nhp->rtnh_ifindex = vif->dev->ifindex; nhp->rtnh_len = sizeof(*nhp); } } @@ -2954,8 +2951,8 @@ struct ipmr_vif_iter { }; static struct vif_device *ipmr_vif_seq_idx(struct net *net, - struct ipmr_vif_iter *iter, - loff_t pos) + struct ipmr_vif_iter *iter, + loff_t pos) { struct mr_table *mrt = iter->mrt; @@ -3020,7 +3017,8 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); } else { const struct vif_device *vif = v; - const char *name = vif->dev ? vif->dev->name : "none"; + const char *name = vif->dev ? + vif->dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c new file mode 100644 index 000000000000..22758f848344 --- /dev/null +++ b/net/ipv4/ipmr_base.c @@ -0,0 +1,28 @@ +/* Linux multicast routing support + * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation + */ + +#include + +/* Sets everything common except 'dev', since that is done under locking */ +void vif_device_init(struct vif_device *v, + struct net_device *dev, + unsigned long rate_limit, + unsigned char threshold, + unsigned short flags, + unsigned short get_iflink_mask) +{ + v->dev = NULL; + v->bytes_in = 0; + v->bytes_out = 0; + v->pkt_in = 0; + v->pkt_out = 0; + v->rate_limit = rate_limit; + v->flags = flags; + v->threshold = threshold; + if (v->flags & get_iflink_mask) + v->link = dev_get_iflink(dev); + else + v->link = dev->ifindex; +} +EXPORT_SYMBOL(vif_device_init); diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index ea71e4b0ab7a..6794ddf0547c 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -278,6 +278,7 @@ config IPV6_SUBTREES config IPV6_MROUTE bool "IPv6: multicast routing" depends on IPV6 + select IP_MROUTE_COMMON ---help--- Experimental support for IPv6 multicast forwarding. If unsure, say N. diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 295eb5ecaee5..e397990f6eb8 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -62,7 +62,7 @@ struct mr6_table { struct timer_list ipmr_expire_timer; struct list_head mfc6_unres_queue; struct list_head mfc6_cache_array[MFC6_LINES]; - struct mif_device vif6_table[MAXMIFS]; + struct vif_device vif6_table[MAXMIFS]; int maxvif; atomic_t cache_resolve_queue_len; bool mroute_do_assert; @@ -384,7 +384,7 @@ struct ipmr_vif_iter { int ct; }; -static struct mif_device *ip6mr_vif_seq_idx(struct net *net, +static struct vif_device *ip6mr_vif_seq_idx(struct net *net, struct ipmr_vif_iter *iter, loff_t pos) { @@ -450,7 +450,7 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "Interface BytesIn PktsIn BytesOut PktsOut Flags\n"); } else { - const struct mif_device *vif = v; + const struct vif_device *vif = v; const char *name = vif->dev ? vif->dev->name : "none"; seq_printf(seq, @@ -776,7 +776,7 @@ failure: static int mif6_delete(struct mr6_table *mrt, int vifi, int notify, struct list_head *head) { - struct mif_device *v; + struct vif_device *v; struct net_device *dev; struct inet6_dev *in6_dev; @@ -929,7 +929,7 @@ static int mif6_add(struct net *net, struct mr6_table *mrt, struct mif6ctl *vifc, int mrtsock) { int vifi = vifc->mif6c_mifi; - struct mif_device *v = &mrt->vif6_table[vifi]; + struct vif_device *v = &mrt->vif6_table[vifi]; struct net_device *dev; struct inet6_dev *in6_dev; int err; @@ -980,21 +980,10 @@ static int mif6_add(struct net *net, struct mr6_table *mrt, dev->ifindex, &in6_dev->cnf); } - /* - * Fill in the VIF structures - */ - v->rate_limit = vifc->vifc_rate_limit; - v->flags = vifc->mif6c_flags; - if (!mrtsock) - v->flags |= VIFF_STATIC; - v->threshold = vifc->vifc_threshold; - v->bytes_in = 0; - v->bytes_out = 0; - v->pkt_in = 0; - v->pkt_out = 0; - v->link = dev->ifindex; - if (v->flags & MIFF_REGISTER) - v->link = dev_get_iflink(dev); + /* Fill in the VIF structures */ + vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold, + vifc->mif6c_flags | (!mrtsock ? VIFF_STATIC : 0), + MIFF_REGISTER); /* And finish update writing critical data */ write_lock_bh(&mrt_lock); @@ -1332,7 +1321,7 @@ static int ip6mr_device_event(struct notifier_block *this, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr6_table *mrt; - struct mif_device *v; + struct vif_device *v; int ct; if (event != NETDEV_UNREGISTER) @@ -1873,7 +1862,7 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) { struct sioc_sg_req6 sr; struct sioc_mif_req6 vr; - struct mif_device *vif; + struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr6_table *mrt; @@ -1947,7 +1936,7 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { struct compat_sioc_sg_req6 sr; struct compat_sioc_mif_req6 vr; - struct mif_device *vif; + struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr6_table *mrt; @@ -2018,7 +2007,7 @@ static int ip6mr_forward2(struct net *net, struct mr6_table *mrt, struct sk_buff *skb, struct mfc6_cache *c, int vifi) { struct ipv6hdr *ipv6h; - struct mif_device *vif = &mrt->vif6_table[vifi]; + struct vif_device *vif = &mrt->vif6_table[vifi]; struct net_device *dev; struct dst_entry *dst; struct flowi6 fl6; -- cgit v1.2.3-70-g09d2 From 8571ab479a6e1ef46ead5ebee567e128a422767c Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:30 +0200 Subject: ip6mr: Make mroute_sk rcu-based In ipmr the mr_table socket is handled under RCU. Introduce the same for ip6mr. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute6.h | 6 +++--- net/ipv6/ip6_output.c | 2 +- net/ipv6/ip6mr.c | 45 +++++++++++++++++++++++++++------------------ 3 files changed, 31 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index e5e5b8282551..e1b9fb06e1ea 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -111,12 +111,12 @@ extern int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid); #ifdef CONFIG_IPV6_MROUTE -extern struct sock *mroute6_socket(struct net *net, struct sk_buff *skb); +bool mroute6_is_socket(struct net *net, struct sk_buff *skb); extern int ip6mr_sk_done(struct sock *sk); #else -static inline struct sock *mroute6_socket(struct net *net, struct sk_buff *skb) +static inline bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { - return NULL; + return false; } static inline int ip6mr_sk_done(struct sock *sk) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 997c7f19ad62..a6eb0e699b15 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -71,7 +71,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && - ((mroute6_socket(net, skb) && + ((mroute6_is_socket(net, skb) && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, &ipv6_hdr(skb)->saddr))) { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e397990f6eb8..a0e297ddca6e 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -58,7 +58,7 @@ struct mr6_table { struct list_head list; possible_net_t net; u32 id; - struct sock *mroute6_sk; + struct sock __rcu *mroute6_sk; struct timer_list ipmr_expire_timer; struct list_head mfc6_unres_queue; struct list_head mfc6_cache_array[MFC6_LINES]; @@ -1121,6 +1121,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt, static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert) { + struct sock *mroute6_sk; struct sk_buff *skb; struct mrt6msg *msg; int ret; @@ -1190,17 +1191,19 @@ static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, skb->ip_summed = CHECKSUM_UNNECESSARY; } - if (!mrt->mroute6_sk) { + rcu_read_lock(); + mroute6_sk = rcu_dereference(mrt->mroute6_sk); + if (!mroute6_sk) { + rcu_read_unlock(); kfree_skb(skb); return -EINVAL; } mrt6msg_netlink_event(mrt, skb); - /* - * Deliver to user space multicast routing algorithms - */ - ret = sock_queue_rcv_skb(mrt->mroute6_sk, skb); + /* Deliver to user space multicast routing algorithms */ + ret = sock_queue_rcv_skb(mroute6_sk, skb); + rcu_read_unlock(); if (ret < 0) { net_warn_ratelimited("mroute6: pending queue full, dropping entries\n"); kfree_skb(skb); @@ -1584,11 +1587,11 @@ static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk) rtnl_lock(); write_lock_bh(&mrt_lock); - if (likely(mrt->mroute6_sk == NULL)) { - mrt->mroute6_sk = sk; - net->ipv6.devconf_all->mc_forwarding++; - } else { + if (rtnl_dereference(mrt->mroute6_sk)) { err = -EADDRINUSE; + } else { + rcu_assign_pointer(mrt->mroute6_sk, sk); + net->ipv6.devconf_all->mc_forwarding++; } write_unlock_bh(&mrt_lock); @@ -1614,9 +1617,9 @@ int ip6mr_sk_done(struct sock *sk) rtnl_lock(); ip6mr_for_each_table(mrt, net) { - if (sk == mrt->mroute6_sk) { + if (sk == rtnl_dereference(mrt->mroute6_sk)) { write_lock_bh(&mrt_lock); - mrt->mroute6_sk = NULL; + RCU_INIT_POINTER(mrt->mroute6_sk, NULL); net->ipv6.devconf_all->mc_forwarding--; write_unlock_bh(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, @@ -1630,11 +1633,12 @@ int ip6mr_sk_done(struct sock *sk) } } rtnl_unlock(); + synchronize_rcu(); return err; } -struct sock *mroute6_socket(struct net *net, struct sk_buff *skb) +bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { struct mr6_table *mrt; struct flowi6 fl6 = { @@ -1646,8 +1650,9 @@ struct sock *mroute6_socket(struct net *net, struct sk_buff *skb) if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) return NULL; - return mrt->mroute6_sk; + return rcu_access_pointer(mrt->mroute6_sk); } +EXPORT_SYMBOL(mroute6_is_socket); /* * Socket options and virtual interface manipulation. The whole @@ -1674,7 +1679,8 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns return -ENOENT; if (optname != MRT6_INIT) { - if (sk != mrt->mroute6_sk && !ns_capable(net->user_ns, CAP_NET_ADMIN)) + if (sk != rcu_access_pointer(mrt->mroute6_sk) && + !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EACCES; } @@ -1696,7 +1702,8 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns if (vif.mif6c_mifi >= MAXMIFS) return -ENFILE; rtnl_lock(); - ret = mif6_add(net, mrt, &vif, sk == mrt->mroute6_sk); + ret = mif6_add(net, mrt, &vif, + sk == rtnl_dereference(mrt->mroute6_sk)); rtnl_unlock(); return ret; @@ -1731,7 +1738,9 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns ret = ip6mr_mfc_delete(mrt, &mfc, parent); else ret = ip6mr_mfc_add(net, mrt, &mfc, - sk == mrt->mroute6_sk, parent); + sk == + rtnl_dereference(mrt->mroute6_sk), + parent); rtnl_unlock(); return ret; @@ -1783,7 +1792,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (v != RT_TABLE_DEFAULT && v >= 100000000) return -EINVAL; - if (sk == mrt->mroute6_sk) + if (sk == rcu_access_pointer(mrt->mroute6_sk)) return -EBUSY; rtnl_lock(); -- cgit v1.2.3-70-g09d2 From 87c418bf1323d57140f4b448715f64de3fbb7e91 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:31 +0200 Subject: ip6mr: Align hash implementation to ipmr Since commit 8fb472c09b9d ("ipmr: improve hash scalability") ipmr has been using rhashtable as a basis for its mfc routes, but ip6mr is currently still using the old private MFC hash implementation. Align ip6mr to the current ipmr implementation. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute6.h | 30 ++--- net/ipv6/ip6mr.c | 313 ++++++++++++++++++++++++++---------------------- 2 files changed, 184 insertions(+), 159 deletions(-) (limited to 'net') diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index e1b9fb06e1ea..e2dac199861e 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef CONFIG_IPV6_MROUTE static inline int ip6_mroute_opt(int opt) @@ -65,10 +66,20 @@ static inline void ip6_mr_cleanup(void) #define VIFF_STATIC 0x8000 +struct mfc6_cache_cmp_arg { + struct in6_addr mf6c_mcastgrp; + struct in6_addr mf6c_origin; +}; + struct mfc6_cache { - struct list_head list; - struct in6_addr mf6c_mcastgrp; /* Group the entry belongs to */ - struct in6_addr mf6c_origin; /* Source of packet */ + struct rhlist_head mnode; + union { + struct { + struct in6_addr mf6c_mcastgrp; + struct in6_addr mf6c_origin; + }; + struct mfc6_cache_cmp_arg cmparg; + }; mifi_t mf6c_parent; /* Source interface */ int mfc_flags; /* Flags on line */ @@ -88,22 +99,13 @@ struct mfc6_cache { unsigned char ttls[MAXMIFS]; /* TTL thresholds */ } res; } mfc_un; + struct list_head list; + struct rcu_head rcu; }; #define MFC_STATIC 1 #define MFC_NOTIFY 2 -#define MFC6_LINES 64 - -#define MFC6_HASH(a, g) (((__force u32)(a)->s6_addr32[0] ^ \ - (__force u32)(a)->s6_addr32[1] ^ \ - (__force u32)(a)->s6_addr32[2] ^ \ - (__force u32)(a)->s6_addr32[3] ^ \ - (__force u32)(g)->s6_addr32[0] ^ \ - (__force u32)(g)->s6_addr32[1] ^ \ - (__force u32)(g)->s6_addr32[2] ^ \ - (__force u32)(g)->s6_addr32[3]) % MFC6_LINES) - #define MFC_ASSERT_THRESH (3*HZ) /* Maximal freq. of asserts */ struct rtmsg; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index a0e297ddca6e..6f0b7f4894b2 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -61,8 +61,9 @@ struct mr6_table { struct sock __rcu *mroute6_sk; struct timer_list ipmr_expire_timer; struct list_head mfc6_unres_queue; - struct list_head mfc6_cache_array[MFC6_LINES]; struct vif_device vif6_table[MAXMIFS]; + struct rhltable mfc6_hash; + struct list_head mfc6_cache_list; int maxvif; atomic_t cache_resolve_queue_len; bool mroute_do_assert; @@ -299,10 +300,29 @@ static void __net_exit ip6mr_rules_exit(struct net *net) } #endif +static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct mfc6_cache_cmp_arg *cmparg = arg->key; + struct mfc6_cache *c = (struct mfc6_cache *)ptr; + + return !ipv6_addr_equal(&c->mf6c_mcastgrp, &cmparg->mf6c_mcastgrp) || + !ipv6_addr_equal(&c->mf6c_origin, &cmparg->mf6c_origin); +} + +static const struct rhashtable_params ip6mr_rht_params = { + .head_offset = offsetof(struct mfc6_cache, mnode), + .key_offset = offsetof(struct mfc6_cache, cmparg), + .key_len = sizeof(struct mfc6_cache_cmp_arg), + .nelem_hint = 3, + .locks_mul = 1, + .obj_cmpfn = ip6mr_hash_cmp, + .automatic_shrinking = true, +}; + static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) { struct mr6_table *mrt; - unsigned int i; mrt = ip6mr_get_table(net, id); if (mrt) @@ -314,10 +334,8 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) mrt->id = id; write_pnet(&mrt->net, net); - /* Forwarding cache */ - for (i = 0; i < MFC6_LINES; i++) - INIT_LIST_HEAD(&mrt->mfc6_cache_array[i]); - + rhltable_init(&mrt->mfc6_hash, &ip6mr_rht_params); + INIT_LIST_HEAD(&mrt->mfc6_cache_list); INIT_LIST_HEAD(&mrt->mfc6_unres_queue); timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0); @@ -335,6 +353,7 @@ static void ip6mr_free_table(struct mr6_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, true); + rhltable_destroy(&mrt->mfc6_hash); kfree(mrt); } @@ -344,7 +363,6 @@ struct ipmr_mfc_iter { struct seq_net_private p; struct mr6_table *mrt; struct list_head *cache; - int ct; }; @@ -354,14 +372,12 @@ static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net, struct mr6_table *mrt = it->mrt; struct mfc6_cache *mfc; - read_lock(&mrt_lock); - for (it->ct = 0; it->ct < MFC6_LINES; it->ct++) { - it->cache = &mrt->mfc6_cache_array[it->ct]; - list_for_each_entry(mfc, it->cache, list) - if (pos-- == 0) - return mfc; - } - read_unlock(&mrt_lock); + rcu_read_lock(); + it->cache = &mrt->mfc6_cache_list; + list_for_each_entry_rcu(mfc, &mrt->mfc6_cache_list, list) + if (pos-- == 0) + return mfc; + rcu_read_unlock(); spin_lock_bh(&mfc_unres_lock); it->cache = &mrt->mfc6_unres_queue; @@ -517,19 +533,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (it->cache == &mrt->mfc6_unres_queue) goto end_of_list; - BUG_ON(it->cache != &mrt->mfc6_cache_array[it->ct]); - - while (++it->ct < MFC6_LINES) { - it->cache = &mrt->mfc6_cache_array[it->ct]; - if (list_empty(it->cache)) - continue; - return list_first_entry(it->cache, struct mfc6_cache, list); - } - /* exhausted cache_array, show unresolved */ - read_unlock(&mrt_lock); + rcu_read_unlock(); it->cache = &mrt->mfc6_unres_queue; - it->ct = 0; spin_lock_bh(&mfc_unres_lock); if (!list_empty(it->cache)) @@ -549,8 +555,8 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) if (it->cache == &mrt->mfc6_unres_queue) spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == &mrt->mfc6_cache_array[it->ct]) - read_unlock(&mrt_lock); + else if (it->cache == &mrt->mfc6_cache_list) + rcu_read_unlock(); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) @@ -827,11 +833,18 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, int notify, return 0; } -static inline void ip6mr_cache_free(struct mfc6_cache *c) +static inline void ip6mr_cache_free_rcu(struct rcu_head *head) { + struct mfc6_cache *c = container_of(head, struct mfc6_cache, rcu); + kmem_cache_free(mrt_cachep, c); } +static inline void ip6mr_cache_free(struct mfc6_cache *c) +{ + call_rcu(&c->rcu, ip6mr_cache_free_rcu); +} + /* Destroy an unresolved cache entry, killing queued skbs and reporting error to netlink readers. */ @@ -1002,14 +1015,17 @@ static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp) { - int line = MFC6_HASH(mcastgrp, origin); + struct mfc6_cache_cmp_arg arg = { + .mf6c_origin = *origin, + .mf6c_mcastgrp = *mcastgrp, + }; + struct rhlist_head *tmp, *list; struct mfc6_cache *c; - list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) { - if (ipv6_addr_equal(&c->mf6c_origin, origin) && - ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp)) - return c; - } + list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + return c; + return NULL; } @@ -1017,13 +1033,16 @@ static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt, static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr6_table *mrt, mifi_t mifi) { - int line = MFC6_HASH(&in6addr_any, &in6addr_any); + struct mfc6_cache_cmp_arg arg = { + .mf6c_origin = in6addr_any, + .mf6c_mcastgrp = in6addr_any, + }; + struct rhlist_head *tmp, *list; struct mfc6_cache *c; - list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) - if (ipv6_addr_any(&c->mf6c_origin) && - ipv6_addr_any(&c->mf6c_mcastgrp) && - (c->mfc_un.res.ttls[mifi] < 255)) + list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + if (c->mfc_un.res.ttls[mifi] < 255) return c; return NULL; @@ -1034,29 +1053,53 @@ static struct mfc6_cache *ip6mr_cache_find_any(struct mr6_table *mrt, struct in6_addr *mcastgrp, mifi_t mifi) { - int line = MFC6_HASH(mcastgrp, &in6addr_any); + struct mfc6_cache_cmp_arg arg = { + .mf6c_origin = in6addr_any, + .mf6c_mcastgrp = *mcastgrp, + }; + struct rhlist_head *tmp, *list; struct mfc6_cache *c, *proxy; if (ipv6_addr_any(mcastgrp)) goto skip; - list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) - if (ipv6_addr_any(&c->mf6c_origin) && - ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp)) { - if (c->mfc_un.res.ttls[mifi] < 255) - return c; - - /* It's ok if the mifi is part of the static tree */ - proxy = ip6mr_cache_find_any_parent(mrt, - c->mf6c_parent); - if (proxy && proxy->mfc_un.res.ttls[mifi] < 255) - return c; - } + list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) { + if (c->mfc_un.res.ttls[mifi] < 255) + return c; + + /* It's ok if the mifi is part of the static tree */ + proxy = ip6mr_cache_find_any_parent(mrt, c->mf6c_parent); + if (proxy && proxy->mfc_un.res.ttls[mifi] < 255) + return c; + } skip: return ip6mr_cache_find_any_parent(mrt, mifi); } +/* Look for a (S,G,iif) entry if parent != -1 */ +static struct mfc6_cache * +ip6mr_cache_find_parent(struct mr6_table *mrt, + const struct in6_addr *origin, + const struct in6_addr *mcastgrp, + int parent) +{ + struct mfc6_cache_cmp_arg arg = { + .mf6c_origin = *origin, + .mf6c_mcastgrp = *mcastgrp, + }; + struct rhlist_head *tmp, *list; + struct mfc6_cache *c; + + list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + if (parent == -1 || parent == c->mf6c_parent) + return c; + + return NULL; +} + /* * Allocate a multicast cache entry */ @@ -1296,26 +1339,21 @@ ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb) static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc, int parent) { - int line; - struct mfc6_cache *c, *next; - - line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr); + struct mfc6_cache *c; - list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[line], list) { - if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) && - ipv6_addr_equal(&c->mf6c_mcastgrp, - &mfc->mf6cc_mcastgrp.sin6_addr) && - (parent == -1 || parent == c->mf6c_parent)) { - write_lock_bh(&mrt_lock); - list_del(&c->list); - write_unlock_bh(&mrt_lock); + /* The entries are added/deleted only under RTNL */ + rcu_read_lock(); + c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, + &mfc->mf6cc_mcastgrp.sin6_addr, parent); + rcu_read_unlock(); + if (!c) + return -ENOENT; + rhltable_remove(&mrt->mfc6_hash, &c->mnode, ip6mr_rht_params); + list_del_rcu(&c->list); - mr6_netlink_event(mrt, c, RTM_DELROUTE); - ip6mr_cache_free(c); - return 0; - } - } - return -ENOENT; + mr6_netlink_event(mrt, c, RTM_DELROUTE); + ip6mr_cache_free(c); + return 0; } static int ip6mr_device_event(struct notifier_block *this, @@ -1448,11 +1486,10 @@ void ip6_mr_cleanup(void) static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, struct mf6cctl *mfc, int mrtsock, int parent) { - bool found = false; - int line; - struct mfc6_cache *uc, *c; unsigned char ttls[MAXMIFS]; - int i; + struct mfc6_cache *uc, *c; + bool found; + int i, err; if (mfc->mf6cc_parent >= MAXMIFS) return -ENFILE; @@ -1461,22 +1498,14 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, for (i = 0; i < MAXMIFS; i++) { if (IF_ISSET(i, &mfc->mf6cc_ifset)) ttls[i] = 1; - - } - - line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr); - - list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) { - if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) && - ipv6_addr_equal(&c->mf6c_mcastgrp, - &mfc->mf6cc_mcastgrp.sin6_addr) && - (parent == -1 || parent == mfc->mf6cc_parent)) { - found = true; - break; - } } - if (found) { + /* The entries are added/deleted only under RTNL */ + rcu_read_lock(); + c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, + &mfc->mf6cc_mcastgrp.sin6_addr, parent); + rcu_read_unlock(); + if (c) { write_lock_bh(&mrt_lock); c->mf6c_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, c, ttls); @@ -1502,13 +1531,17 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, if (!mrtsock) c->mfc_flags |= MFC_STATIC; - write_lock_bh(&mrt_lock); - list_add(&c->list, &mrt->mfc6_cache_array[line]); - write_unlock_bh(&mrt_lock); + err = rhltable_insert_key(&mrt->mfc6_hash, &c->cmparg, &c->mnode, + ip6mr_rht_params); + if (err) { + pr_err("ip6mr: rhtable insert error %d\n", err); + ip6mr_cache_free(c); + return err; + } + list_add_tail_rcu(&c->list, &mrt->mfc6_cache_list); - /* - * Check to see if we resolved a queued list. If so we - * need to send on the frames and tidy up. + /* Check to see if we resolved a queued list. If so we + * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); @@ -1539,13 +1572,11 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, static void mroute_clean_tables(struct mr6_table *mrt, bool all) { - int i; + struct mfc6_cache *c, *tmp; LIST_HEAD(list); - struct mfc6_cache *c, *next; + int i; - /* - * Shut down all active vif entries - */ + /* Shut down all active vif entries */ for (i = 0; i < mrt->maxvif; i++) { if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC)) continue; @@ -1553,25 +1584,19 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all) } unregister_netdevice_many(&list); - /* - * Wipe the cache - */ - for (i = 0; i < MFC6_LINES; i++) { - list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[i], list) { - if (!all && (c->mfc_flags & MFC_STATIC)) - continue; - write_lock_bh(&mrt_lock); - list_del(&c->list); - write_unlock_bh(&mrt_lock); - - mr6_netlink_event(mrt, c, RTM_DELROUTE); - ip6mr_cache_free(c); - } + /* Wipe the cache */ + list_for_each_entry_safe(c, tmp, &mrt->mfc6_cache_list, list) { + if (!all && (c->mfc_flags & MFC_STATIC)) + continue; + rhltable_remove(&mrt->mfc6_hash, &c->mnode, ip6mr_rht_params); + list_del_rcu(&c->list); + mr6_netlink_event(mrt, c, RTM_DELROUTE); + ip6mr_cache_free(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); - list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) { + list_for_each_entry_safe(c, tmp, &mrt->mfc6_unres_queue, list) { list_del(&c->list); mr6_netlink_event(mrt, c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, c); @@ -1905,19 +1930,19 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; - read_lock(&mrt_lock); + rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { sr.pktcnt = c->mfc_un.res.pkt; sr.bytecnt = c->mfc_un.res.bytes; sr.wrong_if = c->mfc_un.res.wrong_if; - read_unlock(&mrt_lock); + rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; @@ -1979,19 +2004,19 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; - read_lock(&mrt_lock); + rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { sr.pktcnt = c->mfc_un.res.pkt; sr.bytecnt = c->mfc_un.res.bytes; sr.wrong_if = c->mfc_un.res.wrong_if; - read_unlock(&mrt_lock); + rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; @@ -2115,10 +2140,14 @@ static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ + rcu_read_lock(); cache_proxy = ip6mr_cache_find_any_parent(mrt, vif); if (cache_proxy && - cache_proxy->mfc_un.res.ttls[true_vifi] < 255) + cache_proxy->mfc_un.res.ttls[true_vifi] < 255) { + rcu_read_unlock(); goto forward; + } + rcu_read_unlock(); } /* @@ -2535,34 +2564,30 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) struct mr6_table *mrt; struct mfc6_cache *mfc; unsigned int t = 0, s_t; - unsigned int h = 0, s_h; unsigned int e = 0, s_e; s_t = cb->args[0]; - s_h = cb->args[1]; - s_e = cb->args[2]; + s_e = cb->args[1]; - read_lock(&mrt_lock); + rcu_read_lock(); ip6mr_for_each_table(mrt, net) { if (t < s_t) goto next_table; - if (t > s_t) - s_h = 0; - for (h = s_h; h < MFC6_LINES; h++) { - list_for_each_entry(mfc, &mrt->mfc6_cache_array[h], list) { - if (e < s_e) - goto next_entry; - if (ip6mr_fill_mroute(mrt, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE, - NLM_F_MULTI) < 0) - goto done; + list_for_each_entry_rcu(mfc, &mrt->mfc6_cache_list, list) { + if (e < s_e) + goto next_entry; + if (ip6mr_fill_mroute(mrt, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + mfc, RTM_NEWROUTE, + NLM_F_MULTI) < 0) + goto done; next_entry: - e++; - } - e = s_e = 0; + e++; } + e = 0; + s_e = 0; + spin_lock_bh(&mfc_unres_lock); list_for_each_entry(mfc, &mrt->mfc6_unres_queue, list) { if (e < s_e) @@ -2580,15 +2605,13 @@ next_entry2: } spin_unlock_bh(&mfc_unres_lock); e = s_e = 0; - s_h = 0; next_table: t++; } done: - read_unlock(&mrt_lock); + rcu_read_unlock(); - cb->args[2] = e; - cb->args[1] = h; + cb->args[1] = e; cb->args[0] = t; return skb->len; -- cgit v1.2.3-70-g09d2 From b70432f7319eb75b24ca57dde8146c5e27244780 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:32 +0200 Subject: mroute*: Make mr_table a common struct Following previous changes to ip6mr, mr_table and mr6_table are basically the same [up to mr6_table having additional '6' suffixes to its variable names]. Move the common structure definition into a common header; This requires renaming all references in ip6mr to variables that had the distinct suffix. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 21 ---- include/linux/mroute6.h | 1 - include/linux/mroute_base.h | 46 +++++++ include/net/netns/ipv6.h | 2 +- net/ipv4/ipmr.c | 2 - net/ipv6/ip6mr.c | 301 ++++++++++++++++++++------------------------ 6 files changed, 186 insertions(+), 187 deletions(-) (limited to 'net') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index b8aadffe6237..8688c5d03a24 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -4,8 +4,6 @@ #include #include -#include -#include #include #include #include @@ -67,25 +65,6 @@ struct vif_entry_notifier_info { #define VIFF_STATIC 0x8000 -#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) - -struct mr_table { - struct list_head list; - possible_net_t net; - u32 id; - struct sock __rcu *mroute_sk; - struct timer_list ipmr_expire_timer; - struct list_head mfc_unres_queue; - struct vif_device vif_table[MAXVIFS]; - struct rhltable mfc_hash; - struct list_head mfc_cache_list; - int maxvif; - atomic_t cache_resolve_queue_len; - bool mroute_do_assert; - bool mroute_do_pim; - int mroute_reg_vif_num; -}; - /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) * MFC_OFFLOAD - the entry was offloaded to the hardware diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index e2dac199861e..d5c8dc155a42 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -8,7 +8,6 @@ #include #include #include -#include #ifdef CONFIG_IPV6_MROUTE static inline int ip6_mroute_opt(int opt) diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 0de651e15f27..1cc944a14df5 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -2,6 +2,9 @@ #define __LINUX_MROUTE_BASE_H #include +#include +#include +#include /** * struct vif_device - interface representor for multicast routing @@ -32,6 +35,49 @@ struct vif_device { __be32 local, remote; }; +#ifndef MAXVIFS +/* This one is nasty; value is defined in uapi using different symbols for + * mroute and morute6 but both map into same 32. + */ +#define MAXVIFS 32 +#endif + +#define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev)) + +/** + * struct mr_table - a multicast routing table + * @list: entry within a list of multicast routing tables + * @net: net where this table belongs + * @id: identifier of the table + * @mroute_sk: socket associated with the table + * @ipmr_expire_timer: timer for handling unresolved routes + * @mfc_unres_queue: list of unresolved MFC entries + * @vif_table: array containing all possible vifs + * @mfc_hash: Hash table of all resolved routes for easy lookup + * @mfc_cache_list: list of resovled routes for possible traversal + * @maxvif: Identifier of highest value vif currently in use + * @cache_resolve_queue_len: current size of unresolved queue + * @mroute_do_assert: Whether to inform userspace on wrong ingress + * @mroute_do_pim: Whether to receive IGMP PIMv1 + * @mroute_reg_vif_num: PIM-device vif index + */ +struct mr_table { + struct list_head list; + possible_net_t net; + u32 id; + struct sock __rcu *mroute_sk; + struct timer_list ipmr_expire_timer; + struct list_head mfc_unres_queue; + struct vif_device vif_table[MAXVIFS]; + struct rhltable mfc_hash; + struct list_head mfc_cache_list; + int maxvif; + atomic_t cache_resolve_queue_len; + bool mroute_do_assert; + bool mroute_do_pim; + int mroute_reg_vif_num; +}; + #ifdef CONFIG_IP_MROUTE_COMMON void vif_device_init(struct vif_device *v, struct net_device *dev, diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 2b9194229a56..e286fda09fcf 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -85,7 +85,7 @@ struct netns_ipv6 { struct sock *mc_autojoin_sk; #ifdef CONFIG_IPV6_MROUTE #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES - struct mr6_table *mrt6; + struct mr_table *mrt6; #else struct list_head mr6_tables; struct fib_rules_ops *mr6_rules_ops; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 1370edad64bf..a1bf0020cad1 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -53,7 +52,6 @@ #include #include #include -#include #include #include #include diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 6f0b7f4894b2..adbb826ca8fd 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -36,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -54,31 +52,12 @@ #include #include -struct mr6_table { - struct list_head list; - possible_net_t net; - u32 id; - struct sock __rcu *mroute6_sk; - struct timer_list ipmr_expire_timer; - struct list_head mfc6_unres_queue; - struct vif_device vif6_table[MAXMIFS]; - struct rhltable mfc6_hash; - struct list_head mfc6_cache_list; - int maxvif; - atomic_t cache_resolve_queue_len; - bool mroute_do_assert; - bool mroute_do_pim; -#ifdef CONFIG_IPV6_PIMSM_V2 - int mroute_reg_vif_num; -#endif -}; - struct ip6mr_rule { struct fib_rule common; }; struct ip6mr_result { - struct mr6_table *mrt; + struct mr_table *mrt; }; /* Big lock, protecting vif table, mrt cache and mroute socket state. @@ -87,11 +66,7 @@ struct ip6mr_result { static DEFINE_RWLOCK(mrt_lock); -/* - * Multicast router control variables - */ - -#define MIF_EXISTS(_mrt, _idx) ((_mrt)->vif6_table[_idx].dev != NULL) +/* Multicast router control variables */ /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); @@ -106,30 +81,30 @@ static DEFINE_SPINLOCK(mfc_unres_lock); static struct kmem_cache *mrt_cachep __read_mostly; -static struct mr6_table *ip6mr_new_table(struct net *net, u32 id); -static void ip6mr_free_table(struct mr6_table *mrt); +static struct mr_table *ip6mr_new_table(struct net *net, u32 id); +static void ip6mr_free_table(struct mr_table *mrt); -static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, +static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc6_cache *cache); -static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, +static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); -static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, +static int __ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm); -static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc, +static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd); -static void mrt6msg_netlink_event(struct mr6_table *mrt, struct sk_buff *pkt); +static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); -static void mroute_clean_tables(struct mr6_table *mrt, bool all); +static void mroute_clean_tables(struct mr_table *mrt, bool all); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES #define ip6mr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list) -static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) +static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { - struct mr6_table *mrt; + struct mr_table *mrt; ip6mr_for_each_table(mrt, net) { if (mrt->id == id) @@ -139,7 +114,7 @@ static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) } static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, - struct mr6_table **mrt) + struct mr_table **mrt) { int err; struct ip6mr_result res; @@ -160,7 +135,7 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct ip6mr_result *res = arg->result; - struct mr6_table *mrt; + struct mr_table *mrt; switch (rule->action) { case FR_ACT_TO_TBL: @@ -228,7 +203,7 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { static int __net_init ip6mr_rules_init(struct net *net) { struct fib_rules_ops *ops; - struct mr6_table *mrt; + struct mr_table *mrt; int err; ops = fib_rules_register(&ip6mr_rules_ops_template, net); @@ -259,7 +234,7 @@ err1: static void __net_exit ip6mr_rules_exit(struct net *net) { - struct mr6_table *mrt, *next; + struct mr_table *mrt, *next; rtnl_lock(); list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { @@ -273,13 +248,13 @@ static void __net_exit ip6mr_rules_exit(struct net *net) #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) -static struct mr6_table *ip6mr_get_table(struct net *net, u32 id) +static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { return net->ipv6.mrt6; } static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, - struct mr6_table **mrt) + struct mr_table **mrt) { *mrt = net->ipv6.mrt6; return 0; @@ -320,9 +295,9 @@ static const struct rhashtable_params ip6mr_rht_params = { .automatic_shrinking = true, }; -static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) +static struct mr_table *ip6mr_new_table(struct net *net, u32 id) { - struct mr6_table *mrt; + struct mr_table *mrt; mrt = ip6mr_get_table(net, id); if (mrt) @@ -334,9 +309,9 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) mrt->id = id; write_pnet(&mrt->net, net); - rhltable_init(&mrt->mfc6_hash, &ip6mr_rht_params); - INIT_LIST_HEAD(&mrt->mfc6_cache_list); - INIT_LIST_HEAD(&mrt->mfc6_unres_queue); + rhltable_init(&mrt->mfc_hash, &ip6mr_rht_params); + INIT_LIST_HEAD(&mrt->mfc_cache_list); + INIT_LIST_HEAD(&mrt->mfc_unres_queue); timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0); @@ -349,11 +324,11 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id) return mrt; } -static void ip6mr_free_table(struct mr6_table *mrt) +static void ip6mr_free_table(struct mr_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, true); - rhltable_destroy(&mrt->mfc6_hash); + rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } @@ -361,7 +336,7 @@ static void ip6mr_free_table(struct mr6_table *mrt) struct ipmr_mfc_iter { struct seq_net_private p; - struct mr6_table *mrt; + struct mr_table *mrt; struct list_head *cache; }; @@ -369,18 +344,18 @@ struct ipmr_mfc_iter { static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net, struct ipmr_mfc_iter *it, loff_t pos) { - struct mr6_table *mrt = it->mrt; + struct mr_table *mrt = it->mrt; struct mfc6_cache *mfc; rcu_read_lock(); - it->cache = &mrt->mfc6_cache_list; - list_for_each_entry_rcu(mfc, &mrt->mfc6_cache_list, list) + it->cache = &mrt->mfc_cache_list; + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) if (pos-- == 0) return mfc; rcu_read_unlock(); spin_lock_bh(&mfc_unres_lock); - it->cache = &mrt->mfc6_unres_queue; + it->cache = &mrt->mfc_unres_queue; list_for_each_entry(mfc, it->cache, list) if (pos-- == 0) return mfc; @@ -396,7 +371,7 @@ static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net, struct ipmr_vif_iter { struct seq_net_private p; - struct mr6_table *mrt; + struct mr_table *mrt; int ct; }; @@ -404,13 +379,13 @@ static struct vif_device *ip6mr_vif_seq_idx(struct net *net, struct ipmr_vif_iter *iter, loff_t pos) { - struct mr6_table *mrt = iter->mrt; + struct mr_table *mrt = iter->mrt; for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { - if (!MIF_EXISTS(mrt, iter->ct)) + if (!VIF_EXISTS(mrt, iter->ct)) continue; if (pos-- == 0) - return &mrt->vif6_table[iter->ct]; + return &mrt->vif_table[iter->ct]; } return NULL; } @@ -420,7 +395,7 @@ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) { struct ipmr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); - struct mr6_table *mrt; + struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) @@ -437,16 +412,16 @@ static void *ip6mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ipmr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); - struct mr6_table *mrt = iter->mrt; + struct mr_table *mrt = iter->mrt; ++*pos; if (v == SEQ_START_TOKEN) return ip6mr_vif_seq_idx(net, iter, 0); while (++iter->ct < mrt->maxvif) { - if (!MIF_EXISTS(mrt, iter->ct)) + if (!VIF_EXISTS(mrt, iter->ct)) continue; - return &mrt->vif6_table[iter->ct]; + return &mrt->vif_table[iter->ct]; } return NULL; } @@ -460,7 +435,7 @@ static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) { struct ipmr_vif_iter *iter = seq->private; - struct mr6_table *mrt = iter->mrt; + struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -471,7 +446,7 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X\n", - vif - mrt->vif6_table, + vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags); @@ -503,7 +478,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { struct ipmr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); - struct mr6_table *mrt; + struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) @@ -520,7 +495,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct mfc6_cache *mfc = v; struct ipmr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); - struct mr6_table *mrt = it->mrt; + struct mr_table *mrt = it->mrt; ++*pos; @@ -530,12 +505,12 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (mfc->list.next != it->cache) return list_entry(mfc->list.next, struct mfc6_cache, list); - if (it->cache == &mrt->mfc6_unres_queue) + if (it->cache == &mrt->mfc_unres_queue) goto end_of_list; /* exhausted cache_array, show unresolved */ rcu_read_unlock(); - it->cache = &mrt->mfc6_unres_queue; + it->cache = &mrt->mfc_unres_queue; spin_lock_bh(&mfc_unres_lock); if (!list_empty(it->cache)) @@ -551,11 +526,11 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) { struct ipmr_mfc_iter *it = seq->private; - struct mr6_table *mrt = it->mrt; + struct mr_table *mrt = it->mrt; - if (it->cache == &mrt->mfc6_unres_queue) + if (it->cache == &mrt->mfc_unres_queue) spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == &mrt->mfc6_cache_list) + else if (it->cache == &mrt->mfc_cache_list) rcu_read_unlock(); } @@ -571,20 +546,20 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) } else { const struct mfc6_cache *mfc = v; const struct ipmr_mfc_iter *it = seq->private; - struct mr6_table *mrt = it->mrt; + struct mr_table *mrt = it->mrt; seq_printf(seq, "%pI6 %pI6 %-3hd", &mfc->mf6c_mcastgrp, &mfc->mf6c_origin, mfc->mf6c_parent); - if (it->cache != &mrt->mfc6_unres_queue) { + if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", mfc->mfc_un.res.pkt, mfc->mfc_un.res.bytes, mfc->mfc_un.res.wrong_if); for (n = mfc->mfc_un.res.minvif; n < mfc->mfc_un.res.maxvif; n++) { - if (MIF_EXISTS(mrt, n) && + if (VIF_EXISTS(mrt, n) && mfc->mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", @@ -630,7 +605,7 @@ static int pim6_rcv(struct sk_buff *skb) struct ipv6hdr *encap; struct net_device *reg_dev = NULL; struct net *net = dev_net(skb->dev); - struct mr6_table *mrt; + struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, @@ -664,7 +639,7 @@ static int pim6_rcv(struct sk_buff *skb) read_lock(&mrt_lock); if (reg_vif_num >= 0) - reg_dev = mrt->vif6_table[reg_vif_num].dev; + reg_dev = mrt->vif_table[reg_vif_num].dev; if (reg_dev) dev_hold(reg_dev); read_unlock(&mrt_lock); @@ -699,7 +674,7 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); - struct mr6_table *mrt; + struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, @@ -742,7 +717,7 @@ static void reg_vif_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; } -static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) +static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) { struct net_device *dev; char name[IFNAMSIZ]; @@ -779,7 +754,7 @@ failure: * Delete a VIF entry */ -static int mif6_delete(struct mr6_table *mrt, int vifi, int notify, +static int mif6_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { struct vif_device *v; @@ -789,7 +764,7 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, int notify, if (vifi < 0 || vifi >= mrt->maxvif) return -EADDRNOTAVAIL; - v = &mrt->vif6_table[vifi]; + v = &mrt->vif_table[vifi]; write_lock_bh(&mrt_lock); dev = v->dev; @@ -808,7 +783,7 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, int notify, if (vifi + 1 == mrt->maxvif) { int tmp; for (tmp = vifi - 1; tmp >= 0; tmp--) { - if (MIF_EXISTS(mrt, tmp)) + if (VIF_EXISTS(mrt, tmp)) break; } mrt->maxvif = tmp + 1; @@ -849,7 +824,7 @@ static inline void ip6mr_cache_free(struct mfc6_cache *c) and reporting error to netlink readers. */ -static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c) +static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; @@ -875,13 +850,13 @@ static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c) /* Timer process for all the unresolved queue. */ -static void ipmr_do_expire_process(struct mr6_table *mrt) +static void ipmr_do_expire_process(struct mr_table *mrt) { unsigned long now = jiffies; unsigned long expires = 10 * HZ; struct mfc6_cache *c, *next; - list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) { + list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { /* not yet... */ unsigned long interval = c->mfc_un.unres.expires - now; @@ -895,20 +870,20 @@ static void ipmr_do_expire_process(struct mr6_table *mrt) ip6mr_destroy_unres(mrt, c); } - if (!list_empty(&mrt->mfc6_unres_queue)) + if (!list_empty(&mrt->mfc_unres_queue)) mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); } static void ipmr_expire_process(struct timer_list *t) { - struct mr6_table *mrt = from_timer(mrt, t, ipmr_expire_timer); + struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); return; } - if (!list_empty(&mrt->mfc6_unres_queue)) + if (!list_empty(&mrt->mfc_unres_queue)) ipmr_do_expire_process(mrt); spin_unlock(&mfc_unres_lock); @@ -916,7 +891,8 @@ static void ipmr_expire_process(struct timer_list *t) /* Fill oifs list. It is called under write locked mrt_lock. */ -static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *cache, +static void ip6mr_update_thresholds(struct mr_table *mrt, + struct mfc6_cache *cache, unsigned char *ttls) { int vifi; @@ -926,7 +902,7 @@ static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *ca memset(cache->mfc_un.res.ttls, 255, MAXMIFS); for (vifi = 0; vifi < mrt->maxvif; vifi++) { - if (MIF_EXISTS(mrt, vifi) && + if (VIF_EXISTS(mrt, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) @@ -938,17 +914,17 @@ static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *ca cache->mfc_un.res.lastuse = jiffies; } -static int mif6_add(struct net *net, struct mr6_table *mrt, +static int mif6_add(struct net *net, struct mr_table *mrt, struct mif6ctl *vifc, int mrtsock) { int vifi = vifc->mif6c_mifi; - struct vif_device *v = &mrt->vif6_table[vifi]; + struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct inet6_dev *in6_dev; int err; /* Is vif busy ? */ - if (MIF_EXISTS(mrt, vifi)) + if (VIF_EXISTS(mrt, vifi)) return -EADDRINUSE; switch (vifc->mif6c_flags) { @@ -1011,7 +987,7 @@ static int mif6_add(struct net *net, struct mr6_table *mrt, return 0; } -static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt, +static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp) { @@ -1022,7 +998,7 @@ static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt, struct rhlist_head *tmp, *list; struct mfc6_cache *c; - list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params); + list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) return c; @@ -1030,7 +1006,7 @@ static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt, } /* Look for a (*,*,oif) entry */ -static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr6_table *mrt, +static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr_table *mrt, mifi_t mifi) { struct mfc6_cache_cmp_arg arg = { @@ -1040,7 +1016,7 @@ static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr6_table *mrt, struct rhlist_head *tmp, *list; struct mfc6_cache *c; - list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params); + list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) if (c->mfc_un.res.ttls[mifi] < 255) return c; @@ -1049,7 +1025,7 @@ static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr6_table *mrt, } /* Look for a (*,G) entry */ -static struct mfc6_cache *ip6mr_cache_find_any(struct mr6_table *mrt, +static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt, struct in6_addr *mcastgrp, mifi_t mifi) { @@ -1063,7 +1039,7 @@ static struct mfc6_cache *ip6mr_cache_find_any(struct mr6_table *mrt, if (ipv6_addr_any(mcastgrp)) goto skip; - list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params); + list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) { if (c->mfc_un.res.ttls[mifi] < 255) return c; @@ -1080,7 +1056,7 @@ skip: /* Look for a (S,G,iif) entry if parent != -1 */ static struct mfc6_cache * -ip6mr_cache_find_parent(struct mr6_table *mrt, +ip6mr_cache_find_parent(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp, int parent) @@ -1092,7 +1068,7 @@ ip6mr_cache_find_parent(struct mr6_table *mrt, struct rhlist_head *tmp, *list; struct mfc6_cache *c; - list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params); + list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) if (parent == -1 || parent == c->mf6c_parent) return c; @@ -1127,7 +1103,7 @@ static struct mfc6_cache *ip6mr_cache_alloc_unres(void) * A cache entry has gone into a resolved state from queued */ -static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt, +static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, struct mfc6_cache *uc, struct mfc6_cache *c) { struct sk_buff *skb; @@ -1161,7 +1137,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt, * Called under mrt_lock. */ -static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, +static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert) { struct sock *mroute6_sk; @@ -1235,7 +1211,7 @@ static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, } rcu_read_lock(); - mroute6_sk = rcu_dereference(mrt->mroute6_sk); + mroute6_sk = rcu_dereference(mrt->mroute_sk); if (!mroute6_sk) { rcu_read_unlock(); kfree_skb(skb); @@ -1260,14 +1236,14 @@ static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt, */ static int -ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb) +ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb) { bool found = false; int err; struct mfc6_cache *c; spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(c, &mrt->mfc6_unres_queue, list) { + list_for_each_entry(c, &mrt->mfc_unres_queue, list) { if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { found = true; @@ -1311,7 +1287,7 @@ ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb) } atomic_inc(&mrt->cache_resolve_queue_len); - list_add(&c->list, &mrt->mfc6_unres_queue); + list_add(&c->list, &mrt->mfc_unres_queue); mr6_netlink_event(mrt, c, RTM_NEWROUTE); ipmr_do_expire_process(mrt); @@ -1336,7 +1312,7 @@ ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb) * MFC6 cache manipulation by user space */ -static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc, +static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc, int parent) { struct mfc6_cache *c; @@ -1348,7 +1324,7 @@ static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc, rcu_read_unlock(); if (!c) return -ENOENT; - rhltable_remove(&mrt->mfc6_hash, &c->mnode, ip6mr_rht_params); + rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); list_del_rcu(&c->list); mr6_netlink_event(mrt, c, RTM_DELROUTE); @@ -1361,7 +1337,7 @@ static int ip6mr_device_event(struct notifier_block *this, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); - struct mr6_table *mrt; + struct mr_table *mrt; struct vif_device *v; int ct; @@ -1369,7 +1345,7 @@ static int ip6mr_device_event(struct notifier_block *this, return NOTIFY_DONE; ip6mr_for_each_table(mrt, net) { - v = &mrt->vif6_table[0]; + v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { if (v->dev == dev) mif6_delete(mrt, ct, 1, NULL); @@ -1483,7 +1459,7 @@ void ip6_mr_cleanup(void) kmem_cache_destroy(mrt_cachep); } -static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, +static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, struct mf6cctl *mfc, int mrtsock, int parent) { unsigned char ttls[MAXMIFS]; @@ -1531,21 +1507,21 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, if (!mrtsock) c->mfc_flags |= MFC_STATIC; - err = rhltable_insert_key(&mrt->mfc6_hash, &c->cmparg, &c->mnode, + err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode, ip6mr_rht_params); if (err) { pr_err("ip6mr: rhtable insert error %d\n", err); ip6mr_cache_free(c); return err; } - list_add_tail_rcu(&c->list, &mrt->mfc6_cache_list); + list_add_tail_rcu(&c->list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(uc, &mrt->mfc6_unres_queue, list) { + list_for_each_entry(uc, &mrt->mfc_unres_queue, list) { if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { list_del(&uc->list); @@ -1554,7 +1530,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, break; } } - if (list_empty(&mrt->mfc6_unres_queue)) + if (list_empty(&mrt->mfc_unres_queue)) del_timer(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); @@ -1570,7 +1546,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt, * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct mr6_table *mrt, bool all) +static void mroute_clean_tables(struct mr_table *mrt, bool all) { struct mfc6_cache *c, *tmp; LIST_HEAD(list); @@ -1578,17 +1554,17 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all) /* Shut down all active vif entries */ for (i = 0; i < mrt->maxvif; i++) { - if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC)) + if (!all && (mrt->vif_table[i].flags & VIFF_STATIC)) continue; mif6_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); /* Wipe the cache */ - list_for_each_entry_safe(c, tmp, &mrt->mfc6_cache_list, list) { + list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (!all && (c->mfc_flags & MFC_STATIC)) continue; - rhltable_remove(&mrt->mfc6_hash, &c->mnode, ip6mr_rht_params); + rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); list_del_rcu(&c->list); mr6_netlink_event(mrt, c, RTM_DELROUTE); ip6mr_cache_free(c); @@ -1596,7 +1572,7 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all) if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); - list_for_each_entry_safe(c, tmp, &mrt->mfc6_unres_queue, list) { + list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); mr6_netlink_event(mrt, c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, c); @@ -1605,17 +1581,17 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all) } } -static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk) +static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) { int err = 0; struct net *net = sock_net(sk); rtnl_lock(); write_lock_bh(&mrt_lock); - if (rtnl_dereference(mrt->mroute6_sk)) { + if (rtnl_dereference(mrt->mroute_sk)) { err = -EADDRINUSE; } else { - rcu_assign_pointer(mrt->mroute6_sk, sk); + rcu_assign_pointer(mrt->mroute_sk, sk); net->ipv6.devconf_all->mc_forwarding++; } write_unlock_bh(&mrt_lock); @@ -1634,7 +1610,7 @@ int ip6mr_sk_done(struct sock *sk) { int err = -EACCES; struct net *net = sock_net(sk); - struct mr6_table *mrt; + struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) @@ -1642,9 +1618,9 @@ int ip6mr_sk_done(struct sock *sk) rtnl_lock(); ip6mr_for_each_table(mrt, net) { - if (sk == rtnl_dereference(mrt->mroute6_sk)) { + if (sk == rtnl_dereference(mrt->mroute_sk)) { write_lock_bh(&mrt_lock); - RCU_INIT_POINTER(mrt->mroute6_sk, NULL); + RCU_INIT_POINTER(mrt->mroute_sk, NULL); net->ipv6.devconf_all->mc_forwarding--; write_unlock_bh(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, @@ -1665,7 +1641,7 @@ int ip6mr_sk_done(struct sock *sk) bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { - struct mr6_table *mrt; + struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_oif = skb->dev->ifindex, @@ -1675,7 +1651,7 @@ bool mroute6_is_socket(struct net *net, struct sk_buff *skb) if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) return NULL; - return rcu_access_pointer(mrt->mroute6_sk); + return rcu_access_pointer(mrt->mroute_sk); } EXPORT_SYMBOL(mroute6_is_socket); @@ -1693,7 +1669,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns struct mf6cctl mfc; mifi_t mifi; struct net *net = sock_net(sk); - struct mr6_table *mrt; + struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) @@ -1704,7 +1680,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns return -ENOENT; if (optname != MRT6_INIT) { - if (sk != rcu_access_pointer(mrt->mroute6_sk) && + if (sk != rcu_access_pointer(mrt->mroute_sk) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EACCES; } @@ -1728,7 +1704,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns return -ENFILE; rtnl_lock(); ret = mif6_add(net, mrt, &vif, - sk == rtnl_dereference(mrt->mroute6_sk)); + sk == rtnl_dereference(mrt->mroute_sk)); rtnl_unlock(); return ret; @@ -1764,7 +1740,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns else ret = ip6mr_mfc_add(net, mrt, &mfc, sk == - rtnl_dereference(mrt->mroute6_sk), + rtnl_dereference(mrt->mroute_sk), parent); rtnl_unlock(); return ret; @@ -1817,7 +1793,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (v != RT_TABLE_DEFAULT && v >= 100000000) return -EINVAL; - if (sk == rcu_access_pointer(mrt->mroute6_sk)) + if (sk == rcu_access_pointer(mrt->mroute_sk)) return -EBUSY; rtnl_lock(); @@ -1848,7 +1824,7 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int olr; int val; struct net *net = sock_net(sk); - struct mr6_table *mrt; + struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) @@ -1899,7 +1875,7 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); - struct mr6_table *mrt; + struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) @@ -1912,8 +1888,8 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) if (vr.mifi >= mrt->maxvif) return -EINVAL; read_lock(&mrt_lock); - vif = &mrt->vif6_table[vr.mifi]; - if (MIF_EXISTS(mrt, vr.mifi)) { + vif = &mrt->vif_table[vr.mifi]; + if (VIF_EXISTS(mrt, vr.mifi)) { vr.icount = vif->pkt_in; vr.ocount = vif->pkt_out; vr.ibytes = vif->bytes_in; @@ -1973,7 +1949,7 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); - struct mr6_table *mrt; + struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) @@ -1986,8 +1962,8 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) if (vr.mifi >= mrt->maxvif) return -EINVAL; read_lock(&mrt_lock); - vif = &mrt->vif6_table[vr.mifi]; - if (MIF_EXISTS(mrt, vr.mifi)) { + vif = &mrt->vif_table[vr.mifi]; + if (VIF_EXISTS(mrt, vr.mifi)) { vr.icount = vif->pkt_in; vr.ocount = vif->pkt_out; vr.ibytes = vif->bytes_in; @@ -2037,11 +2013,11 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct * Processing handlers for ip6mr_forward */ -static int ip6mr_forward2(struct net *net, struct mr6_table *mrt, +static int ip6mr_forward2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc6_cache *c, int vifi) { struct ipv6hdr *ipv6h; - struct vif_device *vif = &mrt->vif6_table[vifi]; + struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *dev; struct dst_entry *dst; struct flowi6 fl6; @@ -2111,18 +2087,18 @@ out_free: return 0; } -static int ip6mr_find_vif(struct mr6_table *mrt, struct net_device *dev) +static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) { int ct; for (ct = mrt->maxvif - 1; ct >= 0; ct--) { - if (mrt->vif6_table[ct].dev == dev) + if (mrt->vif_table[ct].dev == dev) break; } return ct; } -static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, +static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc6_cache *cache) { int psend = -1; @@ -2153,7 +2129,7 @@ static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, /* * Wrong interface: drop packet and (maybe) send PIM assert. */ - if (mrt->vif6_table[vif].dev != skb->dev) { + if (mrt->vif_table[vif].dev != skb->dev) { cache->mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && @@ -2173,8 +2149,8 @@ static void ip6_mr_forward(struct net *net, struct mr6_table *mrt, } forward: - mrt->vif6_table[vif].pkt_in++; - mrt->vif6_table[vif].bytes_in += skb->len; + mrt->vif_table[vif].pkt_in++; + mrt->vif_table[vif].bytes_in += skb->len; /* * Forward the frame @@ -2225,7 +2201,7 @@ int ip6_mr_input(struct sk_buff *skb) { struct mfc6_cache *cache; struct net *net = dev_net(skb->dev); - struct mr6_table *mrt; + struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, @@ -2276,7 +2252,7 @@ int ip6_mr_input(struct sk_buff *skb) } -static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, +static int __ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc6_cache *c, struct rtmsg *rtm) { struct rta_mfc_stats mfcs; @@ -2291,15 +2267,16 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, return -ENOENT; } - if (MIF_EXISTS(mrt, c->mf6c_parent) && - nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0) + if (VIF_EXISTS(mrt, c->mf6c_parent) && + nla_put_u32(skb, RTA_IIF, + mrt->vif_table[c->mf6c_parent].dev->ifindex) < 0) return -EMSGSIZE; mp_attr = nla_nest_start(skb, RTA_MULTIPATH); if (!mp_attr) return -EMSGSIZE; for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { - if (MIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { + if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { nhp = nla_reserve_nohdr(skb, sizeof(*nhp)); if (!nhp) { nla_nest_cancel(skb, mp_attr); @@ -2308,7 +2285,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; - nhp->rtnh_ifindex = mrt->vif6_table[ct].dev->ifindex; + nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; nhp->rtnh_len = sizeof(*nhp); } } @@ -2334,7 +2311,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid) { int err; - struct mr6_table *mrt; + struct mr_table *mrt; struct mfc6_cache *cache; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); @@ -2403,7 +2380,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, return err; } -static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, +static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mfc6_cache *c, int cmd, int flags) { @@ -2468,7 +2445,7 @@ static int mr6_msgsize(bool unresolved, int maxvif) return len; } -static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc, +static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd) { struct net *net = read_pnet(&mrt->net); @@ -2510,7 +2487,7 @@ static size_t mrt6msg_netlink_msgsize(size_t payloadlen) return len; } -static void mrt6msg_netlink_event(struct mr6_table *mrt, struct sk_buff *pkt) +static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; @@ -2561,7 +2538,7 @@ errout: static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - struct mr6_table *mrt; + struct mr_table *mrt; struct mfc6_cache *mfc; unsigned int t = 0, s_t; unsigned int e = 0, s_e; @@ -2573,7 +2550,7 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) ip6mr_for_each_table(mrt, net) { if (t < s_t) goto next_table; - list_for_each_entry_rcu(mfc, &mrt->mfc6_cache_list, list) { + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { if (e < s_e) goto next_entry; if (ip6mr_fill_mroute(mrt, skb, @@ -2589,7 +2566,7 @@ next_entry: s_e = 0; spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(mfc, &mrt->mfc6_unres_queue, list) { + list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { if (e < s_e) goto next_entry2; if (ip6mr_fill_mroute(mrt, skb, -- cgit v1.2.3-70-g09d2 From 0bbbf0e7d0e7ea8267836986346a9b3a35b74e4e Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:33 +0200 Subject: ipmr, ip6mr: Unite creation of new mr_table Now that both ipmr and ip6mr are using the same mr_table structure, we can have a common function to allocate & initialize a new instance. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 17 +++++++++++++++++ net/ipv4/ipmr.c | 27 ++++++++++----------------- net/ipv4/ipmr_base.c | 27 +++++++++++++++++++++++++++ net/ipv6/ip6mr.c | 30 ++++++++++-------------------- 4 files changed, 64 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 1cc944a14df5..805305722803 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -85,6 +85,13 @@ void vif_device_init(struct vif_device *v, unsigned char threshold, unsigned short flags, unsigned short get_iflink_mask); + +struct mr_table * +mr_table_alloc(struct net *net, u32 id, + const struct rhashtable_params *rht_params, + void (*expire_func)(struct timer_list *t), + void (*table_set)(struct mr_table *mrt, + struct net *net)); #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, @@ -94,5 +101,15 @@ static inline void vif_device_init(struct vif_device *v, unsigned short get_iflink_mask) { } + +static inline struct mr_table * +mr_table_alloc(struct net *net, u32 id, + const struct rhashtable_params *rht_params, + void (*expire_func)(struct timer_list *t), + void (*table_set)(struct mr_table *mrt, + struct net *net)) +{ + return NULL; +} #endif #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a1bf0020cad1..f213933db177 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -352,6 +352,14 @@ static const struct rhashtable_params ipmr_rht_params = { .automatic_shrinking = true, }; +static void ipmr_new_table_set(struct mr_table *mrt, + struct net *net) +{ +#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES + list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables); +#endif +} + static struct mr_table *ipmr_new_table(struct net *net, u32 id) { struct mr_table *mrt; @@ -364,23 +372,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) if (mrt) return mrt; - mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); - if (!mrt) - return ERR_PTR(-ENOMEM); - write_pnet(&mrt->net, net); - mrt->id = id; - - rhltable_init(&mrt->mfc_hash, &ipmr_rht_params); - INIT_LIST_HEAD(&mrt->mfc_cache_list); - INIT_LIST_HEAD(&mrt->mfc_unres_queue); - - timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0); - - mrt->mroute_reg_vif_num = -1; -#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES - list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables); -#endif - return mrt; + return mr_table_alloc(net, id, &ipmr_rht_params, + ipmr_expire_process, ipmr_new_table_set); } static void ipmr_free_table(struct mr_table *mrt) diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 22758f848344..3e21a5806d0e 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -26,3 +26,30 @@ void vif_device_init(struct vif_device *v, v->link = dev->ifindex; } EXPORT_SYMBOL(vif_device_init); + +struct mr_table * +mr_table_alloc(struct net *net, u32 id, + const struct rhashtable_params *rht_params, + void (*expire_func)(struct timer_list *t), + void (*table_set)(struct mr_table *mrt, + struct net *net)) +{ + struct mr_table *mrt; + + mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); + if (!mrt) + return NULL; + mrt->id = id; + write_pnet(&mrt->net, net); + + rhltable_init(&mrt->mfc_hash, rht_params); + INIT_LIST_HEAD(&mrt->mfc_cache_list); + INIT_LIST_HEAD(&mrt->mfc_unres_queue); + + timer_setup(&mrt->ipmr_expire_timer, expire_func, 0); + + mrt->mroute_reg_vif_num = -1; + table_set(mrt, net); + return mrt; +} +EXPORT_SYMBOL(mr_table_alloc); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index adbb826ca8fd..d50852882966 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -295,6 +294,14 @@ static const struct rhashtable_params ip6mr_rht_params = { .automatic_shrinking = true, }; +static void ip6mr_new_table_set(struct mr_table *mrt, + struct net *net) +{ +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES + list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables); +#endif +} + static struct mr_table *ip6mr_new_table(struct net *net, u32 id) { struct mr_table *mrt; @@ -303,25 +310,8 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id) if (mrt) return mrt; - mrt = kzalloc(sizeof(*mrt), GFP_KERNEL); - if (!mrt) - return NULL; - mrt->id = id; - write_pnet(&mrt->net, net); - - rhltable_init(&mrt->mfc_hash, &ip6mr_rht_params); - INIT_LIST_HEAD(&mrt->mfc_cache_list); - INIT_LIST_HEAD(&mrt->mfc_unres_queue); - - timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0); - -#ifdef CONFIG_IPV6_PIMSM_V2 - mrt->mroute_reg_vif_num = -1; -#endif -#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES - list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables); -#endif - return mrt; + return mr_table_alloc(net, id, &ip6mr_rht_params, + ipmr_expire_process, ip6mr_new_table_set); } static void ip6mr_free_table(struct mr_table *mrt) -- cgit v1.2.3-70-g09d2 From 494fff56379c4ad5b8fe36a5b7ffede4044ca7bb Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:34 +0200 Subject: ipmr, ip6mr: Make mfc_cache a common structure mfc_cache and mfc6_cache are almost identical - the main difference is in the origin/group addresses and comparison-key. Make a common structure encapsulating most of the multicast routing logic - mr_mfc and convert both ipmr and ip6mr into using it. For easy conversion [casting, in this case] mr_mfc has to be the first field inside every multicast routing abstraction utilizing it. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c | 21 +- include/linux/mroute.h | 45 +--- include/linux/mroute6.h | 23 +- include/linux/mroute_base.h | 45 ++++ net/ipv4/ipmr.c | 233 ++++++++++---------- net/ipv6/ip6mr.c | 248 +++++++++++----------- 6 files changed, 312 insertions(+), 303 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c index d20b143de3b4..978a3c70653a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c @@ -126,8 +126,8 @@ mlxsw_sp_mr_route_ivif_in_evifs(const struct mlxsw_sp_mr_route *mr_route) switch (mr_route->mr_table->proto) { case MLXSW_SP_L3_PROTO_IPV4: - ivif = mr_route->mfc4->mfc_parent; - return mr_route->mfc4->mfc_un.res.ttls[ivif] != 255; + ivif = mr_route->mfc4->_c.mfc_parent; + return mr_route->mfc4->_c.mfc_un.res.ttls[ivif] != 255; case MLXSW_SP_L3_PROTO_IPV6: /* fall through */ default: @@ -364,7 +364,7 @@ mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table *mr_table, mr_route->mfc4 = mfc; mr_route->mr_table = mr_table; for (i = 0; i < MAXVIFS; i++) { - if (mfc->mfc_un.res.ttls[i] != 255) { + if (mfc->_c.mfc_un.res.ttls[i] != 255) { err = mlxsw_sp_mr_route_evif_link(mr_route, &mr_table->vifs[i]); if (err) @@ -374,7 +374,8 @@ mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table *mr_table, mr_route->min_mtu = mr_table->vifs[i].dev->mtu; } } - mlxsw_sp_mr_route_ivif_link(mr_route, &mr_table->vifs[mfc->mfc_parent]); + mlxsw_sp_mr_route_ivif_link(mr_route, + &mr_table->vifs[mfc->_c.mfc_parent]); mr_route->route_action = mlxsw_sp_mr_route_action(mr_route); return mr_route; @@ -418,9 +419,9 @@ static void mlxsw_sp_mr_mfc_offload_set(struct mlxsw_sp_mr_route *mr_route, switch (mr_route->mr_table->proto) { case MLXSW_SP_L3_PROTO_IPV4: if (offload) - mr_route->mfc4->mfc_flags |= MFC_OFFLOAD; + mr_route->mfc4->_c.mfc_flags |= MFC_OFFLOAD; else - mr_route->mfc4->mfc_flags &= ~MFC_OFFLOAD; + mr_route->mfc4->_c.mfc_flags &= ~MFC_OFFLOAD; break; case MLXSW_SP_L3_PROTO_IPV6: /* fall through */ @@ -943,10 +944,10 @@ static void mlxsw_sp_mr_route_stats_update(struct mlxsw_sp *mlxsw_sp, switch (mr_route->mr_table->proto) { case MLXSW_SP_L3_PROTO_IPV4: - if (mr_route->mfc4->mfc_un.res.pkt != packets) - mr_route->mfc4->mfc_un.res.lastuse = jiffies; - mr_route->mfc4->mfc_un.res.pkt = packets; - mr_route->mfc4->mfc_un.res.bytes = bytes; + if (mr_route->mfc4->_c.mfc_un.res.pkt != packets) + mr_route->mfc4->_c.mfc_un.res.lastuse = jiffies; + mr_route->mfc4->_c.mfc_un.res.pkt = packets; + mr_route->mfc4->_c.mfc_un.res.bytes = bytes; break; case MLXSW_SP_L3_PROTO_IPV6: /* fall through */ diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 8688c5d03a24..63b36e6c72a0 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -81,28 +81,13 @@ struct mfc_cache_cmp_arg { /** * struct mfc_cache - multicast routing entries - * @mnode: rhashtable list + * @_c: Common multicast routing information; has to be first [for casting] * @mfc_mcastgrp: destination multicast group address * @mfc_origin: source address * @cmparg: used for rhashtable comparisons - * @mfc_parent: source interface (iif) - * @mfc_flags: entry flags - * @expires: unresolved entry expire time - * @unresolved: unresolved cached skbs - * @last_assert: time of last assert - * @minvif: minimum VIF id - * @maxvif: maximum VIF id - * @bytes: bytes that have passed for this entry - * @pkt: packets that have passed for this entry - * @wrong_if: number of wrong source interface hits - * @lastuse: time of last use of the group (traffic or update) - * @ttls: OIF TTL threshold array - * @refcount: reference count for this entry - * @list: global entry list - * @rcu: used for entry destruction */ struct mfc_cache { - struct rhlist_head mnode; + struct mr_mfc _c; union { struct { __be32 mfc_mcastgrp; @@ -110,28 +95,6 @@ struct mfc_cache { }; struct mfc_cache_cmp_arg cmparg; }; - vifi_t mfc_parent; - int mfc_flags; - - union { - struct { - unsigned long expires; - struct sk_buff_head unresolved; - } unres; - struct { - unsigned long last_assert; - int minvif; - int maxvif; - unsigned long bytes; - unsigned long pkt; - unsigned long wrong_if; - unsigned long lastuse; - unsigned char ttls[MAXVIFS]; - refcount_t refcount; - } res; - } mfc_un; - struct list_head list; - struct rcu_head rcu; }; struct mfc_entry_notifier_info { @@ -155,12 +118,12 @@ static inline void ipmr_cache_free(struct mfc_cache *mfc_cache) static inline void ipmr_cache_put(struct mfc_cache *c) { - if (refcount_dec_and_test(&c->mfc_un.res.refcount)) + if (refcount_dec_and_test(&c->_c.mfc_un.res.refcount)) ipmr_cache_free(c); } static inline void ipmr_cache_hold(struct mfc_cache *c) { - refcount_inc(&c->mfc_un.res.refcount); + refcount_inc(&c->_c.mfc_un.res.refcount); } #endif diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index d5c8dc155a42..6acf576fc135 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -71,7 +71,7 @@ struct mfc6_cache_cmp_arg { }; struct mfc6_cache { - struct rhlist_head mnode; + struct mr_mfc _c; union { struct { struct in6_addr mf6c_mcastgrp; @@ -79,27 +79,6 @@ struct mfc6_cache { }; struct mfc6_cache_cmp_arg cmparg; }; - mifi_t mf6c_parent; /* Source interface */ - int mfc_flags; /* Flags on line */ - - union { - struct { - unsigned long expires; - struct sk_buff_head unresolved; /* Unresolved buffers */ - } unres; - struct { - unsigned long last_assert; - int minvif; - int maxvif; - unsigned long bytes; - unsigned long pkt; - unsigned long wrong_if; - unsigned long lastuse; - unsigned char ttls[MAXMIFS]; /* TTL thresholds */ - } res; - } mfc_un; - struct list_head list; - struct rcu_head rcu; }; #define MFC_STATIC 1 diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 805305722803..2769e2f98b32 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -44,6 +44,51 @@ struct vif_device { #define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev)) +/** + * struct mr_mfc - common multicast routing entries + * @mnode: rhashtable list + * @mfc_parent: source interface (iif) + * @mfc_flags: entry flags + * @expires: unresolved entry expire time + * @unresolved: unresolved cached skbs + * @last_assert: time of last assert + * @minvif: minimum VIF id + * @maxvif: maximum VIF id + * @bytes: bytes that have passed for this entry + * @pkt: packets that have passed for this entry + * @wrong_if: number of wrong source interface hits + * @lastuse: time of last use of the group (traffic or update) + * @ttls: OIF TTL threshold array + * @refcount: reference count for this entry + * @list: global entry list + * @rcu: used for entry destruction + */ +struct mr_mfc { + struct rhlist_head mnode; + unsigned short mfc_parent; + int mfc_flags; + + union { + struct { + unsigned long expires; + struct sk_buff_head unresolved; + } unres; + struct { + unsigned long last_assert; + int minvif; + int maxvif; + unsigned long bytes; + unsigned long pkt; + unsigned long wrong_if; + unsigned long lastuse; + unsigned char ttls[MAXVIFS]; + refcount_t refcount; + } res; + } mfc_un; + struct list_head list; + struct rcu_head rcu; +}; + /** * struct mr_table - a multicast routing table * @list: entry within a list of multicast routing tables diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index f213933db177..3f7515086e85 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -106,7 +106,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, - struct mfc_cache *c, struct rtmsg *rtm); + struct mr_mfc *c, struct rtmsg *rtm); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); @@ -343,7 +343,7 @@ static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params ipmr_rht_params = { - .head_offset = offsetof(struct mfc_cache, mnode), + .head_offset = offsetof(struct mr_mfc, mnode), .key_offset = offsetof(struct mfc_cache, cmparg), .key_len = sizeof(struct mfc_cache_cmp_arg), .nelem_hint = 3, @@ -752,14 +752,14 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, static void ipmr_cache_free_rcu(struct rcu_head *head) { - struct mfc_cache *c = container_of(head, struct mfc_cache, rcu); + struct mr_mfc *c = container_of(head, struct mr_mfc, rcu); - kmem_cache_free(mrt_cachep, c); + kmem_cache_free(mrt_cachep, (struct mfc_cache *)c); } void ipmr_cache_free(struct mfc_cache *c) { - call_rcu(&c->rcu, ipmr_cache_free_rcu); + call_rcu(&c->_c.rcu, ipmr_cache_free_rcu); } EXPORT_SYMBOL(ipmr_cache_free); @@ -774,7 +774,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) atomic_dec(&mrt->cache_resolve_queue_len); - while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) { + while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct iphdr)); @@ -798,9 +798,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) static void ipmr_expire_process(struct timer_list *t) { struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); - unsigned long now; + struct mr_mfc *c, *next; unsigned long expires; - struct mfc_cache *c, *next; + unsigned long now; if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10); @@ -822,8 +822,8 @@ static void ipmr_expire_process(struct timer_list *t) } list_del(&c->list); - mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_destroy_unres(mrt, c); + mroute_netlink_event(mrt, (struct mfc_cache *)c, RTM_DELROUTE); + ipmr_destroy_unres(mrt, (struct mfc_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) @@ -834,7 +834,7 @@ out: } /* Fill oifs list. It is called under write locked mrt_lock. */ -static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache, +static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { int vifi; @@ -974,11 +974,11 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, .mfc_origin = origin }; struct rhlist_head *tmp, *list; - struct mfc_cache *c; + struct mr_mfc *c; list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) - return c; + return (struct mfc_cache *)c; return NULL; } @@ -992,12 +992,12 @@ static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt, .mfc_origin = htonl(INADDR_ANY) }; struct rhlist_head *tmp, *list; - struct mfc_cache *c; + struct mr_mfc *c; list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) if (c->mfc_un.res.ttls[vifi] < 255) - return c; + return (struct mfc_cache *)c; return NULL; } @@ -1011,20 +1011,22 @@ static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt, .mfc_origin = htonl(INADDR_ANY) }; struct rhlist_head *tmp, *list; - struct mfc_cache *c, *proxy; + struct mr_mfc *c; if (mcastgrp == htonl(INADDR_ANY)) goto skip; list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) { + struct mfc_cache *proxy; + if (c->mfc_un.res.ttls[vifi] < 255) - return c; + return (struct mfc_cache *)c; /* It's ok if the vifi is part of the static tree */ proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent); - if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) - return c; + if (proxy && proxy->_c.mfc_un.res.ttls[vifi] < 255) + return (struct mfc_cache *)c; } skip: @@ -1041,12 +1043,12 @@ static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt, .mfc_origin = origin, }; struct rhlist_head *tmp, *list; - struct mfc_cache *c; + struct mr_mfc *c; list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) if (parent == -1 || parent == c->mfc_parent) - return c; + return (struct mfc_cache *)c; return NULL; } @@ -1057,9 +1059,9 @@ static struct mfc_cache *ipmr_cache_alloc(void) struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (c) { - c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; - c->mfc_un.res.minvif = MAXVIFS; - refcount_set(&c->mfc_un.res.refcount, 1); + c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; + c->_c.mfc_un.res.minvif = MAXVIFS; + refcount_set(&c->_c.mfc_un.res.refcount, 1); } return c; } @@ -1069,8 +1071,8 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void) struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (c) { - skb_queue_head_init(&c->mfc_un.unres.unresolved); - c->mfc_un.unres.expires = jiffies + 10*HZ; + skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); + c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; } return c; } @@ -1083,12 +1085,13 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, struct nlmsgerr *e; /* Play the pending entries through our router */ - while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { + while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct iphdr)); - if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) { + if (__ipmr_fill_mroute(mrt, skb, &c->_c, + nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { @@ -1196,7 +1199,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, int err; spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(c, &mrt->mfc_unres_queue, list) { + list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (c->mfc_mcastgrp == iph->daddr && c->mfc_origin == iph->saddr) { found = true; @@ -1215,12 +1218,13 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, } /* Fill in the new cache entry */ - c->mfc_parent = -1; + c->_c.mfc_parent = -1; c->mfc_origin = iph->saddr; c->mfc_mcastgrp = iph->daddr; /* Reflect first query at mrouted. */ err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); + if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker @@ -1233,15 +1237,16 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, } atomic_inc(&mrt->cache_resolve_queue_len); - list_add(&c->list, &mrt->mfc_unres_queue); + list_add(&c->_c.list, &mrt->mfc_unres_queue); mroute_netlink_event(mrt, c, RTM_NEWROUTE); if (atomic_read(&mrt->cache_resolve_queue_len) == 1) - mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); + mod_timer(&mrt->ipmr_expire_timer, + c->_c.mfc_un.unres.expires); } /* See if we can append the packet */ - if (c->mfc_un.unres.unresolved.qlen > 3) { + if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { @@ -1249,7 +1254,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, skb->dev = dev; skb->skb_iif = dev->ifindex; } - skb_queue_tail(&c->mfc_un.unres.unresolved, skb); + skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } @@ -1271,8 +1276,8 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) rcu_read_unlock(); if (!c) return -ENOENT; - rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); - list_del_rcu(&c->list); + rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ipmr_rht_params); + list_del_rcu(&c->_c.list); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); ipmr_cache_put(c); @@ -1284,6 +1289,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, struct mfcctl *mfc, int mrtsock, int parent) { struct mfc_cache *uc, *c; + struct mr_mfc *_uc; bool found; int ret; @@ -1297,10 +1303,10 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, rcu_read_unlock(); if (c) { write_lock_bh(&mrt_lock); - c->mfc_parent = mfc->mfcc_parent; - ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); + c->_c.mfc_parent = mfc->mfcc_parent; + ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls); if (!mrtsock) - c->mfc_flags |= MFC_STATIC; + c->_c.mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); @@ -1318,28 +1324,29 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, c->mfc_origin = mfc->mfcc_origin.s_addr; c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr; - c->mfc_parent = mfc->mfcc_parent; - ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls); + c->_c.mfc_parent = mfc->mfcc_parent; + ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls); if (!mrtsock) - c->mfc_flags |= MFC_STATIC; + c->_c.mfc_flags |= MFC_STATIC; - ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode, + ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ipmr_rht_params); if (ret) { pr_err("ipmr: rhtable insert error %d\n", ret); ipmr_cache_free(c); return ret; } - list_add_tail_rcu(&c->list, &mrt->mfc_cache_list); + list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(uc, &mrt->mfc_unres_queue, list) { + list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { + uc = (struct mfc_cache *)_uc; if (uc->mfc_origin == c->mfc_origin && uc->mfc_mcastgrp == c->mfc_mcastgrp) { - list_del(&uc->list); + list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; @@ -1362,7 +1369,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, static void mroute_clean_tables(struct mr_table *mrt, bool all) { struct net *net = read_pnet(&mrt->net); - struct mfc_cache *c, *tmp; + struct mr_mfc *c, *tmp; + struct mfc_cache *cache; LIST_HEAD(list); int i; @@ -1380,18 +1388,20 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); list_del_rcu(&c->list); - call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, + cache = (struct mfc_cache *)c; + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache, mrt->id); - mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_put(c); + mroute_netlink_event(mrt, cache, RTM_DELROUTE); + ipmr_cache_put(cache); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); - mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_destroy_unres(mrt, c); + cache = (struct mfc_cache *)c; + mroute_netlink_event(mrt, cache, RTM_DELROUTE); + ipmr_destroy_unres(mrt, cache); } spin_unlock_bh(&mfc_unres_lock); } @@ -1683,9 +1693,9 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) rcu_read_lock(); c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { - sr.pktcnt = c->mfc_un.res.pkt; - sr.bytecnt = c->mfc_un.res.bytes; - sr.wrong_if = c->mfc_un.res.wrong_if; + sr.pktcnt = c->_c.mfc_un.res.pkt; + sr.bytecnt = c->_c.mfc_un.res.bytes; + sr.wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -1757,9 +1767,9 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) rcu_read_lock(); c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { - sr.pktcnt = c->mfc_un.res.pkt; - sr.bytecnt = c->mfc_un.res.bytes; - sr.wrong_if = c->mfc_un.res.wrong_if; + sr.pktcnt = c->_c.mfc_un.res.pkt; + sr.bytecnt = c->_c.mfc_un.res.bytes; + sr.wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -1983,18 +1993,18 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev) /* "local" means that we should preserve one skb (for local delivery) */ static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, - struct mfc_cache *cache, int local) + struct mfc_cache *c, int local) { int true_vifi = ipmr_find_vif(mrt, dev); int psend = -1; int vif, ct; - vif = cache->mfc_parent; - cache->mfc_un.res.pkt++; - cache->mfc_un.res.bytes += skb->len; - cache->mfc_un.res.lastuse = jiffies; + vif = c->_c.mfc_parent; + c->_c.mfc_un.res.pkt++; + c->_c.mfc_un.res.bytes += skb->len; + c->_c.mfc_un.res.lastuse = jiffies; - if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { + if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { struct mfc_cache *cache_proxy; /* For an (*,G) entry, we only check that the incomming @@ -2002,7 +2012,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, */ cache_proxy = ipmr_cache_find_any_parent(mrt, vif); if (cache_proxy && - cache_proxy->mfc_un.res.ttls[true_vifi] < 255) + cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) goto forward; } @@ -2023,7 +2033,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, goto dont_forward; } - cache->mfc_un.res.wrong_if++; + c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, @@ -2032,10 +2042,11 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, * large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || - cache->mfc_un.res.ttls[true_vifi] < 255) && + c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, - cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { - cache->mfc_un.res.last_assert = jiffies; + c->_c.mfc_un.res.last_assert + + MFC_ASSERT_THRESH)) { + c->_c.mfc_un.res.last_assert = jiffies; ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); } goto dont_forward; @@ -2046,33 +2057,33 @@ forward: mrt->vif_table[vif].bytes_in += skb->len; /* Forward the frame */ - if (cache->mfc_origin == htonl(INADDR_ANY) && - cache->mfc_mcastgrp == htonl(INADDR_ANY)) { + if (c->mfc_origin == htonl(INADDR_ANY) && + c->mfc_mcastgrp == htonl(INADDR_ANY)) { if (true_vifi >= 0 && - true_vifi != cache->mfc_parent && + true_vifi != c->_c.mfc_parent && ip_hdr(skb)->ttl > - cache->mfc_un.res.ttls[cache->mfc_parent]) { + c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. */ - psend = cache->mfc_parent; + psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } - for (ct = cache->mfc_un.res.maxvif - 1; - ct >= cache->mfc_un.res.minvif; ct--) { + for (ct = c->_c.mfc_un.res.maxvif - 1; + ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ - if ((cache->mfc_origin != htonl(INADDR_ANY) || + if ((c->mfc_origin != htonl(INADDR_ANY) || ct != true_vifi) && - ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { + ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ipmr_queue_xmit(net, mrt, true_vifi, - skb2, cache, psend); + skb2, c, psend); } psend = ct; } @@ -2084,9 +2095,9 @@ last_forward: if (skb2) ipmr_queue_xmit(net, mrt, true_vifi, skb2, - cache, psend); + c, psend); } else { - ipmr_queue_xmit(net, mrt, true_vifi, skb, cache, psend); + ipmr_queue_xmit(net, mrt, true_vifi, skb, c, psend); return; } } @@ -2285,7 +2296,7 @@ drop: #endif static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, - struct mfc_cache *c, struct rtmsg *rtm) + struct mr_mfc *c, struct rtmsg *rtm) { struct rta_mfc_stats mfcs; struct nlattr *mp_attr; @@ -2401,7 +2412,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, } read_lock(&mrt_lock); - err = __ipmr_fill_mroute(mrt, skb, cache, rtm); + err = __ipmr_fill_mroute(mrt, skb, &cache->_c, rtm); read_unlock(&mrt_lock); rcu_read_unlock(); return err; @@ -2429,7 +2440,7 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; - if (c->mfc_flags & MFC_STATIC) + if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; @@ -2438,7 +2449,7 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) || nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp)) goto nla_put_failure; - err = __ipmr_fill_mroute(mrt, skb, c, rtm); + err = __ipmr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; @@ -2479,7 +2490,8 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif), + skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS, + mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; @@ -2624,10 +2636,10 @@ errout_free: static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - struct mr_table *mrt; - struct mfc_cache *mfc; unsigned int t = 0, s_t; unsigned int e = 0, s_e; + struct mr_table *mrt; + struct mr_mfc *mfc; s_t = cb->args[0]; s_e = cb->args[1]; @@ -2642,8 +2654,8 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) if (ipmr_fill_mroute(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE, - NLM_F_MULTI) < 0) + (struct mfc_cache *)mfc, + RTM_NEWROUTE, NLM_F_MULTI) < 0) goto done; next_entry: e++; @@ -2658,8 +2670,8 @@ next_entry: if (ipmr_fill_mroute(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE, - NLM_F_MULTI) < 0) { + (struct mfc_cache *)mfc, + RTM_NEWROUTE, NLM_F_MULTI) < 0) { spin_unlock_bh(&mfc_unres_lock); goto done; } @@ -3051,20 +3063,21 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, struct ipmr_mfc_iter *it, loff_t pos) { struct mr_table *mrt = it->mrt; - struct mfc_cache *mfc; + struct mr_mfc *mfc; rcu_read_lock(); it->cache = &mrt->mfc_cache_list; list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) if (pos-- == 0) - return mfc; + return (struct mfc_cache *)mfc; rcu_read_unlock(); spin_lock_bh(&mfc_unres_lock); it->cache = &mrt->mfc_unres_queue; list_for_each_entry(mfc, it->cache, list) if (pos-- == 0) - return mfc; + return (struct mfc_cache *)mfc; + spin_unlock_bh(&mfc_unres_lock); it->cache = NULL; @@ -3100,8 +3113,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (v == SEQ_START_TOKEN) return ipmr_mfc_seq_idx(net, seq->private, 0); - if (mfc->list.next != it->cache) - return list_entry(mfc->list.next, struct mfc_cache, list); + if (mfc->_c.list.next != it->cache) + return (struct mfc_cache *)(list_entry(mfc->_c.list.next, + struct mr_mfc, list)); if (it->cache == &mrt->mfc_unres_queue) goto end_of_list; @@ -3112,7 +3126,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) spin_lock_bh(&mfc_unres_lock); if (!list_empty(it->cache)) - return list_first_entry(it->cache, struct mfc_cache, list); + return (struct mfc_cache *)(list_first_entry(it->cache, + struct mr_mfc, + list)); end_of_list: spin_unlock_bh(&mfc_unres_lock); @@ -3147,20 +3163,20 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08X %08X %-3hd", (__force u32) mfc->mfc_mcastgrp, (__force u32) mfc->mfc_origin, - mfc->mfc_parent); + mfc->_c.mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", - mfc->mfc_un.res.pkt, - mfc->mfc_un.res.bytes, - mfc->mfc_un.res.wrong_if); - for (n = mfc->mfc_un.res.minvif; - n < mfc->mfc_un.res.maxvif; n++) { + mfc->_c.mfc_un.res.pkt, + mfc->_c.mfc_un.res.bytes, + mfc->_c.mfc_un.res.wrong_if); + for (n = mfc->_c.mfc_un.res.minvif; + n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && - mfc->mfc_un.res.ttls[n] < 255) + mfc->_c.mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", - n, mfc->mfc_un.res.ttls[n]); + n, mfc->_c.mfc_un.res.ttls[n]); } } else { /* unresolved mfc_caches don't contain @@ -3219,7 +3235,7 @@ static int ipmr_dump(struct net *net, struct notifier_block *nb) ipmr_for_each_table(mrt, net) { struct vif_device *v = &mrt->vif_table[0]; - struct mfc_cache *mfc; + struct mr_mfc *mfc; int vifi; /* Notifiy on table VIF entries */ @@ -3236,7 +3252,8 @@ static int ipmr_dump(struct net *net, struct notifier_block *nb) /* Notify on table MFC entries */ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) call_ipmr_mfc_entry_notifier(nb, net, - FIB_EVENT_ENTRY_ADD, mfc, + FIB_EVENT_ENTRY_ADD, + (struct mfc_cache *)mfc, mrt->id); } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index d50852882966..3fe254f9f9b7 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -285,7 +285,7 @@ static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg, } static const struct rhashtable_params ip6mr_rht_params = { - .head_offset = offsetof(struct mfc6_cache, mnode), + .head_offset = offsetof(struct mr_mfc, mnode), .key_offset = offsetof(struct mfc6_cache, cmparg), .key_len = sizeof(struct mfc6_cache_cmp_arg), .nelem_hint = 3, @@ -335,20 +335,20 @@ static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net, struct ipmr_mfc_iter *it, loff_t pos) { struct mr_table *mrt = it->mrt; - struct mfc6_cache *mfc; + struct mr_mfc *mfc; rcu_read_lock(); it->cache = &mrt->mfc_cache_list; list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) if (pos-- == 0) - return mfc; + return (struct mfc6_cache *)mfc; rcu_read_unlock(); spin_lock_bh(&mfc_unres_lock); it->cache = &mrt->mfc_unres_queue; list_for_each_entry(mfc, it->cache, list) if (pos-- == 0) - return mfc; + return (struct mfc6_cache *)mfc; spin_unlock_bh(&mfc_unres_lock); it->cache = NULL; @@ -492,8 +492,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (v == SEQ_START_TOKEN) return ipmr_mfc_seq_idx(net, seq->private, 0); - if (mfc->list.next != it->cache) - return list_entry(mfc->list.next, struct mfc6_cache, list); + if (mfc->_c.list.next != it->cache) + return (struct mfc6_cache *)(list_entry(mfc->_c.list.next, + struct mr_mfc, list)); if (it->cache == &mrt->mfc_unres_queue) goto end_of_list; @@ -504,7 +505,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) spin_lock_bh(&mfc_unres_lock); if (!list_empty(it->cache)) - return list_first_entry(it->cache, struct mfc6_cache, list); + return (struct mfc6_cache *)(list_first_entry(it->cache, + struct mr_mfc, + list)); end_of_list: spin_unlock_bh(&mfc_unres_lock); @@ -540,20 +543,20 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%pI6 %pI6 %-3hd", &mfc->mf6c_mcastgrp, &mfc->mf6c_origin, - mfc->mf6c_parent); + mfc->_c.mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", - mfc->mfc_un.res.pkt, - mfc->mfc_un.res.bytes, - mfc->mfc_un.res.wrong_if); - for (n = mfc->mfc_un.res.minvif; - n < mfc->mfc_un.res.maxvif; n++) { + mfc->_c.mfc_un.res.pkt, + mfc->_c.mfc_un.res.bytes, + mfc->_c.mfc_un.res.wrong_if); + for (n = mfc->_c.mfc_un.res.minvif; + n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && - mfc->mfc_un.res.ttls[n] < 255) + mfc->_c.mfc_un.res.ttls[n] < 255) seq_printf(seq, - " %2d:%-3d", - n, mfc->mfc_un.res.ttls[n]); + " %2d:%-3d", n, + mfc->_c.mfc_un.res.ttls[n]); } } else { /* unresolved mfc_caches don't contain @@ -800,14 +803,14 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify, static inline void ip6mr_cache_free_rcu(struct rcu_head *head) { - struct mfc6_cache *c = container_of(head, struct mfc6_cache, rcu); + struct mr_mfc *c = container_of(head, struct mr_mfc, rcu); - kmem_cache_free(mrt_cachep, c); + kmem_cache_free(mrt_cachep, (struct mfc6_cache *)c); } static inline void ip6mr_cache_free(struct mfc6_cache *c) { - call_rcu(&c->rcu, ip6mr_cache_free_rcu); + call_rcu(&c->_c.rcu, ip6mr_cache_free_rcu); } /* Destroy an unresolved cache entry, killing queued skbs @@ -821,7 +824,7 @@ static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c) atomic_dec(&mrt->cache_resolve_queue_len); - while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) { + while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); @@ -844,7 +847,7 @@ static void ipmr_do_expire_process(struct mr_table *mrt) { unsigned long now = jiffies; unsigned long expires = 10 * HZ; - struct mfc6_cache *c, *next; + struct mr_mfc *c, *next; list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { @@ -856,8 +859,8 @@ static void ipmr_do_expire_process(struct mr_table *mrt) } list_del(&c->list); - mr6_netlink_event(mrt, c, RTM_DELROUTE); - ip6mr_destroy_unres(mrt, c); + mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); + ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) @@ -882,7 +885,7 @@ static void ipmr_expire_process(struct timer_list *t) /* Fill oifs list. It is called under write locked mrt_lock. */ static void ip6mr_update_thresholds(struct mr_table *mrt, - struct mfc6_cache *cache, + struct mr_mfc *cache, unsigned char *ttls) { int vifi; @@ -986,11 +989,11 @@ static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt, .mf6c_mcastgrp = *mcastgrp, }; struct rhlist_head *tmp, *list; - struct mfc6_cache *c; + struct mr_mfc *c; list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) - return c; + return (struct mfc6_cache *)c; return NULL; } @@ -1004,12 +1007,12 @@ static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr_table *mrt, .mf6c_mcastgrp = in6addr_any, }; struct rhlist_head *tmp, *list; - struct mfc6_cache *c; + struct mr_mfc *c; list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) if (c->mfc_un.res.ttls[mifi] < 255) - return c; + return (struct mfc6_cache *)c; return NULL; } @@ -1024,7 +1027,8 @@ static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt, .mf6c_mcastgrp = *mcastgrp, }; struct rhlist_head *tmp, *list; - struct mfc6_cache *c, *proxy; + struct mr_mfc *c; + struct mfc6_cache *proxy; if (ipv6_addr_any(mcastgrp)) goto skip; @@ -1032,12 +1036,12 @@ static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt, list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) { if (c->mfc_un.res.ttls[mifi] < 255) - return c; + return (struct mfc6_cache *)c; /* It's ok if the mifi is part of the static tree */ - proxy = ip6mr_cache_find_any_parent(mrt, c->mf6c_parent); - if (proxy && proxy->mfc_un.res.ttls[mifi] < 255) - return c; + proxy = ip6mr_cache_find_any_parent(mrt, c->mfc_parent); + if (proxy && proxy->_c.mfc_un.res.ttls[mifi] < 255) + return (struct mfc6_cache *)c; } skip: @@ -1056,12 +1060,12 @@ ip6mr_cache_find_parent(struct mr_table *mrt, .mf6c_mcastgrp = *mcastgrp, }; struct rhlist_head *tmp, *list; - struct mfc6_cache *c; + struct mr_mfc *c; list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) - if (parent == -1 || parent == c->mf6c_parent) - return c; + if (parent == -1 || parent == c->mfc_parent) + return (struct mfc6_cache *)c; return NULL; } @@ -1074,8 +1078,8 @@ static struct mfc6_cache *ip6mr_cache_alloc(void) struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (!c) return NULL; - c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; - c->mfc_un.res.minvif = MAXMIFS; + c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; + c->_c.mfc_un.res.minvif = MAXMIFS; return c; } @@ -1084,8 +1088,8 @@ static struct mfc6_cache *ip6mr_cache_alloc_unres(void) struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (!c) return NULL; - skb_queue_head_init(&c->mfc_un.unres.unresolved); - c->mfc_un.unres.expires = jiffies + 10 * HZ; + skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); + c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; return c; } @@ -1102,7 +1106,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, * Play the pending entries through our router */ - while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { + while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); @@ -1221,19 +1225,16 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, return ret; } -/* - * Queue a packet for resolution. It gets locked cache entry! - */ - -static int -ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb) +/* Queue a packet for resolution. It gets locked cache entry! */ +static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, + struct sk_buff *skb) { + struct mfc6_cache *c; bool found = false; int err; - struct mfc6_cache *c; spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(c, &mrt->mfc_unres_queue, list) { + list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { found = true; @@ -1254,10 +1255,8 @@ ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb) return -ENOBUFS; } - /* - * Fill in the new cache entry - */ - c->mf6c_parent = -1; + /* Fill in the new cache entry */ + c->_c.mfc_parent = -1; c->mf6c_origin = ipv6_hdr(skb)->saddr; c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr; @@ -1277,20 +1276,18 @@ ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb) } atomic_inc(&mrt->cache_resolve_queue_len); - list_add(&c->list, &mrt->mfc_unres_queue); + list_add(&c->_c.list, &mrt->mfc_unres_queue); mr6_netlink_event(mrt, c, RTM_NEWROUTE); ipmr_do_expire_process(mrt); } - /* - * See if we can append the packet - */ - if (c->mfc_un.unres.unresolved.qlen > 3) { + /* See if we can append the packet */ + if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { - skb_queue_tail(&c->mfc_un.unres.unresolved, skb); + skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } @@ -1314,8 +1311,8 @@ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc, rcu_read_unlock(); if (!c) return -ENOENT; - rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); - list_del_rcu(&c->list); + rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params); + list_del_rcu(&c->_c.list); mr6_netlink_event(mrt, c, RTM_DELROUTE); ip6mr_cache_free(c); @@ -1454,6 +1451,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, { unsigned char ttls[MAXMIFS]; struct mfc6_cache *uc, *c; + struct mr_mfc *_uc; bool found; int i, err; @@ -1473,10 +1471,10 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, rcu_read_unlock(); if (c) { write_lock_bh(&mrt_lock); - c->mf6c_parent = mfc->mf6cc_parent; - ip6mr_update_thresholds(mrt, c, ttls); + c->_c.mfc_parent = mfc->mf6cc_parent; + ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) - c->mfc_flags |= MFC_STATIC; + c->_c.mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; @@ -1492,29 +1490,30 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; - c->mf6c_parent = mfc->mf6cc_parent; - ip6mr_update_thresholds(mrt, c, ttls); + c->_c.mfc_parent = mfc->mf6cc_parent; + ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) - c->mfc_flags |= MFC_STATIC; + c->_c.mfc_flags |= MFC_STATIC; - err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode, + err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ip6mr_rht_params); if (err) { pr_err("ip6mr: rhtable insert error %d\n", err); ip6mr_cache_free(c); return err; } - list_add_tail_rcu(&c->list, &mrt->mfc_cache_list); + list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(uc, &mrt->mfc_unres_queue, list) { + list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { + uc = (struct mfc6_cache *)_uc; if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { - list_del(&uc->list); + list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; @@ -1538,7 +1537,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, static void mroute_clean_tables(struct mr_table *mrt, bool all) { - struct mfc6_cache *c, *tmp; + struct mr_mfc *c, *tmp; LIST_HEAD(list); int i; @@ -1556,16 +1555,17 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); list_del_rcu(&c->list); - mr6_netlink_event(mrt, c, RTM_DELROUTE); - ip6mr_cache_free(c); + mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); + ip6mr_cache_free((struct mfc6_cache *)c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); - mr6_netlink_event(mrt, c, RTM_DELROUTE); - ip6mr_destroy_unres(mrt, c); + mr6_netlink_event(mrt, (struct mfc6_cache *)c, + RTM_DELROUTE); + ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } spin_unlock_bh(&mfc_unres_lock); } @@ -1899,9 +1899,9 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { - sr.pktcnt = c->mfc_un.res.pkt; - sr.bytecnt = c->mfc_un.res.bytes; - sr.wrong_if = c->mfc_un.res.wrong_if; + sr.pktcnt = c->_c.mfc_un.res.pkt; + sr.bytecnt = c->_c.mfc_un.res.bytes; + sr.wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -1973,9 +1973,9 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { - sr.pktcnt = c->mfc_un.res.pkt; - sr.bytecnt = c->mfc_un.res.bytes; - sr.wrong_if = c->mfc_un.res.wrong_if; + sr.pktcnt = c->_c.mfc_un.res.pkt; + sr.bytecnt = c->_c.mfc_un.res.bytes; + sr.wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -2089,18 +2089,18 @@ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) } static void ip6_mr_forward(struct net *net, struct mr_table *mrt, - struct sk_buff *skb, struct mfc6_cache *cache) + struct sk_buff *skb, struct mfc6_cache *c) { int psend = -1; int vif, ct; int true_vifi = ip6mr_find_vif(mrt, skb->dev); - vif = cache->mf6c_parent; - cache->mfc_un.res.pkt++; - cache->mfc_un.res.bytes += skb->len; - cache->mfc_un.res.lastuse = jiffies; + vif = c->_c.mfc_parent; + c->_c.mfc_un.res.pkt++; + c->_c.mfc_un.res.bytes += skb->len; + c->_c.mfc_un.res.lastuse = jiffies; - if (ipv6_addr_any(&cache->mf6c_origin) && true_vifi >= 0) { + if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; /* For an (*,G) entry, we only check that the incoming @@ -2109,7 +2109,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, rcu_read_lock(); cache_proxy = ip6mr_cache_find_any_parent(mrt, vif); if (cache_proxy && - cache_proxy->mfc_un.res.ttls[true_vifi] < 255) { + cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) { rcu_read_unlock(); goto forward; } @@ -2120,7 +2120,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, * Wrong interface: drop packet and (maybe) send PIM assert. */ if (mrt->vif_table[vif].dev != skb->dev) { - cache->mfc_un.res.wrong_if++; + c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, @@ -2129,10 +2129,11 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || - cache->mfc_un.res.ttls[true_vifi] < 255) && + c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, - cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { - cache->mfc_un.res.last_assert = jiffies; + c->_c.mfc_un.res.last_assert + + MFC_ASSERT_THRESH)) { + c->_c.mfc_un.res.last_assert = jiffies; ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); } goto dont_forward; @@ -2145,36 +2146,38 @@ forward: /* * Forward the frame */ - if (ipv6_addr_any(&cache->mf6c_origin) && - ipv6_addr_any(&cache->mf6c_mcastgrp)) { + if (ipv6_addr_any(&c->mf6c_origin) && + ipv6_addr_any(&c->mf6c_mcastgrp)) { if (true_vifi >= 0 && - true_vifi != cache->mf6c_parent && + true_vifi != c->_c.mfc_parent && ipv6_hdr(skb)->hop_limit > - cache->mfc_un.res.ttls[cache->mf6c_parent]) { + c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. */ - psend = cache->mf6c_parent; + psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } - for (ct = cache->mfc_un.res.maxvif - 1; ct >= cache->mfc_un.res.minvif; ct--) { + for (ct = c->_c.mfc_un.res.maxvif - 1; + ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ - if ((!ipv6_addr_any(&cache->mf6c_origin) || ct != true_vifi) && - ipv6_hdr(skb)->hop_limit > cache->mfc_un.res.ttls[ct]) { + if ((!ipv6_addr_any(&c->mf6c_origin) || ct != true_vifi) && + ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) - ip6mr_forward2(net, mrt, skb2, cache, psend); + ip6mr_forward2(net, mrt, skb2, + c, psend); } psend = ct; } } last_forward: if (psend != -1) { - ip6mr_forward2(net, mrt, skb, cache, psend); + ip6mr_forward2(net, mrt, skb, c, psend); return; } @@ -2252,21 +2255,22 @@ static int __ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ - if (c->mf6c_parent >= MAXMIFS) { + if (c->_c.mfc_parent >= MAXMIFS) { rtm->rtm_flags |= RTNH_F_UNRESOLVED; return -ENOENT; } - if (VIF_EXISTS(mrt, c->mf6c_parent) && + if (VIF_EXISTS(mrt, c->_c.mfc_parent) && nla_put_u32(skb, RTA_IIF, - mrt->vif_table[c->mf6c_parent].dev->ifindex) < 0) + mrt->vif_table[c->_c.mfc_parent].dev->ifindex) < 0) return -EMSGSIZE; mp_attr = nla_nest_start(skb, RTA_MULTIPATH); if (!mp_attr) return -EMSGSIZE; - for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { - if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { + for (ct = c->_c.mfc_un.res.minvif; + ct < c->_c.mfc_un.res.maxvif; ct++) { + if (VIF_EXISTS(mrt, ct) && c->_c.mfc_un.res.ttls[ct] < 255) { nhp = nla_reserve_nohdr(skb, sizeof(*nhp)); if (!nhp) { nla_nest_cancel(skb, mp_attr); @@ -2274,7 +2278,7 @@ static int __ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, } nhp->rtnh_flags = 0; - nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; + nhp->rtnh_hops = c->_c.mfc_un.res.ttls[ct]; nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; nhp->rtnh_len = sizeof(*nhp); } @@ -2282,12 +2286,12 @@ static int __ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, nla_nest_end(skb, mp_attr); - lastuse = READ_ONCE(c->mfc_un.res.lastuse); + lastuse = READ_ONCE(c->_c.mfc_un.res.lastuse); lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; - mfcs.mfcs_packets = c->mfc_un.res.pkt; - mfcs.mfcs_bytes = c->mfc_un.res.bytes; - mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; + mfcs.mfcs_packets = c->_c.mfc_un.res.pkt; + mfcs.mfcs_bytes = c->_c.mfc_un.res.bytes; + mfcs.mfcs_wrong_if = c->_c.mfc_un.res.wrong_if; if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), RTA_PAD)) @@ -2363,7 +2367,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, } if (rtm->rtm_flags & RTM_F_NOTIFY) - cache->mfc_flags |= MFC_NOTIFY; + cache->_c.mfc_flags |= MFC_NOTIFY; err = __ip6mr_fill_mroute(mrt, skb, cache, rtm); read_unlock(&mrt_lock); @@ -2392,7 +2396,7 @@ static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; - if (c->mfc_flags & MFC_STATIC) + if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; @@ -2442,7 +2446,7 @@ static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, struct sk_buff *skb; int err = -ENOBUFS; - skb = nlmsg_new(mr6_msgsize(mfc->mf6c_parent >= MAXMIFS, mrt->maxvif), + skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; @@ -2528,10 +2532,10 @@ errout: static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - struct mr_table *mrt; - struct mfc6_cache *mfc; unsigned int t = 0, s_t; unsigned int e = 0, s_e; + struct mr_table *mrt; + struct mr_mfc *mfc; s_t = cb->args[0]; s_e = cb->args[1]; @@ -2546,8 +2550,8 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) if (ip6mr_fill_mroute(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE, - NLM_F_MULTI) < 0) + (struct mfc6_cache *)mfc, + RTM_NEWROUTE, NLM_F_MULTI) < 0) goto done; next_entry: e++; @@ -2562,8 +2566,8 @@ next_entry: if (ip6mr_fill_mroute(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - mfc, RTM_NEWROUTE, - NLM_F_MULTI) < 0) { + (struct mfc6_cache *)mfc, + RTM_NEWROUTE, NLM_F_MULTI) < 0) { spin_unlock_bh(&mfc_unres_lock); goto done; } -- cgit v1.2.3-70-g09d2 From 845c9a7ae7f5342ba42280c3a2f2aa92bce641d7 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:35 +0200 Subject: ipmr, ip6mr: Unite logic for searching in MFC cache ipmr and ip6mr utilize the exact same methods for searching the hashed resolved connections, difference being only in the construction of the hash comparison key. In order to unite the flow, introduce an mr_table operation set that would contain the protocol specific information required for common flows, in this case - the hash parameters and a comparison key representing a (*,*) route. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 52 +++++++++++++++++++++++++++++-- net/ipv4/ipmr.c | 71 ++++++++++--------------------------------- net/ipv4/ipmr_base.c | 54 +++++++++++++++++++++++++++++++-- net/ipv6/ip6mr.c | 74 +++++++++++---------------------------------- 4 files changed, 134 insertions(+), 117 deletions(-) (limited to 'net') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 2769e2f98b32..46a082e25dab 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -89,10 +89,23 @@ struct mr_mfc { struct rcu_head rcu; }; +struct mr_table; + +/** + * struct mr_table_ops - callbacks and info for protocol-specific ops + * @rht_params: parameters for accessing the MFC hash + * @cmparg_any: a hash key to be used for matching on (*,*) routes + */ +struct mr_table_ops { + const struct rhashtable_params *rht_params; + void *cmparg_any; +}; + /** * struct mr_table - a multicast routing table * @list: entry within a list of multicast routing tables * @net: net where this table belongs + * @ops: protocol specific operations * @id: identifier of the table * @mroute_sk: socket associated with the table * @ipmr_expire_timer: timer for handling unresolved routes @@ -109,6 +122,7 @@ struct mr_mfc { struct mr_table { struct list_head list; possible_net_t net; + struct mr_table_ops ops; u32 id; struct sock __rcu *mroute_sk; struct timer_list ipmr_expire_timer; @@ -133,10 +147,19 @@ void vif_device_init(struct vif_device *v, struct mr_table * mr_table_alloc(struct net *net, u32 id, - const struct rhashtable_params *rht_params, + struct mr_table_ops *ops, void (*expire_func)(struct timer_list *t), void (*table_set)(struct mr_table *mrt, struct net *net)); + +/* These actually return 'struct mr_mfc *', but to avoid need for explicit + * castings they simply return void. + */ +void *mr_mfc_find_parent(struct mr_table *mrt, + void *hasharg, int parent); +void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi); +void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg); + #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, @@ -147,14 +170,37 @@ static inline void vif_device_init(struct vif_device *v, { } -static inline struct mr_table * +static inline void * mr_table_alloc(struct net *net, u32 id, - const struct rhashtable_params *rht_params, + struct mr_table_ops *ops, void (*expire_func)(struct timer_list *t), void (*table_set)(struct mr_table *mrt, struct net *net)) { return NULL; } + +static inline void *mr_mfc_find_parent(struct mr_table *mrt, + void *hasharg, int parent) +{ + return NULL; +} + +static inline void *mr_mfc_find_any_parent(struct mr_table *mrt, + int vifi) +{ + return NULL; +} + +static inline struct mr_mfc *mr_mfc_find_any(struct mr_table *mrt, + int vifi, void *hasharg) +{ + return NULL; +} #endif + +static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) +{ + return mr_mfc_find_parent(mrt, hasharg, -1); +} #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 3f7515086e85..00898c319cc5 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -360,6 +360,16 @@ static void ipmr_new_table_set(struct mr_table *mrt, #endif } +static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = { + .mfc_mcastgrp = htonl(INADDR_ANY), + .mfc_origin = htonl(INADDR_ANY), +}; + +static struct mr_table_ops ipmr_mr_table_ops = { + .rht_params = &ipmr_rht_params, + .cmparg_any = &ipmr_mr_table_ops_cmparg_any, +}; + static struct mr_table *ipmr_new_table(struct net *net, u32 id) { struct mr_table *mrt; @@ -372,7 +382,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) if (mrt) return mrt; - return mr_table_alloc(net, id, &ipmr_rht_params, + return mr_table_alloc(net, id, &ipmr_mr_table_ops, ipmr_expire_process, ipmr_new_table_set); } @@ -973,33 +983,8 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, .mfc_mcastgrp = mcastgrp, .mfc_origin = origin }; - struct rhlist_head *tmp, *list; - struct mr_mfc *c; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) - return (struct mfc_cache *)c; - - return NULL; -} - -/* Look for a (*,*,oif) entry */ -static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt, - int vifi) -{ - struct mfc_cache_cmp_arg arg = { - .mfc_mcastgrp = htonl(INADDR_ANY), - .mfc_origin = htonl(INADDR_ANY) - }; - struct rhlist_head *tmp, *list; - struct mr_mfc *c; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) - if (c->mfc_un.res.ttls[vifi] < 255) - return (struct mfc_cache *)c; - return NULL; + return mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ @@ -1010,27 +995,10 @@ static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt, .mfc_mcastgrp = mcastgrp, .mfc_origin = htonl(INADDR_ANY) }; - struct rhlist_head *tmp, *list; - struct mr_mfc *c; if (mcastgrp == htonl(INADDR_ANY)) - goto skip; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) { - struct mfc_cache *proxy; - - if (c->mfc_un.res.ttls[vifi] < 255) - return (struct mfc_cache *)c; - - /* It's ok if the vifi is part of the static tree */ - proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent); - if (proxy && proxy->_c.mfc_un.res.ttls[vifi] < 255) - return (struct mfc_cache *)c; - } - -skip: - return ipmr_cache_find_any_parent(mrt, vifi); + return mr_mfc_find_any_parent(mrt, vifi); + return mr_mfc_find_any(mrt, vifi, &arg); } /* Look for a (S,G,iif) entry if parent != -1 */ @@ -1042,15 +1010,8 @@ static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt, .mfc_mcastgrp = mcastgrp, .mfc_origin = origin, }; - struct rhlist_head *tmp, *list; - struct mr_mfc *c; - list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) - if (parent == -1 || parent == c->mfc_parent) - return (struct mfc_cache *)c; - - return NULL; + return mr_mfc_find_parent(mrt, &arg, parent); } /* Allocate a multicast cache entry */ @@ -2010,7 +1971,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, /* For an (*,G) entry, we only check that the incomming * interface is part of the static tree. */ - cache_proxy = ipmr_cache_find_any_parent(mrt, vif); + cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) goto forward; diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 3e21a5806d0e..172f92acffef 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -29,7 +29,7 @@ EXPORT_SYMBOL(vif_device_init); struct mr_table * mr_table_alloc(struct net *net, u32 id, - const struct rhashtable_params *rht_params, + struct mr_table_ops *ops, void (*expire_func)(struct timer_list *t), void (*table_set)(struct mr_table *mrt, struct net *net)) @@ -42,7 +42,8 @@ mr_table_alloc(struct net *net, u32 id, mrt->id = id; write_pnet(&mrt->net, net); - rhltable_init(&mrt->mfc_hash, rht_params); + mrt->ops = *ops; + rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params); INIT_LIST_HEAD(&mrt->mfc_cache_list); INIT_LIST_HEAD(&mrt->mfc_unres_queue); @@ -53,3 +54,52 @@ mr_table_alloc(struct net *net, u32 id, return mrt; } EXPORT_SYMBOL(mr_table_alloc); + +void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent) +{ + struct rhlist_head *tmp, *list; + struct mr_mfc *c; + + list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + if (parent == -1 || parent == c->mfc_parent) + return c; + + return NULL; +} +EXPORT_SYMBOL(mr_mfc_find_parent); + +void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi) +{ + struct rhlist_head *tmp, *list; + struct mr_mfc *c; + + list = rhltable_lookup(&mrt->mfc_hash, mrt->ops.cmparg_any, + *mrt->ops.rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) + if (c->mfc_un.res.ttls[vifi] < 255) + return c; + + return NULL; +} +EXPORT_SYMBOL(mr_mfc_find_any_parent); + +void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg) +{ + struct rhlist_head *tmp, *list; + struct mr_mfc *c, *proxy; + + list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params); + rhl_for_each_entry_rcu(c, tmp, list, mnode) { + if (c->mfc_un.res.ttls[vifi] < 255) + return c; + + /* It's ok if the vifi is part of the static tree */ + proxy = mr_mfc_find_any_parent(mrt, c->mfc_parent); + if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) + return c; + } + + return mr_mfc_find_any_parent(mrt, vifi); +} +EXPORT_SYMBOL(mr_mfc_find_any); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 3fe254f9f9b7..ea902a952fb6 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -302,6 +302,16 @@ static void ip6mr_new_table_set(struct mr_table *mrt, #endif } +static struct mfc6_cache_cmp_arg ip6mr_mr_table_ops_cmparg_any = { + .mf6c_origin = IN6ADDR_ANY_INIT, + .mf6c_mcastgrp = IN6ADDR_ANY_INIT, +}; + +static struct mr_table_ops ip6mr_mr_table_ops = { + .rht_params = &ip6mr_rht_params, + .cmparg_any = &ip6mr_mr_table_ops_cmparg_any, +}; + static struct mr_table *ip6mr_new_table(struct net *net, u32 id) { struct mr_table *mrt; @@ -310,7 +320,7 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id) if (mrt) return mrt; - return mr_table_alloc(net, id, &ip6mr_rht_params, + return mr_table_alloc(net, id, &ip6mr_mr_table_ops, ipmr_expire_process, ip6mr_new_table_set); } @@ -988,33 +998,8 @@ static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt, .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; - struct rhlist_head *tmp, *list; - struct mr_mfc *c; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) - return (struct mfc6_cache *)c; - return NULL; -} - -/* Look for a (*,*,oif) entry */ -static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr_table *mrt, - mifi_t mifi) -{ - struct mfc6_cache_cmp_arg arg = { - .mf6c_origin = in6addr_any, - .mf6c_mcastgrp = in6addr_any, - }; - struct rhlist_head *tmp, *list; - struct mr_mfc *c; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) - if (c->mfc_un.res.ttls[mifi] < 255) - return (struct mfc6_cache *)c; - - return NULL; + return mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ @@ -1026,26 +1011,10 @@ static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt, .mf6c_origin = in6addr_any, .mf6c_mcastgrp = *mcastgrp, }; - struct rhlist_head *tmp, *list; - struct mr_mfc *c; - struct mfc6_cache *proxy; if (ipv6_addr_any(mcastgrp)) - goto skip; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) { - if (c->mfc_un.res.ttls[mifi] < 255) - return (struct mfc6_cache *)c; - - /* It's ok if the mifi is part of the static tree */ - proxy = ip6mr_cache_find_any_parent(mrt, c->mfc_parent); - if (proxy && proxy->_c.mfc_un.res.ttls[mifi] < 255) - return (struct mfc6_cache *)c; - } - -skip: - return ip6mr_cache_find_any_parent(mrt, mifi); + return mr_mfc_find_any_parent(mrt, mifi); + return mr_mfc_find_any(mrt, mifi, &arg); } /* Look for a (S,G,iif) entry if parent != -1 */ @@ -1059,20 +1028,11 @@ ip6mr_cache_find_parent(struct mr_table *mrt, .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; - struct rhlist_head *tmp, *list; - struct mr_mfc *c; - - list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params); - rhl_for_each_entry_rcu(c, tmp, list, mnode) - if (parent == -1 || parent == c->mfc_parent) - return (struct mfc6_cache *)c; - return NULL; + return mr_mfc_find_parent(mrt, &arg, parent); } -/* - * Allocate a multicast cache entry - */ +/* Allocate a multicast cache entry */ static struct mfc6_cache *ip6mr_cache_alloc(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); @@ -2107,7 +2067,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, * interface is part of the static tree. */ rcu_read_lock(); - cache_proxy = ip6mr_cache_find_any_parent(mrt, vif); + cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) { rcu_read_unlock(); -- cgit v1.2.3-70-g09d2 From c8d6196803265484f7e1cdd1b00a188dc59a5988 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:36 +0200 Subject: ipmr, ip6mr: Unite mfc seq logic With the exception of the final dump, ipmr and ip6mr have the exact same seq logic for traversing a given mr_table. Refactor that code and make it common. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 69 ++++++++++++++++++++++++++++++++ net/ipv4/ipmr.c | 93 +++---------------------------------------- net/ipv4/ipmr_base.c | 62 +++++++++++++++++++++++++++++ net/ipv6/ip6mr.c | 97 ++++----------------------------------------- 4 files changed, 143 insertions(+), 178 deletions(-) (limited to 'net') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 46a082e25dab..a007c5ad0fde 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -203,4 +204,72 @@ static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) { return mr_mfc_find_parent(mrt, hasharg, -1); } + +#ifdef CONFIG_PROC_FS +struct mr_mfc_iter { + struct seq_net_private p; + struct mr_table *mrt; + struct list_head *cache; + + /* Lock protecting the mr_table's unresolved queue */ + spinlock_t *lock; +}; + +#ifdef CONFIG_IP_MROUTE_COMMON +/* These actually return 'struct mr_mfc *', but to avoid need for explicit + * castings they simply return void. + */ +void *mr_mfc_seq_idx(struct net *net, + struct mr_mfc_iter *it, loff_t pos); +void *mr_mfc_seq_next(struct seq_file *seq, void *v, + loff_t *pos); + +static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos, + struct mr_table *mrt, spinlock_t *lock) +{ + struct mr_mfc_iter *it = seq->private; + + it->mrt = mrt; + it->cache = NULL; + it->lock = lock; + + return *pos ? mr_mfc_seq_idx(seq_file_net(seq), + seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + +static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) +{ + struct mr_mfc_iter *it = seq->private; + struct mr_table *mrt = it->mrt; + + if (it->cache == &mrt->mfc_unres_queue) + spin_unlock_bh(it->lock); + else if (it->cache == &mrt->mfc_cache_list) + rcu_read_unlock(); +} +#else +static inline void *mr_mfc_seq_idx(struct net *net, + struct mr_mfc_iter *it, loff_t pos) +{ + return NULL; +} + +static inline void *mr_mfc_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + return NULL; +} + +static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos, + struct mr_table *mrt, spinlock_t *lock) +{ + return NULL; +} + +static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) +{ +} +#endif +#endif #endif diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 00898c319cc5..1eb19d9b1be0 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3014,41 +3014,8 @@ static const struct file_operations ipmr_vif_fops = { .release = seq_release_net, }; -struct ipmr_mfc_iter { - struct seq_net_private p; - struct mr_table *mrt; - struct list_head *cache; -}; - -static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, - struct ipmr_mfc_iter *it, loff_t pos) -{ - struct mr_table *mrt = it->mrt; - struct mr_mfc *mfc; - - rcu_read_lock(); - it->cache = &mrt->mfc_cache_list; - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) - if (pos-- == 0) - return (struct mfc_cache *)mfc; - rcu_read_unlock(); - - spin_lock_bh(&mfc_unres_lock); - it->cache = &mrt->mfc_unres_queue; - list_for_each_entry(mfc, it->cache, list) - if (pos-- == 0) - return (struct mfc_cache *)mfc; - - spin_unlock_bh(&mfc_unres_lock); - - it->cache = NULL; - return NULL; -} - - static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { - struct ipmr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; @@ -3056,57 +3023,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) if (!mrt) return ERR_PTR(-ENOENT); - it->mrt = mrt; - it->cache = NULL; - return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) - : SEQ_START_TOKEN; -} - -static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ipmr_mfc_iter *it = seq->private; - struct net *net = seq_file_net(seq); - struct mr_table *mrt = it->mrt; - struct mfc_cache *mfc = v; - - ++*pos; - - if (v == SEQ_START_TOKEN) - return ipmr_mfc_seq_idx(net, seq->private, 0); - - if (mfc->_c.list.next != it->cache) - return (struct mfc_cache *)(list_entry(mfc->_c.list.next, - struct mr_mfc, list)); - - if (it->cache == &mrt->mfc_unres_queue) - goto end_of_list; - - /* exhausted cache_array, show unresolved */ - rcu_read_unlock(); - it->cache = &mrt->mfc_unres_queue; - - spin_lock_bh(&mfc_unres_lock); - if (!list_empty(it->cache)) - return (struct mfc_cache *)(list_first_entry(it->cache, - struct mr_mfc, - list)); - -end_of_list: - spin_unlock_bh(&mfc_unres_lock); - it->cache = NULL; - - return NULL; -} - -static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) -{ - struct ipmr_mfc_iter *it = seq->private; - struct mr_table *mrt = it->mrt; - - if (it->cache == &mrt->mfc_unres_queue) - spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == &mrt->mfc_cache_list) - rcu_read_unlock(); + return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) @@ -3118,7 +3035,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) "Group Origin Iif Pkts Bytes Wrong Oifs\n"); } else { const struct mfc_cache *mfc = v; - const struct ipmr_mfc_iter *it = seq->private; + const struct mr_mfc_iter *it = seq->private; const struct mr_table *mrt = it->mrt; seq_printf(seq, "%08X %08X %-3hd", @@ -3152,15 +3069,15 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, - .next = ipmr_mfc_seq_next, - .stop = ipmr_mfc_seq_stop, + .next = mr_mfc_seq_next, + .stop = mr_mfc_seq_stop, .show = ipmr_mfc_seq_show, }; static int ipmr_mfc_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &ipmr_mfc_seq_ops, - sizeof(struct ipmr_mfc_iter)); + sizeof(struct mr_mfc_iter)); } static const struct file_operations ipmr_mfc_fops = { diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 172f92acffef..37ad0a793035 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -103,3 +103,65 @@ void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg) return mr_mfc_find_any_parent(mrt, vifi); } EXPORT_SYMBOL(mr_mfc_find_any); + +#ifdef CONFIG_PROC_FS +void *mr_mfc_seq_idx(struct net *net, + struct mr_mfc_iter *it, loff_t pos) +{ + struct mr_table *mrt = it->mrt; + struct mr_mfc *mfc; + + rcu_read_lock(); + it->cache = &mrt->mfc_cache_list; + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + if (pos-- == 0) + return mfc; + rcu_read_unlock(); + + spin_lock_bh(it->lock); + it->cache = &mrt->mfc_unres_queue; + list_for_each_entry(mfc, it->cache, list) + if (pos-- == 0) + return mfc; + spin_unlock_bh(it->lock); + + it->cache = NULL; + return NULL; +} +EXPORT_SYMBOL(mr_mfc_seq_idx); + +void *mr_mfc_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + struct mr_mfc_iter *it = seq->private; + struct net *net = seq_file_net(seq); + struct mr_table *mrt = it->mrt; + struct mr_mfc *c = v; + + ++*pos; + + if (v == SEQ_START_TOKEN) + return mr_mfc_seq_idx(net, seq->private, 0); + + if (c->list.next != it->cache) + return list_entry(c->list.next, struct mr_mfc, list); + + if (it->cache == &mrt->mfc_unres_queue) + goto end_of_list; + + /* exhausted cache_array, show unresolved */ + rcu_read_unlock(); + it->cache = &mrt->mfc_unres_queue; + + spin_lock_bh(it->lock); + if (!list_empty(it->cache)) + return list_first_entry(it->cache, struct mr_mfc, list); + +end_of_list: + spin_unlock_bh(it->lock); + it->cache = NULL; + + return NULL; +} +EXPORT_SYMBOL(mr_mfc_seq_next); +#endif diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index ea902a952fb6..26315065e7df 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -333,40 +333,8 @@ static void ip6mr_free_table(struct mr_table *mrt) } #ifdef CONFIG_PROC_FS - -struct ipmr_mfc_iter { - struct seq_net_private p; - struct mr_table *mrt; - struct list_head *cache; -}; - - -static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net, - struct ipmr_mfc_iter *it, loff_t pos) -{ - struct mr_table *mrt = it->mrt; - struct mr_mfc *mfc; - - rcu_read_lock(); - it->cache = &mrt->mfc_cache_list; - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) - if (pos-- == 0) - return (struct mfc6_cache *)mfc; - rcu_read_unlock(); - - spin_lock_bh(&mfc_unres_lock); - it->cache = &mrt->mfc_unres_queue; - list_for_each_entry(mfc, it->cache, list) - if (pos-- == 0) - return (struct mfc6_cache *)mfc; - spin_unlock_bh(&mfc_unres_lock); - - it->cache = NULL; - return NULL; -} - -/* - * The /proc interfaces to multicast routing /proc/ip6_mr_cache /proc/ip6_mr_vif +/* The /proc interfaces to multicast routing + * /proc/ip6_mr_cache /proc/ip6_mr_vif */ struct ipmr_vif_iter { @@ -476,7 +444,6 @@ static const struct file_operations ip6mr_vif_fops = { static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { - struct ipmr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; @@ -484,57 +451,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) if (!mrt) return ERR_PTR(-ENOENT); - it->mrt = mrt; - it->cache = NULL; - return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1) - : SEQ_START_TOKEN; -} - -static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct mfc6_cache *mfc = v; - struct ipmr_mfc_iter *it = seq->private; - struct net *net = seq_file_net(seq); - struct mr_table *mrt = it->mrt; - - ++*pos; - - if (v == SEQ_START_TOKEN) - return ipmr_mfc_seq_idx(net, seq->private, 0); - - if (mfc->_c.list.next != it->cache) - return (struct mfc6_cache *)(list_entry(mfc->_c.list.next, - struct mr_mfc, list)); - - if (it->cache == &mrt->mfc_unres_queue) - goto end_of_list; - - /* exhausted cache_array, show unresolved */ - rcu_read_unlock(); - it->cache = &mrt->mfc_unres_queue; - - spin_lock_bh(&mfc_unres_lock); - if (!list_empty(it->cache)) - return (struct mfc6_cache *)(list_first_entry(it->cache, - struct mr_mfc, - list)); - - end_of_list: - spin_unlock_bh(&mfc_unres_lock); - it->cache = NULL; - - return NULL; -} - -static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) -{ - struct ipmr_mfc_iter *it = seq->private; - struct mr_table *mrt = it->mrt; - - if (it->cache == &mrt->mfc_unres_queue) - spin_unlock_bh(&mfc_unres_lock); - else if (it->cache == &mrt->mfc_cache_list) - rcu_read_unlock(); + return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) @@ -548,7 +465,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) "Iif Pkts Bytes Wrong Oifs\n"); } else { const struct mfc6_cache *mfc = v; - const struct ipmr_mfc_iter *it = seq->private; + const struct mr_mfc_iter *it = seq->private; struct mr_table *mrt = it->mrt; seq_printf(seq, "%pI6 %pI6 %-3hd", @@ -581,15 +498,15 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, - .next = ipmr_mfc_seq_next, - .stop = ipmr_mfc_seq_stop, + .next = mr_mfc_seq_next, + .stop = mr_mfc_seq_stop, .show = ipmr_mfc_seq_show, }; static int ipmr_mfc_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &ipmr_mfc_seq_ops, - sizeof(struct ipmr_mfc_iter)); + sizeof(struct mr_mfc_iter)); } static const struct file_operations ip6mr_mfc_fops = { -- cgit v1.2.3-70-g09d2 From 3feda6b46f734704840685a62b645cbe4efb810c Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:37 +0200 Subject: ipmr, ip6mr: Unite vif seq functions Same as previously done with the mfc seq, the logic for the vif seq is refactored to be shared between ipmr and ip6mr. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 33 ++++++++++++++++++++++++++++++ net/ipv4/ipmr.c | 49 +++++--------------------------------------- net/ipv4/ipmr_base.c | 33 ++++++++++++++++++++++++++++++ net/ipv6/ip6mr.c | 50 +++++---------------------------------------- 4 files changed, 76 insertions(+), 89 deletions(-) (limited to 'net') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index a007c5ad0fde..cfaec9bd2d3c 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -206,6 +206,12 @@ static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) } #ifdef CONFIG_PROC_FS +struct mr_vif_iter { + struct seq_net_private p; + struct mr_table *mrt; + int ct; +}; + struct mr_mfc_iter { struct seq_net_private p; struct mr_table *mrt; @@ -216,6 +222,16 @@ struct mr_mfc_iter { }; #ifdef CONFIG_IP_MROUTE_COMMON +void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos); +void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos); + +static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? mr_vif_seq_idx(seq_file_net(seq), + seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + /* These actually return 'struct mr_mfc *', but to avoid need for explicit * castings they simply return void. */ @@ -249,6 +265,23 @@ static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock(); } #else +static inline void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, + loff_t pos) +{ + return NULL; +} + +static inline void *mr_vif_seq_next(struct seq_file *seq, + void *v, loff_t *pos) +{ + return NULL; +} + +static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos) +{ + return NULL; +} + static inline void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos) { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 1eb19d9b1be0..f5ff54297824 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2908,31 +2908,11 @@ out: /* The /proc interfaces to multicast routing : * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif */ -struct ipmr_vif_iter { - struct seq_net_private p; - struct mr_table *mrt; - int ct; -}; - -static struct vif_device *ipmr_vif_seq_idx(struct net *net, - struct ipmr_vif_iter *iter, - loff_t pos) -{ - struct mr_table *mrt = iter->mrt; - - for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { - if (!VIF_EXISTS(mrt, iter->ct)) - continue; - if (pos-- == 0) - return &mrt->vif_table[iter->ct]; - } - return NULL; -} static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(mrt_lock) { - struct ipmr_vif_iter *iter = seq->private; + struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; @@ -2943,26 +2923,7 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) iter->mrt = mrt; read_lock(&mrt_lock); - return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1) - : SEQ_START_TOKEN; -} - -static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ipmr_vif_iter *iter = seq->private; - struct net *net = seq_file_net(seq); - struct mr_table *mrt = iter->mrt; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ipmr_vif_seq_idx(net, iter, 0); - - while (++iter->ct < mrt->maxvif) { - if (!VIF_EXISTS(mrt, iter->ct)) - continue; - return &mrt->vif_table[iter->ct]; - } - return NULL; + return mr_vif_seq_start(seq, pos); } static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) @@ -2973,7 +2934,7 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) static int ipmr_vif_seq_show(struct seq_file *seq, void *v) { - struct ipmr_vif_iter *iter = seq->private; + struct mr_vif_iter *iter = seq->private; struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { @@ -2996,7 +2957,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v) static const struct seq_operations ipmr_vif_seq_ops = { .start = ipmr_vif_seq_start, - .next = ipmr_vif_seq_next, + .next = mr_vif_seq_next, .stop = ipmr_vif_seq_stop, .show = ipmr_vif_seq_show, }; @@ -3004,7 +2965,7 @@ static const struct seq_operations ipmr_vif_seq_ops = { static int ipmr_vif_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &ipmr_vif_seq_ops, - sizeof(struct ipmr_vif_iter)); + sizeof(struct mr_vif_iter)); } static const struct file_operations ipmr_vif_fops = { diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 37ad0a793035..e1b7b639e9b1 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -105,6 +105,39 @@ void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg) EXPORT_SYMBOL(mr_mfc_find_any); #ifdef CONFIG_PROC_FS +void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos) +{ + struct mr_table *mrt = iter->mrt; + + for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { + if (!VIF_EXISTS(mrt, iter->ct)) + continue; + if (pos-- == 0) + return &mrt->vif_table[iter->ct]; + } + return NULL; +} +EXPORT_SYMBOL(mr_vif_seq_idx); + +void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct mr_vif_iter *iter = seq->private; + struct net *net = seq_file_net(seq); + struct mr_table *mrt = iter->mrt; + + ++*pos; + if (v == SEQ_START_TOKEN) + return mr_vif_seq_idx(net, iter, 0); + + while (++iter->ct < mrt->maxvif) { + if (!VIF_EXISTS(mrt, iter->ct)) + continue; + return &mrt->vif_table[iter->ct]; + } + return NULL; +} +EXPORT_SYMBOL(mr_vif_seq_next); + void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos) { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 26315065e7df..ddd9e6bba499 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -337,31 +337,10 @@ static void ip6mr_free_table(struct mr_table *mrt) * /proc/ip6_mr_cache /proc/ip6_mr_vif */ -struct ipmr_vif_iter { - struct seq_net_private p; - struct mr_table *mrt; - int ct; -}; - -static struct vif_device *ip6mr_vif_seq_idx(struct net *net, - struct ipmr_vif_iter *iter, - loff_t pos) -{ - struct mr_table *mrt = iter->mrt; - - for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { - if (!VIF_EXISTS(mrt, iter->ct)) - continue; - if (pos-- == 0) - return &mrt->vif_table[iter->ct]; - } - return NULL; -} - static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(mrt_lock) { - struct ipmr_vif_iter *iter = seq->private; + struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; @@ -372,26 +351,7 @@ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) iter->mrt = mrt; read_lock(&mrt_lock); - return *pos ? ip6mr_vif_seq_idx(net, seq->private, *pos - 1) - : SEQ_START_TOKEN; -} - -static void *ip6mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ipmr_vif_iter *iter = seq->private; - struct net *net = seq_file_net(seq); - struct mr_table *mrt = iter->mrt; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip6mr_vif_seq_idx(net, iter, 0); - - while (++iter->ct < mrt->maxvif) { - if (!VIF_EXISTS(mrt, iter->ct)) - continue; - return &mrt->vif_table[iter->ct]; - } - return NULL; + return mr_vif_seq_start(seq, pos); } static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) @@ -402,7 +362,7 @@ static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) { - struct ipmr_vif_iter *iter = seq->private; + struct mr_vif_iter *iter = seq->private; struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { @@ -424,7 +384,7 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) static const struct seq_operations ip6mr_vif_seq_ops = { .start = ip6mr_vif_seq_start, - .next = ip6mr_vif_seq_next, + .next = mr_vif_seq_next, .stop = ip6mr_vif_seq_stop, .show = ip6mr_vif_seq_show, }; @@ -432,7 +392,7 @@ static const struct seq_operations ip6mr_vif_seq_ops = { static int ip6mr_vif_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &ip6mr_vif_seq_ops, - sizeof(struct ipmr_vif_iter)); + sizeof(struct mr_vif_iter)); } static const struct file_operations ip6mr_vif_fops = { -- cgit v1.2.3-70-g09d2 From 889cd83cbe411dda854429f3223ab2d31a860a4a Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:38 +0200 Subject: ip6mr: Remove MFC_NOTIFY and refactor flags MFC_NOTIFY exists in ip6mr, probably as some legacy code [was already removed for ipmr in commit 06bd6c0370bb ("net: ipmr: remove unused MFC_NOTIFY flag and make the flags enum"). Remove it from ip6mr as well, and move the enum into a common file; Notice MFC_OFFLOAD is currently only used by ipmr. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 9 --------- include/linux/mroute6.h | 3 --- include/linux/mroute_base.h | 9 +++++++++ net/ipv6/ip6mr.c | 3 --- 4 files changed, 9 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 63b36e6c72a0..7ed82e4f11b3 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -65,15 +65,6 @@ struct vif_entry_notifier_info { #define VIFF_STATIC 0x8000 -/* mfc_flags: - * MFC_STATIC - the entry was added statically (not by a routing daemon) - * MFC_OFFLOAD - the entry was offloaded to the hardware - */ -enum { - MFC_STATIC = BIT(0), - MFC_OFFLOAD = BIT(1), -}; - struct mfc_cache_cmp_arg { __be32 mfc_mcastgrp; __be32 mfc_origin; diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index 6acf576fc135..1ac38e6819f5 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -81,9 +81,6 @@ struct mfc6_cache { }; }; -#define MFC_STATIC 1 -#define MFC_NOTIFY 2 - #define MFC_ASSERT_THRESH (3*HZ) /* Maximal freq. of asserts */ struct rtmsg; diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index cfaec9bd2d3c..f40202b16dae 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -45,6 +45,15 @@ struct vif_device { #define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev)) +/* mfc_flags: + * MFC_STATIC - the entry was added statically (not by a routing daemon) + * MFC_OFFLOAD - the entry was offloaded to the hardware + */ +enum { + MFC_STATIC = BIT(0), + MFC_OFFLOAD = BIT(1), +}; + /** * struct mr_mfc - common multicast routing entries * @mnode: rhashtable list diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index ddd9e6bba499..c3b3f1c381e1 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2203,9 +2203,6 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, return err; } - if (rtm->rtm_flags & RTM_F_NOTIFY) - cache->_c.mfc_flags |= MFC_NOTIFY; - err = __ip6mr_fill_mroute(mrt, skb, cache, rtm); read_unlock(&mrt_lock); return err; -- cgit v1.2.3-70-g09d2 From 7b0db85737db3f4d76b2a412e4f19eae59b8b494 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:39 +0200 Subject: ipmr, ip6mr: Unite dumproute flows The various MFC entries are being held in the same kind of mr_tables for both ipmr and ip6mr, and their traversal logic is identical. Also, with the exception of the addresses [and other small tidbits] the major bulk of the nla setting is identical. Unite as much of the dumping as possible between the two. Notice this requires creating an mr_table iterator for each, as the for-each preprocessor macro can't be used by the common logic. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 29 ++++++++ net/ipv4/ipmr.c | 161 +++++++++++--------------------------------- net/ipv4/ipmr_base.c | 123 +++++++++++++++++++++++++++++++++ net/ipv6/ip6mr.c | 156 +++++++++++------------------------------- 4 files changed, 230 insertions(+), 239 deletions(-) (limited to 'net') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index f40202b16dae..c2560cb50f1d 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -170,6 +170,16 @@ void *mr_mfc_find_parent(struct mr_table *mrt, void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi); void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg); +int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + struct mr_mfc *c, struct rtmsg *rtm); +int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, + struct mr_table *(*iter)(struct net *net, + struct mr_table *mrt), + int (*fill)(struct mr_table *mrt, + struct sk_buff *skb, + u32 portid, u32 seq, struct mr_mfc *c, + int cmd, int flags), + spinlock_t *lock); #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, @@ -207,6 +217,25 @@ static inline struct mr_mfc *mr_mfc_find_any(struct mr_table *mrt, { return NULL; } + +static inline int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + struct mr_mfc *c, struct rtmsg *rtm) +{ + return -EINVAL; +} + +static inline int +mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, + struct mr_table *(*iter)(struct net *net, + struct mr_table *mrt), + int (*fill)(struct mr_table *mrt, + struct sk_buff *skb, + u32 portid, u32 seq, struct mr_mfc *c, + int cmd, int flags), + spinlock_t *lock) +{ + return -EINVAL; +} #endif static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index f5ff54297824..d752a70855d8 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -105,8 +105,6 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, struct mfc_cache *cache, int local); static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); -static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, - struct mr_mfc *c, struct rtmsg *rtm); static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); @@ -117,6 +115,23 @@ static void ipmr_expire_process(struct timer_list *t); #define ipmr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list) +static struct mr_table *ipmr_mr_table_iter(struct net *net, + struct mr_table *mrt) +{ + struct mr_table *ret; + + if (!mrt) + ret = list_entry_rcu(net->ipv4.mr_tables.next, + struct mr_table, list); + else + ret = list_entry_rcu(mrt->list.next, + struct mr_table, list); + + if (&ret->list == &net->ipv4.mr_tables) + return NULL; + return ret; +} + static struct mr_table *ipmr_get_table(struct net *net, u32 id) { struct mr_table *mrt; @@ -284,6 +299,14 @@ EXPORT_SYMBOL(ipmr_rule_default); #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) +static struct mr_table *ipmr_mr_table_iter(struct net *net, + struct mr_table *mrt) +{ + if (!mrt) + return net->ipv4.mrt; + return NULL; +} + static struct mr_table *ipmr_get_table(struct net *net, u32 id) { return net->ipv4.mrt; @@ -1051,8 +1074,8 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct iphdr)); - if (__ipmr_fill_mroute(mrt, skb, &c->_c, - nlmsg_data(nlh)) > 0) { + if (mr_fill_mroute(mrt, skb, &c->_c, + nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { @@ -2256,66 +2279,6 @@ drop: } #endif -static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, - struct mr_mfc *c, struct rtmsg *rtm) -{ - struct rta_mfc_stats mfcs; - struct nlattr *mp_attr; - struct rtnexthop *nhp; - unsigned long lastuse; - int ct; - - /* If cache is unresolved, don't try to parse IIF and OIF */ - if (c->mfc_parent >= MAXVIFS) { - rtm->rtm_flags |= RTNH_F_UNRESOLVED; - return -ENOENT; - } - - if (VIF_EXISTS(mrt, c->mfc_parent) && - nla_put_u32(skb, RTA_IIF, - mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) - return -EMSGSIZE; - - if (c->mfc_flags & MFC_OFFLOAD) - rtm->rtm_flags |= RTNH_F_OFFLOAD; - - if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH))) - return -EMSGSIZE; - - for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { - if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { - struct vif_device *vif; - - if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) { - nla_nest_cancel(skb, mp_attr); - return -EMSGSIZE; - } - - nhp->rtnh_flags = 0; - nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; - vif = &mrt->vif_table[ct]; - nhp->rtnh_ifindex = vif->dev->ifindex; - nhp->rtnh_len = sizeof(*nhp); - } - } - - nla_nest_end(skb, mp_attr); - - lastuse = READ_ONCE(c->mfc_un.res.lastuse); - lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; - - mfcs.mfcs_packets = c->mfc_un.res.pkt; - mfcs.mfcs_bytes = c->mfc_un.res.bytes; - mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; - if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || - nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), - RTA_PAD)) - return -EMSGSIZE; - - rtm->rtm_type = RTN_MULTICAST; - return 1; -} - int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid) @@ -2373,7 +2336,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb, } read_lock(&mrt_lock); - err = __ipmr_fill_mroute(mrt, skb, &cache->_c, rtm); + err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); read_unlock(&mrt_lock); rcu_read_unlock(); return err; @@ -2410,7 +2373,7 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) || nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp)) goto nla_put_failure; - err = __ipmr_fill_mroute(mrt, skb, &c->_c, rtm); + err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; @@ -2423,6 +2386,14 @@ nla_put_failure: return -EMSGSIZE; } +static int _ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + u32 portid, u32 seq, struct mr_mfc *c, int cmd, + int flags) +{ + return ipmr_fill_mroute(mrt, skb, portid, seq, (struct mfc_cache *)c, + cmd, flags); +} + static size_t mroute_msgsize(bool unresolved, int maxvif) { size_t len = @@ -2596,62 +2567,8 @@ errout_free: static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); - unsigned int t = 0, s_t; - unsigned int e = 0, s_e; - struct mr_table *mrt; - struct mr_mfc *mfc; - - s_t = cb->args[0]; - s_e = cb->args[1]; - - rcu_read_lock(); - ipmr_for_each_table(mrt, net) { - if (t < s_t) - goto next_table; - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { - if (e < s_e) - goto next_entry; - if (ipmr_fill_mroute(mrt, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - (struct mfc_cache *)mfc, - RTM_NEWROUTE, NLM_F_MULTI) < 0) - goto done; -next_entry: - e++; - } - e = 0; - s_e = 0; - - spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { - if (e < s_e) - goto next_entry2; - if (ipmr_fill_mroute(mrt, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - (struct mfc_cache *)mfc, - RTM_NEWROUTE, NLM_F_MULTI) < 0) { - spin_unlock_bh(&mfc_unres_lock); - goto done; - } -next_entry2: - e++; - } - spin_unlock_bh(&mfc_unres_lock); - e = 0; - s_e = 0; -next_table: - t++; - } -done: - rcu_read_unlock(); - - cb->args[1] = e; - cb->args[0] = t; - - return skb->len; + return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, + _ipmr_fill_mroute, &mfc_unres_lock); } static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = { diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index e1b7b639e9b1..8ba55bfda817 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -198,3 +198,126 @@ end_of_list: } EXPORT_SYMBOL(mr_mfc_seq_next); #endif + +int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + struct mr_mfc *c, struct rtmsg *rtm) +{ + struct rta_mfc_stats mfcs; + struct nlattr *mp_attr; + struct rtnexthop *nhp; + unsigned long lastuse; + int ct; + + /* If cache is unresolved, don't try to parse IIF and OIF */ + if (c->mfc_parent >= MAXVIFS) { + rtm->rtm_flags |= RTNH_F_UNRESOLVED; + return -ENOENT; + } + + if (VIF_EXISTS(mrt, c->mfc_parent) && + nla_put_u32(skb, RTA_IIF, + mrt->vif_table[c->mfc_parent].dev->ifindex) < 0) + return -EMSGSIZE; + + if (c->mfc_flags & MFC_OFFLOAD) + rtm->rtm_flags |= RTNH_F_OFFLOAD; + + mp_attr = nla_nest_start(skb, RTA_MULTIPATH); + if (!mp_attr) + return -EMSGSIZE; + + for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { + if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) { + struct vif_device *vif; + + nhp = nla_reserve_nohdr(skb, sizeof(*nhp)); + if (!nhp) { + nla_nest_cancel(skb, mp_attr); + return -EMSGSIZE; + } + + nhp->rtnh_flags = 0; + nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; + vif = &mrt->vif_table[ct]; + nhp->rtnh_ifindex = vif->dev->ifindex; + nhp->rtnh_len = sizeof(*nhp); + } + } + + nla_nest_end(skb, mp_attr); + + lastuse = READ_ONCE(c->mfc_un.res.lastuse); + lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; + + mfcs.mfcs_packets = c->mfc_un.res.pkt; + mfcs.mfcs_bytes = c->mfc_un.res.bytes; + mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; + if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || + nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), + RTA_PAD)) + return -EMSGSIZE; + + rtm->rtm_type = RTN_MULTICAST; + return 1; +} +EXPORT_SYMBOL(mr_fill_mroute); + +int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, + struct mr_table *(*iter)(struct net *net, + struct mr_table *mrt), + int (*fill)(struct mr_table *mrt, + struct sk_buff *skb, + u32 portid, u32 seq, struct mr_mfc *c, + int cmd, int flags), + spinlock_t *lock) +{ + unsigned int t = 0, e = 0, s_t = cb->args[0], s_e = cb->args[1]; + struct net *net = sock_net(skb->sk); + struct mr_table *mrt; + struct mr_mfc *mfc; + + rcu_read_lock(); + for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) { + if (t < s_t) + goto next_table; + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { + if (e < s_e) + goto next_entry; + if (fill(mrt, skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, mfc, + RTM_NEWROUTE, NLM_F_MULTI) < 0) + goto done; +next_entry: + e++; + } + e = 0; + s_e = 0; + + spin_lock_bh(lock); + list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { + if (e < s_e) + goto next_entry2; + if (fill(mrt, skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, mfc, + RTM_NEWROUTE, NLM_F_MULTI) < 0) { + spin_unlock_bh(lock); + goto done; + } +next_entry2: + e++; + } + spin_unlock_bh(lock); + e = 0; + s_e = 0; +next_table: + t++; + } +done: + rcu_read_unlock(); + + cb->args[1] = e; + cb->args[0] = t; + + return skb->len; +} +EXPORT_SYMBOL(mr_rtm_dumproute); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index c3b3f1c381e1..2a38f9de45d3 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -87,8 +87,6 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc6_cache *cache); static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); -static int __ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, - struct mfc6_cache *c, struct rtmsg *rtm); static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd); static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); @@ -101,6 +99,23 @@ static void ipmr_expire_process(struct timer_list *t); #define ip6mr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list) +static struct mr_table *ip6mr_mr_table_iter(struct net *net, + struct mr_table *mrt) +{ + struct mr_table *ret; + + if (!mrt) + ret = list_entry_rcu(net->ipv6.mr6_tables.next, + struct mr_table, list); + else + ret = list_entry_rcu(mrt->list.next, + struct mr_table, list); + + if (&ret->list == &net->ipv6.mr6_tables) + return NULL; + return ret; +} + static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { struct mr_table *mrt; @@ -247,6 +262,14 @@ static void __net_exit ip6mr_rules_exit(struct net *net) #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) +static struct mr_table *ip6mr_mr_table_iter(struct net *net, + struct mr_table *mrt) +{ + if (!mrt) + return net->ipv6.mrt6; + return NULL; +} + static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { return net->ipv6.mrt6; @@ -948,7 +971,8 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); - if (__ip6mr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) { + if (mr_fill_mroute(mrt, skb, &c->_c, + nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; @@ -2081,63 +2105,6 @@ int ip6_mr_input(struct sk_buff *skb) return 0; } - -static int __ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, - struct mfc6_cache *c, struct rtmsg *rtm) -{ - struct rta_mfc_stats mfcs; - struct nlattr *mp_attr; - struct rtnexthop *nhp; - unsigned long lastuse; - int ct; - - /* If cache is unresolved, don't try to parse IIF and OIF */ - if (c->_c.mfc_parent >= MAXMIFS) { - rtm->rtm_flags |= RTNH_F_UNRESOLVED; - return -ENOENT; - } - - if (VIF_EXISTS(mrt, c->_c.mfc_parent) && - nla_put_u32(skb, RTA_IIF, - mrt->vif_table[c->_c.mfc_parent].dev->ifindex) < 0) - return -EMSGSIZE; - mp_attr = nla_nest_start(skb, RTA_MULTIPATH); - if (!mp_attr) - return -EMSGSIZE; - - for (ct = c->_c.mfc_un.res.minvif; - ct < c->_c.mfc_un.res.maxvif; ct++) { - if (VIF_EXISTS(mrt, ct) && c->_c.mfc_un.res.ttls[ct] < 255) { - nhp = nla_reserve_nohdr(skb, sizeof(*nhp)); - if (!nhp) { - nla_nest_cancel(skb, mp_attr); - return -EMSGSIZE; - } - - nhp->rtnh_flags = 0; - nhp->rtnh_hops = c->_c.mfc_un.res.ttls[ct]; - nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex; - nhp->rtnh_len = sizeof(*nhp); - } - } - - nla_nest_end(skb, mp_attr); - - lastuse = READ_ONCE(c->_c.mfc_un.res.lastuse); - lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; - - mfcs.mfcs_packets = c->_c.mfc_un.res.pkt; - mfcs.mfcs_bytes = c->_c.mfc_un.res.bytes; - mfcs.mfcs_wrong_if = c->_c.mfc_un.res.wrong_if; - if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || - nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), - RTA_PAD)) - return -EMSGSIZE; - - rtm->rtm_type = RTN_MULTICAST; - return 1; -} - int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid) { @@ -2203,7 +2170,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, return err; } - err = __ip6mr_fill_mroute(mrt, skb, cache, rtm); + err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); read_unlock(&mrt_lock); return err; } @@ -2239,7 +2206,7 @@ static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, if (nla_put_in6_addr(skb, RTA_SRC, &c->mf6c_origin) || nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp)) goto nla_put_failure; - err = __ip6mr_fill_mroute(mrt, skb, c, rtm); + err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; @@ -2252,6 +2219,14 @@ nla_put_failure: return -EMSGSIZE; } +static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + u32 portid, u32 seq, struct mr_mfc *c, + int cmd, int flags) +{ + return ip6mr_fill_mroute(mrt, skb, portid, seq, (struct mfc6_cache *)c, + cmd, flags); +} + static int mr6_msgsize(bool unresolved, int maxvif) { size_t len = @@ -2365,59 +2340,6 @@ errout: static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { - struct net *net = sock_net(skb->sk); - unsigned int t = 0, s_t; - unsigned int e = 0, s_e; - struct mr_table *mrt; - struct mr_mfc *mfc; - - s_t = cb->args[0]; - s_e = cb->args[1]; - - rcu_read_lock(); - ip6mr_for_each_table(mrt, net) { - if (t < s_t) - goto next_table; - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { - if (e < s_e) - goto next_entry; - if (ip6mr_fill_mroute(mrt, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - (struct mfc6_cache *)mfc, - RTM_NEWROUTE, NLM_F_MULTI) < 0) - goto done; -next_entry: - e++; - } - e = 0; - s_e = 0; - - spin_lock_bh(&mfc_unres_lock); - list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { - if (e < s_e) - goto next_entry2; - if (ip6mr_fill_mroute(mrt, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - (struct mfc6_cache *)mfc, - RTM_NEWROUTE, NLM_F_MULTI) < 0) { - spin_unlock_bh(&mfc_unres_lock); - goto done; - } -next_entry2: - e++; - } - spin_unlock_bh(&mfc_unres_lock); - e = s_e = 0; -next_table: - t++; - } -done: - rcu_read_unlock(); - - cb->args[1] = e; - cb->args[0] = t; - - return skb->len; + return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter, + _ip6mr_fill_mroute, &mfc_unres_lock); } -- cgit v1.2.3-70-g09d2 From 0f6271264afd975bc599d6f30f3693e9aea57036 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Thu, 1 Mar 2018 13:51:26 +0100 Subject: net/smc: cleanup smc_llc.h and smc_clc.h headers Remove structures used internal only from headers. And remove an extra function parameter. Signed-off-by: Stefan Raspl Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 8 ++++---- net/smc/smc_clc.c | 3 +++ net/smc/smc_clc.h | 6 ------ net/smc/smc_llc.c | 30 ++++++++++++++++++++++++++++++ net/smc/smc_llc.h | 28 ---------------------------- 5 files changed, 37 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 38ae22b65e77..b1961a789837 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -312,7 +312,7 @@ out: return rc; } -static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid) +static int smc_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link_group *lgr = smc->conn.lgr; struct smc_link *link; @@ -346,7 +346,8 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid) /* send CONFIRM LINK response over RoCE fabric */ rc = smc_llc_send_confirm_link(link, link->smcibdev->mac[link->ibport - 1], - gid, SMC_LLC_RESP); + &link->smcibdev->gid[link->ibport - 1], + SMC_LLC_RESP); if (rc < 0) return SMC_CLC_DECL_TCL; @@ -498,8 +499,7 @@ static int smc_connect_rdma(struct smc_sock *smc) if (local_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ - reason_code = smc_clnt_conf_first_link( - smc, &smcibdev->gid[ibport - 1]); + reason_code = smc_clnt_conf_first_link(smc); if (reason_code < 0) { rc = reason_code; goto out_err_unlock; diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 8ac51583a063..dff318a2d5bf 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -22,6 +22,9 @@ #include "smc_clc.h" #include "smc_ib.h" +/* eye catcher "SMCR" EBCDIC for CLC messages */ +static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; + /* check if received message has a correct header length and contains valid * heading and trailing eyecatchers */ diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index c145a0f36a68..aab3aae2a2ce 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -22,9 +22,6 @@ #define SMC_CLC_CONFIRM 0x03 #define SMC_CLC_DECLINE 0x04 -/* eye catcher "SMCR" EBCDIC for CLC messages */ -static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; - #define SMC_CLC_V1 0x1 /* SMC version */ #define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */ #define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */ @@ -124,9 +121,6 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); } -struct smc_sock; -struct smc_ib_device; - int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 92fe4cc8c82c..e4502bbff33d 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -21,6 +21,36 @@ #include "smc_clc.h" #include "smc_llc.h" +#define SMC_LLC_DATA_LEN 40 + +struct smc_llc_hdr { + struct smc_wr_rx_hdr common; + u8 length; /* 44 */ + u8 reserved; + u8 flags; +}; + +struct smc_llc_msg_confirm_link { /* type 0x01 */ + struct smc_llc_hdr hd; + u8 sender_mac[ETH_ALEN]; + u8 sender_gid[SMC_GID_SIZE]; + u8 sender_qp_num[3]; + u8 link_num; + u8 link_uid[SMC_LGR_ID_SIZE]; + u8 max_links; + u8 reserved[9]; +}; + +union smc_llc_msg { + struct smc_llc_msg_confirm_link confirm_link; + struct { + struct smc_llc_hdr hdr; + u8 data[SMC_LLC_DATA_LEN]; + } raw; +}; + +#define SMC_LLC_FLAG_RESP 0x80 + /********************************** send *************************************/ struct smc_llc_tx_pend { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 51b27ce90dbd..a7888607ab53 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -28,34 +28,6 @@ enum smc_llc_msg_type { SMC_LLC_CONFIRM_LINK = 0x01, }; -#define SMC_LLC_DATA_LEN 40 - -struct smc_llc_hdr { - struct smc_wr_rx_hdr common; - u8 length; /* 44 */ - u8 reserved; - u8 flags; -}; - -struct smc_llc_msg_confirm_link { /* type 0x01 */ - struct smc_llc_hdr hd; - u8 sender_mac[ETH_ALEN]; - u8 sender_gid[SMC_GID_SIZE]; - u8 sender_qp_num[3]; - u8 link_num; - u8 link_uid[SMC_LGR_ID_SIZE]; - u8 max_links; - u8 reserved[9]; -}; - -union smc_llc_msg { - struct smc_llc_msg_confirm_link confirm_link; - struct { - struct smc_llc_hdr hdr; - u8 data[SMC_LLC_DATA_LEN]; - } raw; -}; - /* transmit */ int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid, enum smc_llc_reqresp reqresp); -- cgit v1.2.3-70-g09d2 From 696cd3016975d31e3499c49a7a747d7615a16b3b Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 1 Mar 2018 13:51:27 +0100 Subject: net/smc: move netinfo function to file smc_clc.c The function smc_netinfo_by_tcpsk() belongs to CLC handling. Move it to smc_clc.c and rename to smc_clc_netinfo_by_tcpsk. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 42 +----------------------------------------- net/smc/smc.h | 2 -- net/smc/smc_clc.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- net/smc/smc_clc.h | 2 ++ 4 files changed, 45 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b1961a789837..b90cbfdb9916 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -24,7 +24,6 @@ #include #include -#include #include #include #include @@ -273,45 +272,6 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } -/* determine subnet and mask of internal TCP socket */ -int smc_netinfo_by_tcpsk(struct socket *clcsock, - __be32 *subnet, u8 *prefix_len) -{ - struct dst_entry *dst = sk_dst_get(clcsock->sk); - struct in_device *in_dev; - struct sockaddr_in addr; - int rc = -ENOENT; - - if (!dst) { - rc = -ENOTCONN; - goto out; - } - if (!dst->dev) { - rc = -ENODEV; - goto out_rel; - } - - /* get address to which the internal TCP socket is bound */ - kernel_getsockname(clcsock, (struct sockaddr *)&addr); - /* analyze IPv4 specific data of net_device belonging to TCP socket */ - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dst->dev); - for_ifa(in_dev) { - if (!inet_ifa_match(addr.sin_addr.s_addr, ifa)) - continue; - *prefix_len = inet_mask_len(ifa->ifa_mask); - *subnet = ifa->ifa_address & ifa->ifa_mask; - rc = 0; - break; - } endfor_ifa(in_dev); - rcu_read_unlock(); - -out_rel: - dst_release(dst); -out: - return rc; -} - static int smc_clnt_conf_first_link(struct smc_sock *smc) { struct smc_link_group *lgr = smc->conn.lgr; @@ -808,7 +768,7 @@ static void smc_listen_work(struct work_struct *work) } /* determine subnet and mask from internal TCP socket */ - rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len); + rc = smc_clc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len); if (rc) { reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ goto decline_rdma; diff --git a/net/smc/smc.h b/net/smc/smc.h index 9518986c97b1..9895c190d146 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -263,8 +263,6 @@ static inline bool using_ipsec(struct smc_sock *smc) struct smc_clc_msg_local; -int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet, - u8 *prefix_len); void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr, struct smc_ib_device *smcibdev, u8 ibport, diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index dff318a2d5bf..874c5a75d6dd 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -11,6 +11,7 @@ */ #include +#include #include #include @@ -73,6 +74,45 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) return true; } +/* determine subnet and mask of internal TCP socket */ +int smc_clc_netinfo_by_tcpsk(struct socket *clcsock, + __be32 *subnet, u8 *prefix_len) +{ + struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct in_device *in_dev; + struct sockaddr_in addr; + int rc = -ENOENT; + + if (!dst) { + rc = -ENOTCONN; + goto out; + } + if (!dst->dev) { + rc = -ENODEV; + goto out_rel; + } + + /* get address to which the internal TCP socket is bound */ + kernel_getsockname(clcsock, (struct sockaddr *)&addr); + /* analyze IPv4 specific data of net_device belonging to TCP socket */ + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dst->dev); + for_ifa(in_dev) { + if (!inet_ifa_match(addr.sin_addr.s_addr, ifa)) + continue; + *prefix_len = inet_mask_len(ifa->ifa_mask); + *subnet = ifa->ifa_address & ifa->ifa_mask; + rc = 0; + break; + } endfor_ifa(in_dev); + rcu_read_unlock(); + +out_rel: + dst_release(dst); +out: + return rc; +} + /* Wait for data on the tcp-socket, analyze received data * Returns: * 0 if success and it was not a decline that we received. @@ -214,8 +254,8 @@ int smc_clc_send_proposal(struct smc_sock *smc, memset(&pclc_prfx, 0, sizeof(pclc_prfx)); /* determine subnet and mask from internal TCP socket */ - rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet, - &pclc_prfx.prefix_len); + rc = smc_clc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet, + &pclc_prfx.prefix_len); if (rc) return SMC_CLC_DECL_CNFERR; /* configuration error */ pclc_prfx.ipv6_prefixes_cnt = 0; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index aab3aae2a2ce..a20fc75efb24 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -121,6 +121,8 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); } +int smc_clc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet, + u8 *prefix_len); int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); -- cgit v1.2.3-70-g09d2 From be6d467b997f9e32aa9b27add06e7b0c8627a566 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 1 Mar 2018 13:51:28 +0100 Subject: net/smc: remove unused fields from smc structures The daddr field holds the destination IPv4 address. The field was set but never used and can be removed. The addr field was a left-over from an earlier version of non-blocking connects and can be removed. The result of the call to kernel_getpeername is not used, the call can be removed. Non-blocking connects are working, so remove restriction comment. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 15 ++++----------- net/smc/smc.h | 3 +-- net/smc/smc_core.c | 7 +++---- net/smc/smc_core.h | 1 - 4 files changed, 8 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b90cbfdb9916..cda3d5314e3f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -7,7 +7,6 @@ * applicable with RoCE-cards only * * Initial restrictions: - * - non-blocking connect postponed * - IPv6 support postponed * - support for alternate links postponed * - partial support for non-blocking sockets only @@ -345,7 +344,6 @@ static void smc_lgr_forget(struct smc_link_group *lgr) /* setup for RDMA connection of client */ static int smc_connect_rdma(struct smc_sock *smc) { - struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr; struct smc_clc_msg_accept_confirm aclc; int local_contact = SMC_FIRST_CONTACT; struct smc_ib_device *smcibdev; @@ -399,8 +397,8 @@ static int smc_connect_rdma(struct smc_sock *smc) srv_first_contact = aclc.hdr.flag; mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev, - ibport, &aclc.lcl, srv_first_contact); + local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl, + srv_first_contact); if (local_contact < 0) { rc = local_contact; if (rc == -ENOMEM) @@ -518,7 +516,6 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, goto out_err; if (addr->sa_family != AF_INET) goto out_err; - smc->addr = addr; /* needed for nonblocking connect */ lock_sock(sk); switch (sk->sk_state) { @@ -726,7 +723,6 @@ static void smc_listen_work(struct work_struct *work) struct sock *newsmcsk = &new_smc->sk; struct smc_clc_msg_proposal *pclc; struct smc_ib_device *smcibdev; - struct sockaddr_in peeraddr; u8 buf[SMC_CLC_MAX_LEN]; struct smc_link *link; int reason_code = 0; @@ -782,13 +778,10 @@ static void smc_listen_work(struct work_struct *work) goto decline_rdma; } - /* get address of the peer connected to the internal TCP socket */ - kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr); - /* allocate connection / link group */ mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr, - smcibdev, ibport, &pclc->lcl, 0); + local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl, + 0); if (local_contact < 0) { rc = local_contact; if (rc == -ENOMEM) diff --git a/net/smc/smc.h b/net/smc/smc.h index 9895c190d146..268cdf11533c 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -172,7 +172,6 @@ struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ struct smc_connection conn; /* smc connection */ - struct sockaddr *addr; /* inet connect address */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ @@ -264,7 +263,7 @@ static inline bool using_ipsec(struct smc_sock *smc) struct smc_clc_msg_local; void smc_conn_free(struct smc_connection *conn); -int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr, +int smc_conn_create(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport, struct smc_clc_msg_local *lcl, int srv_first_contact); struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 2424c7100aaf..bc11d06e38ae 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -144,7 +144,7 @@ free: } /* create a new SMC link group */ -static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr, +static int smc_lgr_create(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport, char *peer_systemid, unsigned short vlan_id) { @@ -161,7 +161,6 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr, } lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; lgr->sync_err = false; - lgr->daddr = peer_in_addr; memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); lgr->vlan_id = vlan_id; rwlock_init(&lgr->sndbufs_lock); @@ -400,7 +399,7 @@ static int smc_link_determine_gid(struct smc_link_group *lgr) } /* create a new SMC connection (and a new link group if necessary) */ -int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr, +int smc_conn_create(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport, struct smc_clc_msg_local *lcl, int srv_first_contact) { @@ -457,7 +456,7 @@ int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr, create: if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_lgr_create(smc, peer_in_addr, smcibdev, ibport, + rc = smc_lgr_create(smc, smcibdev, ibport, lcl->id_for_peer, vlan_id); if (rc) goto out; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index fe691bf9af91..7852c3fabf12 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -124,7 +124,6 @@ struct smc_rtoken { /* address/key of remote RMB */ struct smc_link_group { struct list_head list; enum smc_lgr_role role; /* client or server */ - __be32 daddr; /* destination ip address */ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ char peer_systemid[SMC_SYSTEMID_LEN]; /* unique system_id of peer */ -- cgit v1.2.3-70-g09d2 From 313164da55da0fb24191e729f989f4b2c2793ead Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 1 Mar 2018 13:51:29 +0100 Subject: net/smc: respond to test link messages Add TEST LINK message responses, which also serves as preparation for support of sockopt TCP_KEEPALIVE. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_llc.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_llc.h | 3 +++ 2 files changed, 56 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index e4502bbff33d..9e0a556e40c8 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -41,8 +41,15 @@ struct smc_llc_msg_confirm_link { /* type 0x01 */ u8 reserved[9]; }; +struct smc_llc_msg_test_link { /* type 0x07 */ + struct smc_llc_hdr hd; + u8 user_data[16]; + u8 reserved[24]; +}; + union smc_llc_msg { struct smc_llc_msg_confirm_link confirm_link; + struct smc_llc_msg_test_link test_link; struct { struct smc_llc_hdr hdr; u8 data[SMC_LLC_DATA_LEN]; @@ -130,6 +137,30 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[], return rc; } +/* send LLC test link request or response */ +int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16], + enum smc_llc_reqresp reqresp) +{ + struct smc_llc_msg_test_link *testllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + testllc = (struct smc_llc_msg_test_link *)wr_buf; + memset(testllc, 0, sizeof(*testllc)); + testllc->hd.common.type = SMC_LLC_TEST_LINK; + testllc->hd.length = sizeof(struct smc_llc_msg_test_link); + if (reqresp == SMC_LLC_RESP) + testllc->hd.flags |= SMC_LLC_FLAG_RESP; + memcpy(testllc->user_data, user_data, sizeof(testllc->user_data)); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + /********************************* receive ***********************************/ static void smc_llc_rx_confirm_link(struct smc_link *link, @@ -149,6 +180,16 @@ static void smc_llc_rx_confirm_link(struct smc_link *link, } } +static void smc_llc_rx_test_link(struct smc_link *link, + struct smc_llc_msg_test_link *llc) +{ + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { + /* unused as long as we don't send this type of msg */ + } else { + smc_llc_send_test_link(link, llc->user_data, SMC_LLC_RESP); + } +} + static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) { struct smc_link *link = (struct smc_link *)wc->qp->qp_context; @@ -158,8 +199,15 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) return; /* short message */ if (llc->raw.hdr.length != sizeof(*llc)) return; /* invalid message */ - if (llc->raw.hdr.common.type == SMC_LLC_CONFIRM_LINK) + + switch (llc->raw.hdr.common.type) { + case SMC_LLC_TEST_LINK: + smc_llc_rx_test_link(link, &llc->test_link); + break; + case SMC_LLC_CONFIRM_LINK: smc_llc_rx_confirm_link(link, &llc->confirm_link); + break; + } } /***************************** init, exit, misc ******************************/ @@ -169,6 +217,10 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_CONFIRM_LINK }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_TEST_LINK + }, { .handler = NULL, } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index a7888607ab53..6c8a062db4f3 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -26,11 +26,14 @@ enum smc_llc_reqresp { enum smc_llc_msg_type { SMC_LLC_CONFIRM_LINK = 0x01, + SMC_LLC_TEST_LINK = 0x07, }; /* transmit */ int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid, enum smc_llc_reqresp reqresp); +int smc_llc_send_test_link(struct smc_link *lnk, u8 user_data[16], + enum smc_llc_reqresp reqresp); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ -- cgit v1.2.3-70-g09d2 From 4ed75de58e9191c011e318dc98b4b157dc633444 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 1 Mar 2018 13:51:30 +0100 Subject: net/smc: process confirm/delete rkey messages Process and respond to CONFIRM RKEY and DELETE RKEY messages. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 48 ++++++++++++++---- net/smc/smc_core.h | 2 + net/smc/smc_llc.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/smc/smc_llc.h | 3 ++ 4 files changed, 186 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index bc11d06e38ae..31bb2d1dbd77 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -697,27 +697,55 @@ static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) return -ENOSPC; } -/* save rkey and dma_addr received from peer during clc handshake */ -int smc_rmb_rtoken_handling(struct smc_connection *conn, - struct smc_clc_msg_accept_confirm *clc) +/* add a new rtoken from peer */ +int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) { - u64 dma_addr = be64_to_cpu(clc->rmb_dma_addr); - struct smc_link_group *lgr = conn->lgr; - u32 rkey = ntohl(clc->rmb_rkey); + u64 dma_addr = be64_to_cpu(nw_vaddr); + u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) && test_bit(i, lgr->rtokens_used_mask)) { - conn->rtoken_idx = i; + /* already in list */ + return i; + } + } + i = smc_rmb_reserve_rtoken_idx(lgr); + if (i < 0) + return i; + lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey; + lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr; + return i; +} + +/* delete an rtoken */ +int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) +{ + u32 rkey = ntohl(nw_rkey); + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey && + test_bit(i, lgr->rtokens_used_mask)) { + lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0; + lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0; + + clear_bit(i, lgr->rtokens_used_mask); return 0; } } - conn->rtoken_idx = smc_rmb_reserve_rtoken_idx(lgr); + return -ENOENT; +} + +/* save rkey and dma_addr received from peer during clc handshake */ +int smc_rmb_rtoken_handling(struct smc_connection *conn, + struct smc_clc_msg_accept_confirm *clc) +{ + conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr, + clc->rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; - lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey = rkey; - lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr = dma_addr; return 0; } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 7852c3fabf12..7be693b940a2 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -189,6 +189,8 @@ void smc_lgr_terminate(struct smc_link_group *lgr); int smc_buf_create(struct smc_sock *smc); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc); +int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey); +int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 9e0a556e40c8..3e47b94608b5 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -47,8 +47,50 @@ struct smc_llc_msg_test_link { /* type 0x07 */ u8 reserved[24]; }; +struct smc_rmb_rtoken { + union { + u8 num_rkeys; /* first rtoken byte of CONFIRM LINK msg */ + /* is actually the num of rtokens, first */ + /* rtoken is always for the current link */ + u8 link_id; /* link id of the rtoken */ + }; + __be32 rmb_key; + __be64 rmb_vaddr; +} __packed; /* format defined in RFC7609 */ + +#define SMC_LLC_RKEYS_PER_MSG 3 + +struct smc_llc_msg_confirm_rkey { /* type 0x06 */ + struct smc_llc_hdr hd; + struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG]; + u8 reserved; +}; + +struct smc_llc_msg_confirm_rkey_cont { /* type 0x08 */ + struct smc_llc_hdr hd; + u8 num_rkeys; + struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG]; +}; + +#define SMC_LLC_DEL_RKEY_MAX 8 +#define SMC_LLC_FLAG_RKEY_NEG 0x20 + +struct smc_llc_msg_delete_rkey { /* type 0x09 */ + struct smc_llc_hdr hd; + u8 num_rkeys; + u8 err_mask; + u8 reserved[2]; + __be32 rkey[8]; + u8 reserved2[4]; +}; + union smc_llc_msg { struct smc_llc_msg_confirm_link confirm_link; + + struct smc_llc_msg_confirm_rkey confirm_rkey; + struct smc_llc_msg_confirm_rkey_cont confirm_rkey_cont; + struct smc_llc_msg_delete_rkey delete_rkey; + struct smc_llc_msg_test_link test_link; struct { struct smc_llc_hdr hdr; @@ -161,6 +203,22 @@ int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16], return rc; } +/* send a prepared message */ +static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) +{ + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + memcpy(wr_buf, llcbuf, llclen); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + /********************************* receive ***********************************/ static void smc_llc_rx_confirm_link(struct smc_link *link, @@ -190,6 +248,70 @@ static void smc_llc_rx_test_link(struct smc_link *link, } } +static void smc_llc_rx_confirm_rkey(struct smc_link *link, + struct smc_llc_msg_confirm_rkey *llc) +{ + struct smc_link_group *lgr; + int rc; + + lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { + /* unused as long as we don't send this type of msg */ + } else { + rc = smc_rtoken_add(lgr, + llc->rtoken[0].rmb_vaddr, + llc->rtoken[0].rmb_key); + + /* ignore rtokens for other links, we have only one link */ + + llc->hd.flags |= SMC_LLC_FLAG_RESP; + if (rc < 0) + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + smc_llc_send_message(link, (void *)llc, sizeof(*llc)); + } +} + +static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, + struct smc_llc_msg_confirm_rkey_cont *llc) +{ + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { + /* unused as long as we don't send this type of msg */ + } else { + /* ignore rtokens for other links, we have only one link */ + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, (void *)llc, sizeof(*llc)); + } +} + +static void smc_llc_rx_delete_rkey(struct smc_link *link, + struct smc_llc_msg_delete_rkey *llc) +{ + struct smc_link_group *lgr; + u8 err_mask = 0; + int i, max; + + lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { + /* unused as long as we don't send this type of msg */ + } else { + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); + for (i = 0; i < max; i++) { + if (smc_rtoken_delete(lgr, llc->rkey[i])) + err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); + } + + if (err_mask) { + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->err_mask = err_mask; + } + + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, (void *)llc, sizeof(*llc)); + } +} + static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) { struct smc_link *link = (struct smc_link *)wc->qp->qp_context; @@ -207,6 +329,15 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) case SMC_LLC_CONFIRM_LINK: smc_llc_rx_confirm_link(link, &llc->confirm_link); break; + case SMC_LLC_CONFIRM_RKEY: + smc_llc_rx_confirm_rkey(link, &llc->confirm_rkey); + break; + case SMC_LLC_CONFIRM_RKEY_CONT: + smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont); + break; + case SMC_LLC_DELETE_RKEY: + smc_llc_rx_delete_rkey(link, &llc->delete_rkey); + break; } } @@ -221,6 +352,18 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_TEST_LINK }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_RKEY + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_RKEY_CONT + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_DELETE_RKEY + }, { .handler = NULL, } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 6c8a062db4f3..5573f0d0578e 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -26,7 +26,10 @@ enum smc_llc_reqresp { enum smc_llc_msg_type { SMC_LLC_CONFIRM_LINK = 0x01, + SMC_LLC_CONFIRM_RKEY = 0x06, SMC_LLC_TEST_LINK = 0x07, + SMC_LLC_CONFIRM_RKEY_CONT = 0x08, + SMC_LLC_DELETE_RKEY = 0x09, }; /* transmit */ -- cgit v1.2.3-70-g09d2 From 75d320d611d8569ebe4e42718de035fcc79f8069 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 1 Mar 2018 13:51:31 +0100 Subject: net/smc: do not allow eyecatchers in rmbe SMC does not support eyecatchers in RMB elements, decline peers requesting this support. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 11 +++++++++-- net/smc/smc_clc.h | 1 + net/smc/smc_core.h | 2 ++ net/smc/smc_llc.c | 16 +++++++++++++++- 4 files changed, 27 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index cda3d5314e3f..0d491f505608 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -291,6 +291,9 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) return rc; } + if (link->llc_confirm_rc) + return SMC_CLC_DECL_RMBE_EC; + rc = smc_ib_modify_qp_rts(link); if (rc) return SMC_CLC_DECL_INTERR; @@ -310,7 +313,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) if (rc < 0) return SMC_CLC_DECL_TCL; - return rc; + return 0; } static void smc_conn_save_peer_info(struct smc_sock *smc, @@ -705,9 +708,13 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE); + return rc; } - return rc; + if (link->llc_confirm_resp_rc) + return SMC_CLC_DECL_RMBE_EC; + + return 0; } /* setup for RDMA connection of server */ diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index a20fc75efb24..20e048beac30 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -33,6 +33,7 @@ #define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */ #define SMC_CLC_DECL_TCL 0x02040000 /* timeout w4 QP confirm */ #define SMC_CLC_DECL_SEND 0x07000000 /* sending problem */ +#define SMC_CLC_DECL_RMBE_EC 0x08000000 /* peer has eyecatcher in RMBE */ struct smc_clc_msg_hdr { /* header1 of clc messages */ u8 eyecatcher[4]; /* eye catcher */ diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 7be693b940a2..2b65b3d7f1f5 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -89,6 +89,8 @@ struct smc_link { u8 link_id; /* unique # within link group */ struct completion llc_confirm; /* wait for rx of conf link */ struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ + int llc_confirm_rc; /* rc from confirm link msg */ + int llc_confirm_resp_rc; /* rc from conf_resp msg */ }; /* For now we just allow one parallel link per link group. The SMC protocol diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 3e47b94608b5..838a160a3bd9 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -30,6 +30,8 @@ struct smc_llc_hdr { u8 flags; }; +#define SMC_LLC_FLAG_NO_RMBE_EYEC 0x03 + struct smc_llc_msg_confirm_link { /* type 0x01 */ struct smc_llc_hdr hd; u8 sender_mac[ETH_ALEN]; @@ -166,6 +168,7 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[], memset(confllc, 0, sizeof(*confllc)); confllc->hd.common.type = SMC_LLC_CONFIRM_LINK; confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link); + confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC; if (reqresp == SMC_LLC_RESP) confllc->hd.flags |= SMC_LLC_FLAG_RESP; memcpy(confllc->sender_mac, mac, ETH_ALEN); @@ -225,13 +228,24 @@ static void smc_llc_rx_confirm_link(struct smc_link *link, struct smc_llc_msg_confirm_link *llc) { struct smc_link_group *lgr; + int conf_rc; lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + + /* RMBE eyecatchers are not supported */ + if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC) + conf_rc = 0; + else + conf_rc = ENOTSUPP; + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV) + if (lgr->role == SMC_SERV) { + link->llc_confirm_resp_rc = conf_rc; complete(&link->llc_confirm_resp); + } } else { if (lgr->role == SMC_CLNT) { + link->llc_confirm_rc = conf_rc; link->link_id = llc->link_num; complete(&link->llc_confirm); } -- cgit v1.2.3-70-g09d2 From 52bedf37bafe1d3bc36d1513ad059d9fd28b3c3f Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 1 Mar 2018 13:51:32 +0100 Subject: net/smc: process add/delete link messages Add initial support for the LLC messages ADD LINK and DELETE LINK. Introduce a link state field. Extend the initial LLC handshake with ADD LINK processing. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 42 ++++++++++++++ net/smc/smc_core.c | 3 + net/smc/smc_core.h | 10 ++++ net/smc/smc_llc.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++--- net/smc/smc_llc.h | 7 +++ 5 files changed, 223 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0d491f505608..5267ed19b67d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -313,6 +313,27 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) if (rc < 0) return SMC_CLC_DECL_TCL; + /* receive ADD LINK request from server over RoCE fabric */ + rest = wait_for_completion_interruptible_timeout(&link->llc_add, + SMC_LLC_WAIT_TIME); + if (rest <= 0) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE); + return rc; + } + + /* send add link reject message, only one link supported for now */ + rc = smc_llc_send_add_link(link, + link->smcibdev->mac[link->ibport - 1], + &link->smcibdev->gid[link->ibport - 1], + SMC_LLC_RESP); + if (rc < 0) + return SMC_CLC_DECL_TCL; + + link->state = SMC_LNK_ACTIVE; + return 0; } @@ -714,6 +735,27 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) if (link->llc_confirm_resp_rc) return SMC_CLC_DECL_RMBE_EC; + /* send ADD LINK request to client over the RoCE fabric */ + rc = smc_llc_send_add_link(link, + link->smcibdev->mac[link->ibport - 1], + &link->smcibdev->gid[link->ibport - 1], + SMC_LLC_REQ); + if (rc < 0) + return SMC_CLC_DECL_TCL; + + /* receive ADD LINK response from client over the RoCE fabric */ + rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp, + SMC_LLC_WAIT_TIME); + if (rest <= 0) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE); + return rc; + } + + link->state = SMC_LNK_ACTIVE; + return 0; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 31bb2d1dbd77..eb343e15a723 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -176,6 +176,7 @@ static int smc_lgr_create(struct smc_sock *smc, lnk = &lgr->lnk[SMC_SINGLE_LINK]; /* initialize link */ + lnk->state = SMC_LNK_ACTIVATING; lnk->smcibdev = smcibdev; lnk->ibport = ibport; lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; @@ -197,6 +198,8 @@ static int smc_lgr_create(struct smc_sock *smc, goto destroy_qp; init_completion(&lnk->llc_confirm); init_completion(&lnk->llc_confirm_resp); + init_completion(&lnk->llc_add); + init_completion(&lnk->llc_add_resp); smc->conn.lgr = lgr; rwlock_init(&lgr->conns_lock); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 2b65b3d7f1f5..cead2093c4b4 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -32,6 +32,12 @@ enum smc_lgr_role { /* possible roles of a link group */ SMC_SERV /* server */ }; +enum smc_link_state { /* possible states of a link */ + SMC_LNK_INACTIVE, /* link is inactive */ + SMC_LNK_ACTIVATING, /* link is being activated */ + SMC_LNK_ACTIVE /* link is active */ +}; + #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ struct smc_wr_buf { @@ -87,10 +93,14 @@ struct smc_link { u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer*/ u8 link_id; /* unique # within link group */ + + enum smc_link_state state; /* state of link */ struct completion llc_confirm; /* wait for rx of conf link */ struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ int llc_confirm_rc; /* rc from confirm link msg */ int llc_confirm_resp_rc; /* rc from conf_resp msg */ + struct completion llc_add; /* wait for rx of add link */ + struct completion llc_add_resp; /* wait for rx of add link rsp*/ }; /* For now we just allow one parallel link per link group. The SMC protocol diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 838a160a3bd9..45b3e0211c39 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -4,9 +4,6 @@ * * Link Layer Control (LLC) * - * For now, we only support the necessary "confirm link" functionality - * which happens for the first RoCE link after successful CLC handshake. - * * Copyright IBM Corp. 2016 * * Author(s): Klaus Wacker @@ -26,7 +23,13 @@ struct smc_llc_hdr { struct smc_wr_rx_hdr common; u8 length; /* 44 */ - u8 reserved; +#if defined(__BIG_ENDIAN_BITFIELD) + u8 reserved:4, + add_link_rej_rsn:4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 add_link_rej_rsn:4, + reserved:4; +#endif u8 flags; }; @@ -43,6 +46,33 @@ struct smc_llc_msg_confirm_link { /* type 0x01 */ u8 reserved[9]; }; +#define SMC_LLC_FLAG_ADD_LNK_REJ 0x40 +#define SMC_LLC_REJ_RSN_NO_ALT_PATH 1 + +#define SMC_LLC_ADD_LNK_MAX_LINKS 2 + +struct smc_llc_msg_add_link { /* type 0x02 */ + struct smc_llc_hdr hd; + u8 sender_mac[ETH_ALEN]; + u8 reserved2[2]; + u8 sender_gid[SMC_GID_SIZE]; + u8 sender_qp_num[3]; + u8 link_num; + u8 flags2; /* QP mtu */ + u8 initial_psn[3]; + u8 reserved[8]; +}; + +#define SMC_LLC_FLAG_DEL_LINK_ALL 0x40 +#define SMC_LLC_FLAG_DEL_LINK_ORDERLY 0x20 + +struct smc_llc_msg_del_link { /* type 0x04 */ + struct smc_llc_hdr hd; + u8 link_num; + __be32 reason; + u8 reserved[35]; +} __packed; /* format defined in RFC7609 */ + struct smc_llc_msg_test_link { /* type 0x07 */ struct smc_llc_hdr hd; u8 user_data[16]; @@ -88,6 +118,8 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ union smc_llc_msg { struct smc_llc_msg_confirm_link confirm_link; + struct smc_llc_msg_add_link add_link; + struct smc_llc_msg_del_link delete_link; struct smc_llc_msg_confirm_rkey confirm_rkey; struct smc_llc_msg_confirm_rkey_cont confirm_rkey_cont; @@ -176,7 +208,64 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[], hton24(confllc->sender_qp_num, link->roce_qp->qp_num); /* confllc->link_num = SMC_SINGLE_LINK; already done by memset above */ memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); - confllc->max_links = SMC_LINKS_PER_LGR_MAX; + confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* enforce peer resp. */ + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + +/* send ADD LINK request or response */ +int smc_llc_send_add_link(struct smc_link *link, u8 mac[], + union ib_gid *gid, + enum smc_llc_reqresp reqresp) +{ + struct smc_llc_msg_add_link *addllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + addllc = (struct smc_llc_msg_add_link *)wr_buf; + memset(addllc, 0, sizeof(*addllc)); + addllc->hd.common.type = SMC_LLC_ADD_LINK; + addllc->hd.length = sizeof(struct smc_llc_msg_add_link); + if (reqresp == SMC_LLC_RESP) { + addllc->hd.flags |= SMC_LLC_FLAG_RESP; + /* always reject more links for now */ + addllc->hd.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; + addllc->hd.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; + } + memcpy(addllc->sender_mac, mac, ETH_ALEN); + memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + +/* send DELETE LINK request or response */ +int smc_llc_send_delete_link(struct smc_link *link, + enum smc_llc_reqresp reqresp) +{ + struct smc_llc_msg_del_link *delllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + delllc = (struct smc_llc_msg_del_link *)wr_buf; + memset(delllc, 0, sizeof(*delllc)); + delllc->hd.common.type = SMC_LLC_DELETE_LINK; + delllc->hd.length = sizeof(struct smc_llc_msg_add_link); + if (reqresp == SMC_LLC_RESP) + delllc->hd.flags |= SMC_LLC_FLAG_RESP; + /* DEL_LINK_ALL because only 1 link supported */ + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + delllc->link_num = link->link_id; /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -239,12 +328,14 @@ static void smc_llc_rx_confirm_link(struct smc_link *link, conf_rc = ENOTSUPP; if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV) { + if (lgr->role == SMC_SERV && + link->state == SMC_LNK_ACTIVATING) { link->llc_confirm_resp_rc = conf_rc; complete(&link->llc_confirm_resp); } } else { - if (lgr->role == SMC_CLNT) { + if (lgr->role == SMC_CLNT && + link->state == SMC_LNK_ACTIVATING) { link->llc_confirm_rc = conf_rc; link->link_id = llc->link_num; complete(&link->llc_confirm); @@ -252,6 +343,55 @@ static void smc_llc_rx_confirm_link(struct smc_link *link, } } +static void smc_llc_rx_add_link(struct smc_link *link, + struct smc_llc_msg_add_link *llc) +{ + struct smc_link_group *lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { + if (link->state == SMC_LNK_ACTIVATING) + complete(&link->llc_add_resp); + } else { + if (link->state == SMC_LNK_ACTIVATING) { + complete(&link->llc_add); + return; + } + + if (lgr->role == SMC_SERV) { + smc_llc_send_add_link(link, + link->smcibdev->mac[link->ibport - 1], + &link->smcibdev->gid[link->ibport - 1], + SMC_LLC_REQ); + + } else { + smc_llc_send_add_link(link, + link->smcibdev->mac[link->ibport - 1], + &link->smcibdev->gid[link->ibport - 1], + SMC_LLC_RESP); + } + } +} + +static void smc_llc_rx_delete_link(struct smc_link *link, + struct smc_llc_msg_del_link *llc) +{ + struct smc_link_group *lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + + if (llc->hd.flags & SMC_LLC_FLAG_RESP) { + if (lgr->role == SMC_SERV) + smc_lgr_terminate(lgr); + } else { + if (lgr->role == SMC_SERV) { + smc_llc_send_delete_link(link, SMC_LLC_REQ); + } else { + smc_llc_send_delete_link(link, SMC_LLC_RESP); + smc_lgr_terminate(lgr); + } + } +} + static void smc_llc_rx_test_link(struct smc_link *link, struct smc_llc_msg_test_link *llc) { @@ -343,6 +483,12 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) case SMC_LLC_CONFIRM_LINK: smc_llc_rx_confirm_link(link, &llc->confirm_link); break; + case SMC_LLC_ADD_LINK: + smc_llc_rx_add_link(link, &llc->add_link); + break; + case SMC_LLC_DELETE_LINK: + smc_llc_rx_delete_link(link, &llc->delete_link); + break; case SMC_LLC_CONFIRM_RKEY: smc_llc_rx_confirm_rkey(link, &llc->confirm_rkey); break; @@ -366,6 +512,14 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_TEST_LINK }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ADD_LINK + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_DELETE_LINK + }, { .handler = smc_llc_rx_handler, .type = SMC_LLC_CONFIRM_RKEY diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 5573f0d0578e..e4a7d5e234d5 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -18,6 +18,7 @@ #define SMC_LLC_FLAG_RESP 0x80 #define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) +#define SMC_LLC_WAIT_TIME (2 * HZ) enum smc_llc_reqresp { SMC_LLC_REQ, @@ -26,6 +27,8 @@ enum smc_llc_reqresp { enum smc_llc_msg_type { SMC_LLC_CONFIRM_LINK = 0x01, + SMC_LLC_ADD_LINK = 0x02, + SMC_LLC_DELETE_LINK = 0x04, SMC_LLC_CONFIRM_RKEY = 0x06, SMC_LLC_TEST_LINK = 0x07, SMC_LLC_CONFIRM_RKEY_CONT = 0x08, @@ -35,6 +38,10 @@ enum smc_llc_msg_type { /* transmit */ int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid, enum smc_llc_reqresp reqresp); +int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid, + enum smc_llc_reqresp reqresp); +int smc_llc_send_delete_link(struct smc_link *link, + enum smc_llc_reqresp reqresp); int smc_llc_send_test_link(struct smc_link *lnk, u8 user_data[16], enum smc_llc_reqresp reqresp); int smc_llc_init(void) __init; -- cgit v1.2.3-70-g09d2 From 9651b9346f5bc85a4fef96789c756748483d9ee2 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Thu, 1 Mar 2018 13:51:33 +0100 Subject: net/smc: prevent new connections on link group When the processing of a DELETE LINK message has started, new connections should not be added to the link group that is about to terminate. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 9 --------- net/smc/smc_core.c | 19 ++++++++++--------- net/smc/smc_core.h | 1 + net/smc/smc_llc.c | 1 + 4 files changed, 12 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5267ed19b67d..26684e086750 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -356,15 +356,6 @@ static void smc_link_save_peer_info(struct smc_link *link, link->peer_mtu = clc->qp_mtu; } -static void smc_lgr_forget(struct smc_link_group *lgr) -{ - spin_lock_bh(&smc_lgr_list.lock); - /* do not use this link group for new connections */ - if (!list_empty(&lgr->list)) - list_del_init(&lgr->list); - spin_unlock_bh(&smc_lgr_list.lock); -} - /* setup for RDMA connection of client */ static int smc_connect_rdma(struct smc_sock *smc) { diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index eb343e15a723..702ce5f85e97 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -308,6 +308,15 @@ void smc_lgr_free(struct smc_link_group *lgr) kfree(lgr); } +void smc_lgr_forget(struct smc_link_group *lgr) +{ + spin_lock_bh(&smc_lgr_list.lock); + /* do not use this link group for new connections */ + if (!list_empty(&lgr->list)) + list_del_init(&lgr->list); + spin_unlock_bh(&smc_lgr_list.lock); +} + /* terminate linkgroup abnormally */ void smc_lgr_terminate(struct smc_link_group *lgr) { @@ -315,15 +324,7 @@ void smc_lgr_terminate(struct smc_link_group *lgr) struct smc_sock *smc; struct rb_node *node; - spin_lock_bh(&smc_lgr_list.lock); - if (list_empty(&lgr->list)) { - /* termination already triggered */ - spin_unlock_bh(&smc_lgr_list.lock); - return; - } - /* do not use this link group for new connections */ - list_del_init(&lgr->list); - spin_unlock_bh(&smc_lgr_list.lock); + smc_lgr_forget(lgr); write_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index cead2093c4b4..07e2a393e6d9 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -197,6 +197,7 @@ struct smc_sock; struct smc_clc_msg_accept_confirm; void smc_lgr_free(struct smc_link_group *lgr); +void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); int smc_buf_create(struct smc_sock *smc); int smc_rmb_rtoken_handling(struct smc_connection *conn, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 45b3e0211c39..54e8d6dc9201 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -384,6 +384,7 @@ static void smc_llc_rx_delete_link(struct smc_link *link, smc_lgr_terminate(lgr); } else { if (lgr->role == SMC_SERV) { + smc_lgr_forget(lgr); smc_llc_send_delete_link(link, SMC_LLC_REQ); } else { smc_llc_send_delete_link(link, SMC_LLC_RESP); -- cgit v1.2.3-70-g09d2 From 3a053b1a30dcb4e39569bcce2f4357509260db75 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 28 Feb 2018 15:59:15 +0200 Subject: net: Fix spelling mistake "greater then" -> "greater than" Fix trivial spelling mistake "greater then" -> "greater than". Signed-off-by: Gal Pressman Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 +- net/core/dev.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e2ab13687fb9..d4907b584b38 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -540,7 +540,7 @@ static inline bool skb_skip_tc_classify(struct sk_buff *skb) return false; } -/* Reset all TX qdiscs greater then index of a device. */ +/* Reset all TX qdiscs greater than index of a device. */ static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) { struct Qdisc *qdisc; diff --git a/net/core/dev.c b/net/core/dev.c index 5bdcc5a161fe..40fb3aed5df2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2378,7 +2378,7 @@ EXPORT_SYMBOL(netdev_set_num_tc); /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues - * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. + * greater than real_num_tx_queues stale skbs on the qdisc must be flushed. */ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { -- cgit v1.2.3-70-g09d2 From f1c02cfb7b30f5c9d9f4e383260abf89d89c3341 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 28 Feb 2018 16:40:08 +0100 Subject: ipv6: allow userspace to add IFA_F_OPTIMISTIC addresses According to RFC 4429 (section 3.1), adding new IPv6 addresses as optimistic addresses is acceptable, as long as the implementation follows some rules: * Optimistic DAD SHOULD only be used when the implementation is aware that the address is based on a most likely unique interface identifier (such as in [RFC2464]), generated randomly [RFC3041], or by a well-distributed hash function [RFC3972] or assigned by Dynamic Host Configuration Protocol for IPv6 (DHCPv6) [RFC3315]. Optimistic DAD SHOULD NOT be used for manually entered addresses. Thus, it seems reasonable to allow userspace to set the optimistic flag when adding new addresses. We must not let userspace set NODAD + OPTIMISTIC, since if the kernel is not performing DAD we would never clear the optimistic flag. We must also ignore userspace's request to add OPTIMISTIC flag to addresses that have already completed DAD (addresses that don't have the TENTATIVE flag, or that have the DADFAILED flag). Then we also need to clear the OPTIMISTIC flag on permanent addresses when DAD fails. Otherwise, IFA_F_OPTIMISTIC addresses added by userspace can still be used after DAD has failed, because in ipv6_chk_addr_and_flags(), IFA_F_OPTIMISTIC overrides IFA_F_TENTATIVE. Setting IFA_F_OPTIMISTIC from userspace is conditional on CONFIG_IPV6_OPTIMISTIC_DAD and the optimistic_dad sysctl. Signed-off-by: Sabrina Dubroca Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4facfe0b1888..b5fd116c046a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1459,6 +1459,21 @@ static bool ipv6_use_optimistic_addr(struct net *net, #endif } +static bool ipv6_allow_optimistic_dad(struct net *net, + struct inet6_dev *idev) +{ +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + if (!idev) + return false; + if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad) + return false; + + return true; +#else + return false; +#endif +} + static int ipv6_get_saddr_eval(struct net *net, struct ipv6_saddr_score *score, struct ipv6_saddr_dst *dst, @@ -1968,6 +1983,8 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) spin_lock_bh(&ifp->lock); addrconf_del_dad_work(ifp); ifp->flags |= IFA_F_TENTATIVE; + if (dad_failed) + ifp->flags &= ~IFA_F_OPTIMISTIC; spin_unlock_bh(&ifp->lock); if (dad_failed) ipv6_ifa_notify(0, ifp); @@ -4501,6 +4518,9 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64)) return -EINVAL; + if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED) + ifa_flags &= ~IFA_F_OPTIMISTIC; + timeout = addrconf_timeout_fixup(valid_lft, HZ); if (addrconf_finite_timeout(timeout)) { expires = jiffies_to_clock_t(timeout * HZ); @@ -4574,6 +4594,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct in6_addr *pfx, *peer_pfx; struct inet6_ifaddr *ifa; struct net_device *dev; + struct inet6_dev *idev; u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME; u32 ifa_flags; int err; @@ -4607,7 +4628,19 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, /* We ignore other flags so far. */ ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR | - IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN; + IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; + + idev = ipv6_find_idev(dev); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + if (!ipv6_allow_optimistic_dad(net, idev)) + ifa_flags &= ~IFA_F_OPTIMISTIC; + + if (ifa_flags & IFA_F_NODAD && ifa_flags & IFA_F_OPTIMISTIC) { + NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive"); + return -EINVAL; + } ifa = ipv6_get_ifaddr(net, pfx, dev, 1); if (!ifa) { -- cgit v1.2.3-70-g09d2 From 7797dc41417eb0e03f39fc4356f7635bcdef108e Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Tue, 27 Feb 2018 18:22:40 -0500 Subject: socket: skip checking sk_err for recvmmsg(MSG_ERRQUEUE) recvmmsg does not call ___sys_recvmsg when sk_err is set. That is fine for normal reads but, for MSG_ERRQUEUE, recvmmsg should always call ___sys_recvmsg regardless of sk->sk_err to be able to clear error queue. Otherwise, users are not able to drain the error queue using recvmmsg. Signed-off-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/socket.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 645d32b4872c..d9a1ac233b35 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2289,10 +2289,12 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, if (!sock) return err; - err = sock_error(sock->sk); - if (err) { - datagrams = err; - goto out_put; + if (likely(!(flags & MSG_ERRQUEUE))) { + err = sock_error(sock->sk); + if (err) { + datagrams = err; + goto out_put; + } } entry = mmsg; -- cgit v1.2.3-70-g09d2 From dcb8c9b4373a583451b1b8a3e916d33de273633d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Feb 2018 14:40:46 -0800 Subject: tcp_bbr: better deal with suboptimal GSO (II) This is second part of dealing with suboptimal device gso parameters. In first patch (350c9f484bde "tcp_bbr: better deal with suboptimal GSO") we dealt with devices having low gso_max_segs Some devices lower gso_max_size from 64KB to 16 KB (r8152 is an example) In order to probe an optimal cwnd, we want BBR being not sensitive to whatever GSO constraint a device can have. This patch removes tso_segs_goal() CC callback in favor of min_tso_segs() for CC wanting to override sysctl_tcp_min_tso_segs Next patch will remove bbr->tso_segs_goal since it does not have to be persistent. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 6 ++---- net/ipv4/tcp_bbr.c | 23 +++++++++++++---------- net/ipv4/tcp_output.c | 15 ++++++++------- 3 files changed, 23 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 92b06c6e7732..9c9b3768b350 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -511,8 +511,6 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss); #endif /* tcp_output.c */ -u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, - int min_tso_segs); void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); @@ -981,8 +979,8 @@ struct tcp_congestion_ops { u32 (*undo_cwnd)(struct sock *sk); /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); - /* suggest number of segments for each skb to transmit (optional) */ - u32 (*tso_segs_goal)(struct sock *sk); + /* override sysctl_tcp_min_tso_segs */ + u32 (*min_tso_segs)(struct sock *sk); /* returns the multiplier used in tcp_sndbuf_expand (optional) */ u32 (*sndbuf_expand)(struct sock *sk); /* call when packets are delivered to update cwnd and pacing rate, diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index a471f696e13c..afc0567b8a98 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -261,23 +261,26 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) sk->sk_pacing_rate = rate; } -/* Return count of segments we want in the skbs we send, or 0 for default. */ -static u32 bbr_tso_segs_goal(struct sock *sk) +/* override sysctl_tcp_min_tso_segs */ +static u32 bbr_min_tso_segs(struct sock *sk) { - struct bbr *bbr = inet_csk_ca(sk); - - return bbr->tso_segs_goal; + return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; } static void bbr_set_tso_segs_goal(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u32 min_segs; + u32 segs, bytes; + + /* Sort of tcp_tso_autosize() but ignoring + * driver provided sk_gso_max_size. + */ + bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift, + GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); + segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); - min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; - bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs), - 0x7FU); + bbr->tso_segs_goal = min(segs, 0x7FU); } /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ @@ -936,7 +939,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .undo_cwnd = bbr_undo_cwnd, .cwnd_event = bbr_cwnd_event, .ssthresh = bbr_ssthresh, - .tso_segs_goal = bbr_tso_segs_goal, + .min_tso_segs = bbr_min_tso_segs, .get_info = bbr_get_info, .set_state = bbr_set_state, }; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 49d043de3476..383cac0ff0ec 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1703,8 +1703,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp, /* Return how many segs we'd like on a TSO packet, * to send one TSO packet per ms */ -u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, - int min_tso_segs) +static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + int min_tso_segs) { u32 bytes, segs; @@ -1720,7 +1720,6 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, return segs; } -EXPORT_SYMBOL(tcp_tso_autosize); /* Return the number of segments we want in the skb we are transmitting. * See if congestion control module wants to decide; otherwise, autosize. @@ -1728,11 +1727,13 @@ EXPORT_SYMBOL(tcp_tso_autosize); static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; + u32 min_tso, tso_segs; - if (!tso_segs) - tso_segs = tcp_tso_autosize(sk, mss_now, - sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + min_tso = ca_ops->min_tso_segs ? + ca_ops->min_tso_segs(sk) : + sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs; + + tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } -- cgit v1.2.3-70-g09d2 From 71abf467bb630c7e2f4ef33267d98fb7d10d3ce9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Feb 2018 14:40:47 -0800 Subject: tcp_bbr: remove bbr->tso_segs_goal Its value is computed then immediately used, there is no need to store it. Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index afc0567b8a98..c92014cb1e16 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -97,10 +97,9 @@ struct bbr { packet_conservation:1, /* use packet conservation? */ restore_cwnd:1, /* decided to revert cwnd to old value */ round_start:1, /* start of packet-timed tx->ack round? */ - tso_segs_goal:7, /* segments we want in each skb we send */ idle_restart:1, /* restarting after idle? */ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ - unused:5, + unused:12, lt_is_sampling:1, /* taking long-term ("LT") samples now? */ lt_rtt_cnt:7, /* round trips in long-term interval */ lt_use_bw:1; /* use lt_bw as our bw estimate? */ @@ -267,10 +266,9 @@ static u32 bbr_min_tso_segs(struct sock *sk) return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; } -static void bbr_set_tso_segs_goal(struct sock *sk) +static u32 bbr_tso_segs_goal(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); u32 segs, bytes; /* Sort of tcp_tso_autosize() but ignoring @@ -280,7 +278,7 @@ static void bbr_set_tso_segs_goal(struct sock *sk) GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); - bbr->tso_segs_goal = min(segs, 0x7FU); + return min(segs, 0x7FU); } /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ @@ -351,7 +349,7 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT; /* Allow enough full-sized skbs in flight to utilize end systems. */ - cwnd += 3 * bbr->tso_segs_goal; + cwnd += 3 * bbr_tso_segs_goal(sk); /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ cwnd = (cwnd + 1) & ~1U; @@ -827,7 +825,6 @@ static void bbr_main(struct sock *sk, const struct rate_sample *rs) bw = bbr_bw(sk); bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); - bbr_set_tso_segs_goal(sk); bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); } @@ -837,7 +834,6 @@ static void bbr_init(struct sock *sk) struct bbr *bbr = inet_csk_ca(sk); bbr->prior_cwnd = 0; - bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ bbr->rtt_cnt = 0; bbr->next_rtt_delivered = 0; bbr->prev_ca_state = TCP_CA_Open; -- cgit v1.2.3-70-g09d2 From f84af33138a827dbdf7ee0df2ed82377b43cd1cc Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 1 Mar 2018 23:05:10 +0800 Subject: sctp: factor out sctp_sendmsg_to_asoc from sctp_sendmsg This patch is to move the codes for checking and sending on one asoc after this asoc has been found or created into sctp_sendmsg_to_asoc. Note that 'err != -ESRCH' check is for the case that asoc is freed when waiting for tx buffer in sctp_sendmsg_to_asoc. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 230 +++++++++++++++++++++++------------------------------- 1 file changed, 99 insertions(+), 131 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index bf271f8c2dc9..183129e2bc68 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1606,6 +1606,100 @@ static int sctp_error(struct sock *sk, int flags, int err) static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs); +static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, + struct msghdr *msg, size_t msg_len, + struct sctp_transport *transport, + struct sctp_sndrcvinfo *sinfo) +{ + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); + struct sctp_datamsg *datamsg; + bool wait_connect = false; + struct sctp_chunk *chunk; + long timeo; + int err; + + if (sinfo->sinfo_stream >= asoc->stream.outcnt) { + err = -EINVAL; + goto err; + } + + if (unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) { + err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream); + if (err) + goto err; + } + + if (sctp_sk(sk)->disable_fragments && msg_len > asoc->frag_point) { + err = -EMSGSIZE; + goto err; + } + + if (sctp_state(asoc, CLOSED)) { + err = sctp_primitive_ASSOCIATE(net, asoc, NULL); + if (err) + goto err; + + if (sctp_sk(sk)->strm_interleave) { + timeo = sock_sndtimeo(sk, 0); + err = sctp_wait_for_connect(asoc, &timeo); + if (err) + goto err; + } else { + wait_connect = true; + } + + pr_debug("%s: we associated primitively\n", __func__); + } + + if (asoc->pmtu_pending) + sctp_assoc_pending_pmtu(asoc); + + if (sctp_wspace(asoc) < msg_len) + sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); + + if (!sctp_wspace(asoc)) { + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); + if (err) + goto err; + } + + datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter); + if (IS_ERR(datamsg)) { + err = PTR_ERR(datamsg); + goto err; + } + + asoc->force_delay = !!(msg->msg_flags & MSG_MORE); + + list_for_each_entry(chunk, &datamsg->chunks, frag_list) { + sctp_chunk_hold(chunk); + sctp_set_owner_w(chunk); + chunk->transport = transport; + } + + err = sctp_primitive_SEND(net, asoc, datamsg); + if (err) { + sctp_datamsg_free(datamsg); + goto err; + } + + pr_debug("%s: we sent primitively\n", __func__); + + sctp_datamsg_put(datamsg); + + if (unlikely(wait_connect)) { + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + sctp_wait_for_connect(asoc, &timeo); + } + + err = msg_len; + +err: + return err; +} + static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) { struct net *net = sock_net(sk); @@ -1622,11 +1716,8 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) sctp_assoc_t associd = 0; struct sctp_cmsgs cmsgs = { NULL }; enum sctp_scope scope; - bool fill_sinfo_ttl = false, wait_connect = false; - struct sctp_datamsg *datamsg; - int msg_flags = msg->msg_flags; + bool fill_sinfo_ttl = false; __u16 sinfo_flags = 0; - long timeo; int err; err = 0; @@ -1923,49 +2014,6 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_free; } - if (asoc->pmtu_pending) - sctp_assoc_pending_pmtu(asoc); - - /* If fragmentation is disabled and the message length exceeds the - * association fragmentation point, return EMSGSIZE. The I-D - * does not specify what this error is, but this looks like - * a great fit. - */ - if (sctp_sk(sk)->disable_fragments && (msg_len > asoc->frag_point)) { - err = -EMSGSIZE; - goto out_free; - } - - /* Check for invalid stream. */ - if (sinfo->sinfo_stream >= asoc->stream.outcnt) { - err = -EINVAL; - goto out_free; - } - - /* Allocate sctp_stream_out_ext if not already done */ - if (unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) { - err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream); - if (err) - goto out_free; - } - - if (sctp_wspace(asoc) < msg_len) - sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); - - timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - if (!sctp_wspace(asoc)) { - /* sk can be changed by peel off when waiting for buf. */ - err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); - if (err) { - if (err == -ESRCH) { - /* asoc is already dead. */ - new_asoc = NULL; - err = -EPIPE; - } - goto out_free; - } - } - /* If an address is passed with the sendto/sendmsg call, it is used * to override the primary destination address in the TCP model, or * when SCTP_ADDR_OVER flag is set in the UDP model. @@ -1980,96 +2028,16 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) } else chunk_tp = NULL; - /* Auto-connect, if we aren't connected already. */ - if (sctp_state(asoc, CLOSED)) { - err = sctp_primitive_ASSOCIATE(net, asoc, NULL); - if (err < 0) - goto out_free; - - /* If stream interleave is enabled, wait_connect has to be - * done earlier than data enqueue, as it needs to make data - * or idata according to asoc->intl_enable which is set - * after connection is done. - */ - if (sctp_sk(asoc->base.sk)->strm_interleave) { - timeo = sock_sndtimeo(sk, 0); - err = sctp_wait_for_connect(asoc, &timeo); - if (err) - goto out_unlock; - } else { - wait_connect = true; - } - - pr_debug("%s: we associated primitively\n", __func__); - } - - /* Break the message into multiple chunks of maximum size. */ - datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter); - if (IS_ERR(datamsg)) { - err = PTR_ERR(datamsg); - goto out_free; - } - asoc->force_delay = !!(msg->msg_flags & MSG_MORE); - - /* Now send the (possibly) fragmented message. */ - list_for_each_entry(chunk, &datamsg->chunks, frag_list) { - sctp_chunk_hold(chunk); - - /* Do accounting for the write space. */ - sctp_set_owner_w(chunk); - - chunk->transport = chunk_tp; - } - - /* Send it to the lower layers. Note: all chunks - * must either fail or succeed. The lower layer - * works that way today. Keep it that way or this - * breaks. - */ - err = sctp_primitive_SEND(net, asoc, datamsg); - /* Did the lower layer accept the chunk? */ - if (err) { - sctp_datamsg_free(datamsg); - goto out_free; - } - - pr_debug("%s: we sent primitively\n", __func__); - - sctp_datamsg_put(datamsg); - err = msg_len; - - if (unlikely(wait_connect)) { - timeo = sock_sndtimeo(sk, msg_flags & MSG_DONTWAIT); - sctp_wait_for_connect(asoc, &timeo); - } - - /* If we are already past ASSOCIATE, the lower - * layers are responsible for association cleanup. - */ - goto out_unlock; + /* Send msg to the asoc */ + err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, chunk_tp, sinfo); out_free: - if (new_asoc) + if (err < 0 && err != -ESRCH && new_asoc) sctp_association_free(asoc); out_unlock: release_sock(sk); - out_nounlock: - return sctp_error(sk, msg_flags, err); - -#if 0 -do_sock_err: - if (msg_len) - err = msg_len; - else - err = sock_error(sk); - goto out; - -do_interrupted: - if (msg_len) - err = msg_len; - goto out; -#endif /* 0 */ + return sctp_error(sk, msg->msg_flags, err); } /* This is an extended version of skb_pull() that removes the data from the -- cgit v1.2.3-70-g09d2 From 2bfd80f9edbfc18c6247be929df3b348329141a0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 1 Mar 2018 23:05:11 +0800 Subject: sctp: factor out sctp_sendmsg_new_asoc from sctp_sendmsg This patch is to move the codes for creating a new asoc if no asoc was found into sctp_sendmsg_new_asoc. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 201 +++++++++++++++++++++++------------------------------- 1 file changed, 86 insertions(+), 115 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 183129e2bc68..58bb55dce8f6 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1606,6 +1606,87 @@ static int sctp_error(struct sock *sk, int flags, int err) static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs); +static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, + struct sctp_cmsgs *cmsgs, + union sctp_addr *daddr, + struct sctp_transport **tp) +{ + struct sctp_endpoint *ep = sctp_sk(sk)->ep; + struct net *net = sock_net(sk); + struct sctp_association *asoc; + enum sctp_scope scope; + int err = -EINVAL; + + *tp = NULL; + + if (sflags & (SCTP_EOF | SCTP_ABORT)) + return -EINVAL; + + if (sctp_style(sk, TCP) && (sctp_sstate(sk, ESTABLISHED) || + sctp_sstate(sk, CLOSING))) + return -EADDRNOTAVAIL; + + if (sctp_endpoint_is_peeled_off(ep, daddr)) + return -EADDRNOTAVAIL; + + if (!ep->base.bind_addr.port) { + if (sctp_autobind(sk)) + return -EAGAIN; + } else { + if (ep->base.bind_addr.port < inet_prot_sock(net) && + !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) + return -EACCES; + } + + scope = sctp_scope(daddr); + + asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL); + if (!asoc) + return -ENOMEM; + + if (sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL) < 0) { + err = -ENOMEM; + goto free; + } + + if (cmsgs->init) { + struct sctp_initmsg *init = cmsgs->init; + + if (init->sinit_num_ostreams) { + __u16 outcnt = init->sinit_num_ostreams; + + asoc->c.sinit_num_ostreams = outcnt; + /* outcnt has been changed, need to re-init stream */ + err = sctp_stream_init(&asoc->stream, outcnt, 0, + GFP_KERNEL); + if (err) + goto free; + } + + if (init->sinit_max_instreams) + asoc->c.sinit_max_instreams = init->sinit_max_instreams; + + if (init->sinit_max_attempts) + asoc->max_init_attempts = init->sinit_max_attempts; + + if (init->sinit_max_init_timeo) + asoc->max_init_timeo = + msecs_to_jiffies(init->sinit_max_init_timeo); + } + + *tp = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN); + if (!*tp) { + err = -ENOMEM; + goto free; + } + + return 0; + +free: + sctp_association_free(asoc); + return err; +} + static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, struct msghdr *msg, size_t msg_len, struct sctp_transport *transport, @@ -1715,7 +1796,6 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) struct sctp_initmsg *sinit; sctp_assoc_t associd = 0; struct sctp_cmsgs cmsgs = { NULL }; - enum sctp_scope scope; bool fill_sinfo_ttl = false; __u16 sinfo_flags = 0; int err; @@ -1817,20 +1897,6 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) if (msg_name) { /* Look for a matching association on the endpoint. */ asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport); - - /* If we could not find a matching association on the - * endpoint, make sure that it is not a TCP-style - * socket that already has an association or there is - * no peeled-off association on another socket. - */ - if (!asoc && - ((sctp_style(sk, TCP) && - (sctp_sstate(sk, ESTABLISHED) || - sctp_sstate(sk, CLOSING))) || - sctp_endpoint_is_peeled_off(ep, &to))) { - err = -EADDRNOTAVAIL; - goto out_unlock; - } } else { asoc = sctp_id2assoc(sk, associd); if (!asoc) { @@ -1879,108 +1945,13 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) /* Do we need to create the association? */ if (!asoc) { - pr_debug("%s: there is no association yet\n", __func__); - - if (sinfo_flags & (SCTP_EOF | SCTP_ABORT)) { - err = -EINVAL; - goto out_unlock; - } - - /* Check for invalid stream against the stream counts, - * either the default or the user specified stream counts. - */ - if (sinfo) { - if (!sinit || !sinit->sinit_num_ostreams) { - /* Check against the defaults. */ - if (sinfo->sinfo_stream >= - sp->initmsg.sinit_num_ostreams) { - err = -EINVAL; - goto out_unlock; - } - } else { - /* Check against the requested. */ - if (sinfo->sinfo_stream >= - sinit->sinit_num_ostreams) { - err = -EINVAL; - goto out_unlock; - } - } - } - - /* - * API 3.1.2 bind() - UDP Style Syntax - * If a bind() or sctp_bindx() is not called prior to a - * sendmsg() call that initiates a new association, the - * system picks an ephemeral port and will choose an address - * set equivalent to binding with a wildcard address. - */ - if (!ep->base.bind_addr.port) { - if (sctp_autobind(sk)) { - err = -EAGAIN; - goto out_unlock; - } - } else { - /* - * If an unprivileged user inherits a one-to-many - * style socket with open associations on a privileged - * port, it MAY be permitted to accept new associations, - * but it SHOULD NOT be permitted to open new - * associations. - */ - if (ep->base.bind_addr.port < inet_prot_sock(net) && - !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) { - err = -EACCES; - goto out_unlock; - } - } - - scope = sctp_scope(&to); - new_asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL); - if (!new_asoc) { - err = -ENOMEM; + err = sctp_sendmsg_new_asoc(sk, sinfo_flags, &cmsgs, &to, + &transport); + if (err) goto out_unlock; - } - asoc = new_asoc; - err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL); - if (err < 0) { - err = -ENOMEM; - goto out_free; - } - - /* If the SCTP_INIT ancillary data is specified, set all - * the association init values accordingly. - */ - if (sinit) { - if (sinit->sinit_num_ostreams) { - __u16 outcnt = sinit->sinit_num_ostreams; - - asoc->c.sinit_num_ostreams = outcnt; - /* outcnt has been changed, so re-init stream */ - err = sctp_stream_init(&asoc->stream, outcnt, 0, - GFP_KERNEL); - if (err) - goto out_free; - } - if (sinit->sinit_max_instreams) { - asoc->c.sinit_max_instreams = - sinit->sinit_max_instreams; - } - if (sinit->sinit_max_attempts) { - asoc->max_init_attempts - = sinit->sinit_max_attempts; - } - if (sinit->sinit_max_init_timeo) { - asoc->max_init_timeo = - msecs_to_jiffies(sinit->sinit_max_init_timeo); - } - } - /* Prime the peer's transport structures. */ - transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL, SCTP_UNKNOWN); - if (!transport) { - err = -ENOMEM; - goto out_free; - } + asoc = transport->asoc; + new_asoc = asoc; } /* ASSERT: we have a valid association at this point. */ -- cgit v1.2.3-70-g09d2 From c2666de1fde3a02583c2c4d4af4bc54f7252e891 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 1 Mar 2018 23:05:12 +0800 Subject: sctp: factor out sctp_sendmsg_check_sflags from sctp_sendmsg This patch is to move the codes for checking sinfo_flags on one asoc after this asoc has been found into sctp_sendmsg_check_sflags. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 72 +++++++++++++++++++++++++++---------------------------- 1 file changed, 36 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 58bb55dce8f6..93cff9963614 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1687,6 +1687,39 @@ free: return err; } +static int sctp_sendmsg_check_sflags(struct sctp_association *asoc, + __u16 sflags, struct msghdr *msg, + size_t msg_len) +{ + struct sock *sk = asoc->base.sk; + struct net *net = sock_net(sk); + + if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP)) + return -EPIPE; + + if (sflags & SCTP_EOF) { + pr_debug("%s: shutting down association:%p\n", __func__, asoc); + sctp_primitive_SHUTDOWN(net, asoc, NULL); + + return 0; + } + + if (sflags & SCTP_ABORT) { + struct sctp_chunk *chunk; + + chunk = sctp_make_abort_user(asoc, msg, msg_len); + if (!chunk) + return -ENOMEM; + + pr_debug("%s: aborting association:%p\n", __func__, asoc); + sctp_primitive_ABORT(net, asoc, chunk); + + return 0; + } + + return 1; +} + static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, struct msghdr *msg, size_t msg_len, struct sctp_transport *transport, @@ -1783,12 +1816,10 @@ err: static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) { - struct net *net = sock_net(sk); struct sctp_sock *sp; struct sctp_endpoint *ep; struct sctp_association *new_asoc = NULL, *asoc = NULL; struct sctp_transport *transport, *chunk_tp; - struct sctp_chunk *chunk; union sctp_addr to; struct sockaddr *msg_name = NULL; struct sctp_sndrcvinfo default_sinfo; @@ -1906,41 +1937,10 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) } if (asoc) { - pr_debug("%s: just looked up association:%p\n", __func__, asoc); - - /* We cannot send a message on a TCP-style SCTP_SS_ESTABLISHED - * socket that has an association in CLOSED state. This can - * happen when an accepted socket has an association that is - * already CLOSED. - */ - if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP)) { - err = -EPIPE; - goto out_unlock; - } - - if (sinfo_flags & SCTP_EOF) { - pr_debug("%s: shutting down association:%p\n", - __func__, asoc); - - sctp_primitive_SHUTDOWN(net, asoc, NULL); - err = 0; + err = sctp_sendmsg_check_sflags(asoc, sinfo_flags, msg, + msg_len); + if (err <= 0) goto out_unlock; - } - if (sinfo_flags & SCTP_ABORT) { - - chunk = sctp_make_abort_user(asoc, msg, msg_len); - if (!chunk) { - err = -ENOMEM; - goto out_unlock; - } - - pr_debug("%s: aborting association:%p\n", - __func__, asoc); - - sctp_primitive_ABORT(net, asoc, chunk); - err = 0; - goto out_unlock; - } } /* Do we need to create the association? */ -- cgit v1.2.3-70-g09d2 From becef9b1e249c849ca15889586e02668d6d723a2 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 1 Mar 2018 23:05:13 +0800 Subject: sctp: factor out sctp_sendmsg_get_daddr from sctp_sendmsg This patch is to move the codes for trying to get daddr from msg->msg_name into sctp_sendmsg_get_daddr. Note that after adding 'daddr', 'to' and 'msg_name' can be deleted. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 58 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 93cff9963614..68691d2b3167 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1814,14 +1814,35 @@ err: return err; } +static union sctp_addr *sctp_sendmsg_get_daddr(struct sock *sk, + const struct msghdr *msg, + struct sctp_cmsgs *cmsgs) +{ + union sctp_addr *daddr = NULL; + int err; + + if (!sctp_style(sk, UDP_HIGH_BANDWIDTH) && msg->msg_name) { + int len = msg->msg_namelen; + + if (len > sizeof(*daddr)) + len = sizeof(*daddr); + + daddr = (union sctp_addr *)msg->msg_name; + + err = sctp_verify_addr(sk, daddr, len); + if (err) + return ERR_PTR(err); + } + + return daddr; +} + static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) { struct sctp_sock *sp; struct sctp_endpoint *ep; struct sctp_association *new_asoc = NULL, *asoc = NULL; struct sctp_transport *transport, *chunk_tp; - union sctp_addr to; - struct sockaddr *msg_name = NULL; struct sctp_sndrcvinfo default_sinfo; struct sctp_sndrcvinfo *sinfo; struct sctp_initmsg *sinit; @@ -1829,6 +1850,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) struct sctp_cmsgs cmsgs = { NULL }; bool fill_sinfo_ttl = false; __u16 sinfo_flags = 0; + union sctp_addr *daddr; int err; err = 0; @@ -1851,23 +1873,11 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_nounlock; } - /* Fetch the destination address for this packet. This - * address only selects the association--it is not necessarily - * the address we will send to. - * For a peeled-off socket, msg_name is ignored. - */ - if (!sctp_style(sk, UDP_HIGH_BANDWIDTH) && msg->msg_name) { - int msg_namelen = msg->msg_namelen; - - err = sctp_verify_addr(sk, (union sctp_addr *)msg->msg_name, - msg_namelen); - if (err) - return err; - - if (msg_namelen > sizeof(to)) - msg_namelen = sizeof(to); - memcpy(&to, msg->msg_name, msg_namelen); - msg_name = msg->msg_name; + /* Get daddr from msg */ + daddr = sctp_sendmsg_get_daddr(sk, msg, &cmsgs); + if (IS_ERR(daddr)) { + err = PTR_ERR(daddr); + goto out_nounlock; } sinit = cmsgs.init; @@ -1925,9 +1935,9 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) lock_sock(sk); /* If a msg_name has been specified, assume this is to be used. */ - if (msg_name) { + if (daddr) { /* Look for a matching association on the endpoint. */ - asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport); + asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport); } else { asoc = sctp_id2assoc(sk, associd); if (!asoc) { @@ -1945,7 +1955,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) /* Do we need to create the association? */ if (!asoc) { - err = sctp_sendmsg_new_asoc(sk, sinfo_flags, &cmsgs, &to, + err = sctp_sendmsg_new_asoc(sk, sinfo_flags, &cmsgs, daddr, &transport); if (err) goto out_unlock; @@ -1989,9 +1999,9 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) * to override the primary destination address in the TCP model, or * when SCTP_ADDR_OVER flag is set in the UDP model. */ - if ((sctp_style(sk, TCP) && msg_name) || + if ((sctp_style(sk, TCP) && daddr) || (sinfo_flags & SCTP_ADDR_OVER)) { - chunk_tp = sctp_assoc_lookup_paddr(asoc, &to); + chunk_tp = sctp_assoc_lookup_paddr(asoc, daddr); if (!chunk_tp) { err = -EINVAL; goto out_free; -- cgit v1.2.3-70-g09d2 From 204f817fb9ab1b42c34c27d593cad2992d575f71 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 1 Mar 2018 23:05:14 +0800 Subject: sctp: factor out sctp_sendmsg_parse from sctp_sendmsg This patch is to move the codes for parsing msghdr and checking sk into sctp_sendmsg_parse. Note that different from before, 'sinfo' in sctp_sendmsg won't be NULL any more. It gets the value either from cmsgs->srinfo, cmsgs->sinfo or asoc. With it, the 'sinfo' and 'fill_sinfo_ttl' check can be removed from sctp_sendmsg. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 172 ++++++++++++++++++++++-------------------------------- 1 file changed, 69 insertions(+), 103 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 68691d2b3167..bf089e59c792 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1606,6 +1606,61 @@ static int sctp_error(struct sock *sk, int flags, int err) static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs); +static int sctp_sendmsg_parse(struct sock *sk, struct sctp_cmsgs *cmsgs, + struct sctp_sndrcvinfo *srinfo, + const struct msghdr *msg, size_t msg_len) +{ + __u16 sflags; + int err; + + if (sctp_sstate(sk, LISTENING) && sctp_style(sk, TCP)) + return -EPIPE; + + if (msg_len > sk->sk_sndbuf) + return -EMSGSIZE; + + memset(cmsgs, 0, sizeof(*cmsgs)); + err = sctp_msghdr_parse(msg, cmsgs); + if (err) { + pr_debug("%s: msghdr parse err:%x\n", __func__, err); + return err; + } + + memset(srinfo, 0, sizeof(*srinfo)); + if (cmsgs->srinfo) { + srinfo->sinfo_stream = cmsgs->srinfo->sinfo_stream; + srinfo->sinfo_flags = cmsgs->srinfo->sinfo_flags; + srinfo->sinfo_ppid = cmsgs->srinfo->sinfo_ppid; + srinfo->sinfo_context = cmsgs->srinfo->sinfo_context; + srinfo->sinfo_assoc_id = cmsgs->srinfo->sinfo_assoc_id; + srinfo->sinfo_timetolive = cmsgs->srinfo->sinfo_timetolive; + } + + if (cmsgs->sinfo) { + srinfo->sinfo_stream = cmsgs->sinfo->snd_sid; + srinfo->sinfo_flags = cmsgs->sinfo->snd_flags; + srinfo->sinfo_ppid = cmsgs->sinfo->snd_ppid; + srinfo->sinfo_context = cmsgs->sinfo->snd_context; + srinfo->sinfo_assoc_id = cmsgs->sinfo->snd_assoc_id; + } + + sflags = srinfo->sinfo_flags; + if (!sflags && msg_len) + return 0; + + if (sctp_style(sk, TCP) && (sflags & (SCTP_EOF | SCTP_ABORT))) + return -EINVAL; + + if (((sflags & SCTP_EOF) && msg_len > 0) || + (!(sflags & (SCTP_EOF | SCTP_ABORT)) && msg_len == 0)) + return -EINVAL; + + if ((sflags & SCTP_ADDR_OVER) && !msg->msg_name) + return -EINVAL; + + return 0; +} + static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, struct sctp_cmsgs *cmsgs, union sctp_addr *daddr, @@ -1839,39 +1894,23 @@ static union sctp_addr *sctp_sendmsg_get_daddr(struct sock *sk, static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) { - struct sctp_sock *sp; - struct sctp_endpoint *ep; + struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *new_asoc = NULL, *asoc = NULL; struct sctp_transport *transport, *chunk_tp; - struct sctp_sndrcvinfo default_sinfo; - struct sctp_sndrcvinfo *sinfo; - struct sctp_initmsg *sinit; + struct sctp_sndrcvinfo _sinfo, *sinfo; sctp_assoc_t associd = 0; struct sctp_cmsgs cmsgs = { NULL }; - bool fill_sinfo_ttl = false; __u16 sinfo_flags = 0; union sctp_addr *daddr; int err; - err = 0; - sp = sctp_sk(sk); - ep = sp->ep; - - pr_debug("%s: sk:%p, msg:%p, msg_len:%zu ep:%p\n", __func__, sk, - msg, msg_len, ep); - - /* We cannot send a message over a TCP-style listening socket. */ - if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) { - err = -EPIPE; + /* Parse and get snd_info */ + err = sctp_sendmsg_parse(sk, &cmsgs, &_sinfo, msg, msg_len); + if (err) goto out_nounlock; - } - /* Parse out the SCTP CMSGs. */ - err = sctp_msghdr_parse(msg, &cmsgs); - if (err) { - pr_debug("%s: msghdr parse err:%x\n", __func__, err); - goto out_nounlock; - } + sinfo = &_sinfo; + sinfo_flags = sinfo->sinfo_flags; /* Get daddr from msg */ daddr = sctp_sendmsg_get_daddr(sk, msg, &cmsgs); @@ -1880,58 +1919,6 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) goto out_nounlock; } - sinit = cmsgs.init; - if (cmsgs.sinfo != NULL) { - memset(&default_sinfo, 0, sizeof(default_sinfo)); - default_sinfo.sinfo_stream = cmsgs.sinfo->snd_sid; - default_sinfo.sinfo_flags = cmsgs.sinfo->snd_flags; - default_sinfo.sinfo_ppid = cmsgs.sinfo->snd_ppid; - default_sinfo.sinfo_context = cmsgs.sinfo->snd_context; - default_sinfo.sinfo_assoc_id = cmsgs.sinfo->snd_assoc_id; - - sinfo = &default_sinfo; - fill_sinfo_ttl = true; - } else { - sinfo = cmsgs.srinfo; - } - /* Did the user specify SNDINFO/SNDRCVINFO? */ - if (sinfo) { - sinfo_flags = sinfo->sinfo_flags; - associd = sinfo->sinfo_assoc_id; - } - - pr_debug("%s: msg_len:%zu, sinfo_flags:0x%x\n", __func__, - msg_len, sinfo_flags); - - /* SCTP_EOF or SCTP_ABORT cannot be set on a TCP-style socket. */ - if (sctp_style(sk, TCP) && (sinfo_flags & (SCTP_EOF | SCTP_ABORT))) { - err = -EINVAL; - goto out_nounlock; - } - - /* If SCTP_EOF is set, no data can be sent. Disallow sending zero - * length messages when SCTP_EOF|SCTP_ABORT is not set. - * If SCTP_ABORT is set, the message length could be non zero with - * the msg_iov set to the user abort reason. - */ - if (((sinfo_flags & SCTP_EOF) && (msg_len > 0)) || - (!(sinfo_flags & (SCTP_EOF|SCTP_ABORT)) && (msg_len == 0))) { - err = -EINVAL; - goto out_nounlock; - } - - /* If SCTP_ADDR_OVER is set, there must be an address - * specified in msg_name. - */ - if ((sinfo_flags & SCTP_ADDR_OVER) && (!msg->msg_name)) { - err = -EINVAL; - goto out_nounlock; - } - - transport = NULL; - - pr_debug("%s: about to look up association\n", __func__); - lock_sock(sk); /* If a msg_name has been specified, assume this is to be used. */ @@ -1964,36 +1951,15 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) new_asoc = asoc; } - /* ASSERT: we have a valid association at this point. */ - pr_debug("%s: we have a valid association\n", __func__); - - if (!sinfo) { - /* If the user didn't specify SNDINFO/SNDRCVINFO, make up - * one with some defaults. - */ - memset(&default_sinfo, 0, sizeof(default_sinfo)); - default_sinfo.sinfo_stream = asoc->default_stream; - default_sinfo.sinfo_flags = asoc->default_flags; - default_sinfo.sinfo_ppid = asoc->default_ppid; - default_sinfo.sinfo_context = asoc->default_context; - default_sinfo.sinfo_timetolive = asoc->default_timetolive; - default_sinfo.sinfo_assoc_id = sctp_assoc2id(asoc); - - sinfo = &default_sinfo; - } else if (fill_sinfo_ttl) { - /* In case SNDINFO was specified, we still need to fill - * it with a default ttl from the assoc here. - */ - sinfo->sinfo_timetolive = asoc->default_timetolive; + if (!cmsgs.srinfo && !cmsgs.sinfo) { + sinfo->sinfo_stream = asoc->default_stream; + sinfo->sinfo_ppid = asoc->default_ppid; + sinfo->sinfo_context = asoc->default_context; + sinfo->sinfo_assoc_id = sctp_assoc2id(asoc); } - /* API 7.1.7, the sndbuf size per association bounds the - * maximum size of data that can be sent in a single send call. - */ - if (msg_len > sk->sk_sndbuf) { - err = -EMSGSIZE; - goto out_free; - } + if (!cmsgs.srinfo) + sinfo->sinfo_timetolive = asoc->default_timetolive; /* If an address is passed with the sendto/sendmsg call, it is used * to override the primary destination address in the TCP model, or -- cgit v1.2.3-70-g09d2 From d42cb06e5b542cabea88c9074b2abf90d43757f7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 1 Mar 2018 23:05:15 +0800 Subject: sctp: factor out sctp_sendmsg_update_sinfo from sctp_sendmsg This patch is to move the codes for trying to get sinfo from asoc into sctp_sendmsg_update_sinfo. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index bf089e59c792..bd1a657117f1 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1892,6 +1892,21 @@ static union sctp_addr *sctp_sendmsg_get_daddr(struct sock *sk, return daddr; } +static void sctp_sendmsg_update_sinfo(struct sctp_association *asoc, + struct sctp_sndrcvinfo *sinfo, + struct sctp_cmsgs *cmsgs) +{ + if (!cmsgs->srinfo && !cmsgs->sinfo) { + sinfo->sinfo_stream = asoc->default_stream; + sinfo->sinfo_ppid = asoc->default_ppid; + sinfo->sinfo_context = asoc->default_context; + sinfo->sinfo_assoc_id = sctp_assoc2id(asoc); + } + + if (!cmsgs->srinfo) + sinfo->sinfo_timetolive = asoc->default_timetolive; +} + static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; @@ -1951,15 +1966,8 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) new_asoc = asoc; } - if (!cmsgs.srinfo && !cmsgs.sinfo) { - sinfo->sinfo_stream = asoc->default_stream; - sinfo->sinfo_ppid = asoc->default_ppid; - sinfo->sinfo_context = asoc->default_context; - sinfo->sinfo_assoc_id = sctp_assoc2id(asoc); - } - - if (!cmsgs.srinfo) - sinfo->sinfo_timetolive = asoc->default_timetolive; + /* Update snd_info with the asoc */ + sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs); /* If an address is passed with the sendto/sendmsg call, it is used * to override the primary destination address in the TCP model, or -- cgit v1.2.3-70-g09d2 From 8e87c6eb18f9d7427054f28a284d495c174d9970 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 1 Mar 2018 23:05:16 +0800 Subject: sctp: remove the unnecessary transport looking up from sctp_sendmsg Now sctp_assoc_lookup_paddr can only be called only if daddr has been set. But if daddr has been set, sctp_endpoint_lookup_assoc would be done, where it could already have the transport. So this unnecessary transport looking up should be removed, but only reset transport as NULL when SCTP_ADDR_OVER is not set for UDP type socket. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index bd1a657117f1..426031095afe 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1911,7 +1911,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; struct sctp_association *new_asoc = NULL, *asoc = NULL; - struct sctp_transport *transport, *chunk_tp; + struct sctp_transport *transport = NULL; struct sctp_sndrcvinfo _sinfo, *sinfo; sctp_assoc_t associd = 0; struct sctp_cmsgs cmsgs = { NULL }; @@ -1966,29 +1966,17 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) new_asoc = asoc; } + if (!sctp_style(sk, TCP) && !(sinfo_flags & SCTP_ADDR_OVER)) + transport = NULL; + /* Update snd_info with the asoc */ sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs); - /* If an address is passed with the sendto/sendmsg call, it is used - * to override the primary destination address in the TCP model, or - * when SCTP_ADDR_OVER flag is set in the UDP model. - */ - if ((sctp_style(sk, TCP) && daddr) || - (sinfo_flags & SCTP_ADDR_OVER)) { - chunk_tp = sctp_assoc_lookup_paddr(asoc, daddr); - if (!chunk_tp) { - err = -EINVAL; - goto out_free; - } - } else - chunk_tp = NULL; - /* Send msg to the asoc */ - err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, chunk_tp, sinfo); - -out_free: + err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, transport, sinfo); if (err < 0 && err != -ESRCH && new_asoc) sctp_association_free(asoc); + out_unlock: release_sock(sk); out_nounlock: -- cgit v1.2.3-70-g09d2 From 007b7e18be74a49b61f89664966ac1477e1c9608 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 1 Mar 2018 23:05:17 +0800 Subject: sctp: improve some variables in sctp_sendmsg This patch mostly is to: - rename sinfo_flags as sflags, to make the indents look better, and also keep consistent with other sctp_sendmsg_xx functions. - replace new_asoc with bool new, no need to define a pointer here, as if new_asoc is set, it must be asoc. - rename the 'out_nounlock:' as 'out', shorter and nicer. - remove associd, only one place is using it now, just use sinfo->sinfo_assoc_id directly. - remove 'cmsgs' initialization in sctp_sendmsg, as it will be done in sctp_sendmsg_parse. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 426031095afe..a1c78fc19d95 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1910,28 +1910,28 @@ static void sctp_sendmsg_update_sinfo(struct sctp_association *asoc, static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) { struct sctp_endpoint *ep = sctp_sk(sk)->ep; - struct sctp_association *new_asoc = NULL, *asoc = NULL; struct sctp_transport *transport = NULL; struct sctp_sndrcvinfo _sinfo, *sinfo; - sctp_assoc_t associd = 0; - struct sctp_cmsgs cmsgs = { NULL }; - __u16 sinfo_flags = 0; + struct sctp_association *asoc; + struct sctp_cmsgs cmsgs; union sctp_addr *daddr; + bool new = false; + __u16 sflags; int err; /* Parse and get snd_info */ err = sctp_sendmsg_parse(sk, &cmsgs, &_sinfo, msg, msg_len); if (err) - goto out_nounlock; + goto out; sinfo = &_sinfo; - sinfo_flags = sinfo->sinfo_flags; + sflags = sinfo->sinfo_flags; /* Get daddr from msg */ daddr = sctp_sendmsg_get_daddr(sk, msg, &cmsgs); if (IS_ERR(daddr)) { err = PTR_ERR(daddr); - goto out_nounlock; + goto out; } lock_sock(sk); @@ -1941,7 +1941,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) /* Look for a matching association on the endpoint. */ asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport); } else { - asoc = sctp_id2assoc(sk, associd); + asoc = sctp_id2assoc(sk, sinfo->sinfo_assoc_id); if (!asoc) { err = -EPIPE; goto out_unlock; @@ -1949,24 +1949,23 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) } if (asoc) { - err = sctp_sendmsg_check_sflags(asoc, sinfo_flags, msg, - msg_len); + err = sctp_sendmsg_check_sflags(asoc, sflags, msg, msg_len); if (err <= 0) goto out_unlock; } /* Do we need to create the association? */ if (!asoc) { - err = sctp_sendmsg_new_asoc(sk, sinfo_flags, &cmsgs, daddr, + err = sctp_sendmsg_new_asoc(sk, sflags, &cmsgs, daddr, &transport); if (err) goto out_unlock; asoc = transport->asoc; - new_asoc = asoc; + new = true; } - if (!sctp_style(sk, TCP) && !(sinfo_flags & SCTP_ADDR_OVER)) + if (!sctp_style(sk, TCP) && !(sflags & SCTP_ADDR_OVER)) transport = NULL; /* Update snd_info with the asoc */ @@ -1974,12 +1973,12 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) /* Send msg to the asoc */ err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, transport, sinfo); - if (err < 0 && err != -ESRCH && new_asoc) + if (err < 0 && err != -ESRCH && new) sctp_association_free(asoc); out_unlock: release_sock(sk); -out_nounlock: +out: return sctp_error(sk, msg->msg_flags, err); } -- cgit v1.2.3-70-g09d2 From 0a3920d28b5490e6c90110156135dbc12c7fc413 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 1 Mar 2018 23:05:18 +0800 Subject: sctp: adjust some codes in a better order in sctp_sendmsg sctp_sendmsg_new_asoc and SCTP_ADDR_OVER check is only necessary when daddr is set, so move them up to if (daddr) statement. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index a1c78fc19d95..7fa76031bb08 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1936,38 +1936,38 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) lock_sock(sk); - /* If a msg_name has been specified, assume this is to be used. */ + /* Get and check or create asoc */ if (daddr) { - /* Look for a matching association on the endpoint. */ asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport); + if (asoc) { + err = sctp_sendmsg_check_sflags(asoc, sflags, msg, + msg_len); + if (err <= 0) + goto out_unlock; + } else { + err = sctp_sendmsg_new_asoc(sk, sflags, &cmsgs, daddr, + &transport); + if (err) + goto out_unlock; + + asoc = transport->asoc; + new = true; + } + + if (!sctp_style(sk, TCP) && !(sflags & SCTP_ADDR_OVER)) + transport = NULL; } else { asoc = sctp_id2assoc(sk, sinfo->sinfo_assoc_id); if (!asoc) { err = -EPIPE; goto out_unlock; } - } - if (asoc) { err = sctp_sendmsg_check_sflags(asoc, sflags, msg, msg_len); if (err <= 0) goto out_unlock; } - /* Do we need to create the association? */ - if (!asoc) { - err = sctp_sendmsg_new_asoc(sk, sflags, &cmsgs, daddr, - &transport); - if (err) - goto out_unlock; - - asoc = transport->asoc; - new = true; - } - - if (!sctp_style(sk, TCP) && !(sflags & SCTP_ADDR_OVER)) - transport = NULL; - /* Update snd_info with the asoc */ sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs); -- cgit v1.2.3-70-g09d2 From 7efc0b6b666d757e07417f59397e7f5f340e74e0 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 2 Mar 2018 08:32:12 -0800 Subject: net/ipv4: Pass net to fib_multipath_hash instead of fib_info fib_multipath_hash only needs net struct to check a sysctl. Make it clear by passing net instead of fib_info. In the end this allows alignment between the ipv4 and ipv6 versions. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 +- net/ipv4/fib_semantics.c | 2 +- net/ipv4/route.c | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 8812582a94d5..7c7522e8585b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -395,7 +395,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local); int fib_sync_up(struct net_device *dev, unsigned int nh_flags); #ifdef CONFIG_IP_ROUTE_MULTIPATH -int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, +int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys); #endif void fib_select_multipath(struct fib_result *res, int hash); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 181b0d8d589c..e7c602c600ac 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1770,7 +1770,7 @@ void fib_select_path(struct net *net, struct fib_result *res, #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi->fib_nhs > 1) { - int h = fib_multipath_hash(res->fi, fl4, skb, NULL); + int h = fib_multipath_hash(net, fl4, skb, NULL); fib_select_multipath(res, h); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3bb686dac273..1689c569bbc3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1782,10 +1782,9 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb, } /* if skb is set it will be used and fl4 can be NULL */ -int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, +int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) { - struct net *net = fi->fib_net; struct flow_keys hash_keys; u32 mhash; @@ -1852,7 +1851,7 @@ static int ip_mkroute_input(struct sk_buff *skb, { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) { - int h = fib_multipath_hash(res->fi, NULL, skb, hkeys); + int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h); } -- cgit v1.2.3-70-g09d2 From 6f74b6c259158d89ad258cda8e86240e01052884 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 2 Mar 2018 08:32:13 -0800 Subject: net: Align ip_multipath_l3_keys and ip6_multipath_l3_keys Symmetry is good and allows easy comparison that ipv4 and ipv6 are doing the same thing. To that end, change ip_multipath_l3_keys to set addresses at the end after the icmp compares, and move the initialization of ipv6 flow keys to rt6_multipath_hash. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/route.c | 20 +++++++++++--------- net/ipv6/route.c | 4 ++-- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1689c569bbc3..df57bc68e8e0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1748,37 +1748,39 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb, struct flow_keys *hash_keys) { const struct iphdr *outer_iph = ip_hdr(skb); + const struct iphdr *key_iph = outer_iph; const struct iphdr *inner_iph; const struct icmphdr *icmph; struct iphdr _inner_iph; struct icmphdr _icmph; - hash_keys->addrs.v4addrs.src = outer_iph->saddr; - hash_keys->addrs.v4addrs.dst = outer_iph->daddr; if (likely(outer_iph->protocol != IPPROTO_ICMP)) - return; + goto out; if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) - return; + goto out; icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), &_icmph); if (!icmph) - return; + goto out; if (icmph->type != ICMP_DEST_UNREACH && icmph->type != ICMP_REDIRECT && icmph->type != ICMP_TIME_EXCEEDED && icmph->type != ICMP_PARAMETERPROB) - return; + goto out; inner_iph = skb_header_pointer(skb, outer_iph->ihl * 4 + sizeof(_icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) - return; - hash_keys->addrs.v4addrs.src = inner_iph->saddr; - hash_keys->addrs.v4addrs.dst = inner_iph->daddr; + goto out; + + key_iph = inner_iph; +out: + hash_keys->addrs.v4addrs.src = key_iph->saddr; + hash_keys->addrs.v4addrs.dst = key_iph->daddr; } /* if skb is set it will be used and fl4 can be NULL */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e2bb40824c85..190d9690dfe0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1815,8 +1815,6 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb, key_iph = inner_iph; _flkeys = NULL; out: - memset(keys, 0, sizeof(*keys)); - keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (_flkeys) { keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; @@ -1837,6 +1835,8 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys hash_keys; if (skb) { + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; ip6_multipath_l3_keys(skb, &hash_keys, flkeys); return flow_hash_from_keys(&hash_keys) >> 1; } -- cgit v1.2.3-70-g09d2 From ec7127a5d53d891b77a11433026dca72c57eaf88 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 2 Mar 2018 08:32:14 -0800 Subject: net/ipv4: Simplify fib_multipath_hash with optional flow keys As of commit e37b1e978bec5 ("ipv6: route: dissect flow in input path if fib rules need it") fib_multipath_hash takes an optional flow keys. If non-NULL it means the skb has already been dissected. If not set, then fib_multipath_hash needs to call skb_flow_dissect_flow_keys. Simplify the logic by setting flkeys to the local stack variable keys. Simplifies fib_multipath_hash by only have 1 set of instructions setting hash_keys. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv4/route.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index df57bc68e8e0..6a7b3cba3972 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1810,24 +1810,20 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, /* short-circuit if we already have L4 hash present */ if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; + memset(&hash_keys, 0, sizeof(hash_keys)); - if (flkeys) { - hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; - hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; - hash_keys.ports.src = flkeys->ports.src; - hash_keys.ports.dst = flkeys->ports.dst; - hash_keys.basic.ip_proto = flkeys->basic.ip_proto; - } else { + if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); - hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; - hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; - hash_keys.ports.src = keys.ports.src; - hash_keys.ports.dst = keys.ports.dst; - hash_keys.basic.ip_proto = keys.basic.ip_proto; + flkeys = &keys; } + + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; + hash_keys.ports.src = flkeys->ports.src; + hash_keys.ports.dst = flkeys->ports.dst; + hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; -- cgit v1.2.3-70-g09d2 From 9a2a537acc75dfd19c4358bc9cb6042bdc60698c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 2 Mar 2018 08:32:15 -0800 Subject: net/ipv6: Make rt6_multipath_hash similar to fib_multipath_hash Make rt6_multipath_hash more of a direct parallel to fib_multipath_hash and reduce stack and overhead in the process: get_hash_from_flowi6 is just a wrapper around __get_hash_from_flowi6 with another stack allocation for flow_keys. Move setting the addresses, protocol and label into rt6_multipath_hash and allow it to make the call to flow_hash_from_keys. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/ipv6/route.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 190d9690dfe0..5c89af2c54f4 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1833,15 +1833,21 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *flkeys) { struct flow_keys hash_keys; + u32 mhash; + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (skb) { - memset(&hash_keys, 0, sizeof(hash_keys)); - hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; ip6_multipath_l3_keys(skb, &hash_keys, flkeys); - return flow_hash_from_keys(&hash_keys) >> 1; + } else { + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; + hash_keys.basic.ip_proto = fl6->flowi6_proto; } + mhash = flow_hash_from_keys(&hash_keys); - return get_hash_from_flowi6(fl6) >> 1; + return mhash >> 1; } void ip6_route_input(struct sk_buff *skb) -- cgit v1.2.3-70-g09d2 From 3192dac64c73d8c0eb4274a3da23d829fb5177af Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 2 Mar 2018 08:32:16 -0800 Subject: net: Rename NETEVENT_MULTIPATH_HASH_UPDATE Rename NETEVENT_MULTIPATH_HASH_UPDATE to NETEVENT_IPV4_MPATH_HASH_UPDATE to denote it relates to a change in the IPv4 hash policy. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +- include/net/netevent.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 69f16c605b9d..93d48c1b2bf8 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2430,7 +2430,7 @@ static int mlxsw_sp_router_netevent_event(struct notifier_block *nb, mlxsw_core_schedule_work(&net_work->work); mlxsw_sp_port_dev_put(mlxsw_sp_port); break; - case NETEVENT_MULTIPATH_HASH_UPDATE: + case NETEVENT_IPV4_MPATH_HASH_UPDATE: net = ptr; if (!net_eq(net, &init_net)) diff --git a/include/net/netevent.h b/include/net/netevent.h index 40e7bab68490..baee605a94ab 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -26,7 +26,7 @@ enum netevent_notif_type { NETEVENT_NEIGH_UPDATE = 1, /* arg is struct neighbour ptr */ NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ - NETEVENT_MULTIPATH_HASH_UPDATE, /* arg is struct net ptr */ + NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */ }; int register_netevent_notifier(struct notifier_block *nb); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 89683d868b37..011de9a20ec6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -400,7 +400,7 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) - call_netevent_notifiers(NETEVENT_MULTIPATH_HASH_UPDATE, net); + call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); return ret; } -- cgit v1.2.3-70-g09d2 From b75cc8f90f07342467b3bd51dbc0054f185032c9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 2 Mar 2018 08:32:17 -0800 Subject: net/ipv6: Pass skb to route lookup IPv6 does path selection for multipath routes deep in the lookup functions. The next patch adds L4 hash option and needs the skb for the forward path. To get the skb to the relevant FIB lookup functions it needs to go through the fib rules layer, so add a lookup_data argument to the fib_lookup_arg struct. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- drivers/infiniband/core/cma.c | 2 +- drivers/net/ipvlan/ipvlan_core.c | 3 +- drivers/net/vrf.c | 7 ++-- include/net/fib_rules.h | 1 + include/net/ip6_fib.h | 4 ++- include/net/ip6_route.h | 11 +++--- net/ipv6/anycast.c | 2 +- net/ipv6/fib6_rules.c | 8 +++-- net/ipv6/icmp.c | 3 +- net/ipv6/ip6_fib.c | 3 +- net/ipv6/ip6_gre.c | 2 +- net/ipv6/ip6_tunnel.c | 4 +-- net/ipv6/ip6_vti.c | 2 +- net/ipv6/mcast.c | 4 +-- net/ipv6/netfilter/ip6t_rpfilter.c | 2 +- net/ipv6/netfilter/nft_fib_ipv6.c | 3 +- net/ipv6/route.c | 72 +++++++++++++++++++++++--------------- net/ipv6/seg6_local.c | 4 +-- 18 files changed, 83 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 3ae32d1ddd27..915bbd867b61 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1334,7 +1334,7 @@ static bool validate_ipv6_net_dev(struct net_device *net_dev, IPV6_ADDR_LINKLOCAL; struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, &src_addr->sin6_addr, net_dev->ifindex, - strict); + NULL, strict); bool ret; if (!rt) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 17daebd19e65..1a8132eb2a3e 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -817,7 +817,8 @@ struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb, }; skb_dst_drop(skb); - dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, flags); + dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6, + skb, flags); skb_dst_set(skb, dst); break; } diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index e459e601c57f..c6be49d3a9eb 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -941,6 +941,7 @@ static struct rt6_info *vrf_ip6_route_lookup(struct net *net, const struct net_device *dev, struct flowi6 *fl6, int ifindex, + const struct sk_buff *skb, int flags) { struct net_vrf *vrf = netdev_priv(dev); @@ -959,7 +960,7 @@ static struct rt6_info *vrf_ip6_route_lookup(struct net *net, if (!table) return NULL; - return ip6_pol_route(net, table, ifindex, fl6, flags); + return ip6_pol_route(net, table, ifindex, fl6, skb, flags); } static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, @@ -977,7 +978,7 @@ static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, struct net *net = dev_net(vrf_dev); struct rt6_info *rt6; - rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, + rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb, RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE); if (unlikely(!rt6)) return; @@ -1110,7 +1111,7 @@ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, if (!ipv6_addr_any(&fl6->saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; - rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, flags); + rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags); if (rt) dst = &rt->dst; diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 1c9e17c11953..e5cfcfc7dd93 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -47,6 +47,7 @@ struct fib_rule { struct fib_lookup_arg { void *lookup_ptr; + const void *lookup_data; void *result; struct fib_rule *rule; u32 table; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 8d906a35b534..5e86fd9dc857 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -350,7 +350,8 @@ struct fib6_table { typedef struct rt6_info *(*pol_lookup_t)(struct net *, struct fib6_table *, - struct flowi6 *, int); + struct flowi6 *, + const struct sk_buff *, int); struct fib6_entry_notifier_info { struct fib_notifier_info info; /* must be first */ @@ -364,6 +365,7 @@ struct fib6_entry_notifier_info { struct fib6_table *fib6_get_table(struct net *net, u32 id); struct fib6_table *fib6_new_table(struct net *net, u32 id); struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + const struct sk_buff *skb, int flags, pol_lookup_t lookup); struct fib6_node *fib6_lookup(struct fib6_node *root, diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index da2bde5fda8f..9594f9317952 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -75,7 +75,8 @@ static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt) void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, - struct flowi6 *fl6, int flags); + struct flowi6 *fl6, + const struct sk_buff *skb, int flags); struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); @@ -88,9 +89,10 @@ static inline struct dst_entry *ip6_route_output(struct net *net, } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, - int flags); + const struct sk_buff *skb, int flags); struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, - int ifindex, struct flowi6 *fl6, int flags); + int ifindex, struct flowi6 *fl6, + const struct sk_buff *skb, int flags); void ip6_route_init_special_entries(void); int ip6_route_init(void); @@ -126,7 +128,8 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, } struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, - const struct in6_addr *saddr, int oif, int flags); + const struct in6_addr *saddr, int oif, + const struct sk_buff *skb, int flags); u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *hkeys); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index d7d0abc7fd0e..c61718dba2e6 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -78,7 +78,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (ifindex == 0) { struct rt6_info *rt; - rt = rt6_lookup(net, addr, NULL, 0, 0); + rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 04e5f523e50f..00ef9467f3c0 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -61,11 +61,13 @@ unsigned int fib6_rules_seq_read(struct net *net) } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + const struct sk_buff *skb, int flags, pol_lookup_t lookup) { if (net->ipv6.fib6_has_custom_rules) { struct fib_lookup_arg arg = { .lookup_ptr = lookup, + .lookup_data = skb, .flags = FIB_LOOKUP_NOREF, }; @@ -80,11 +82,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, } else { struct rt6_info *rt; - rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, flags); + rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put(rt); - rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); + rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put(rt); @@ -130,7 +132,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto out; } - rt = lookup(net, table, flp6, flags); + rt = lookup(net, table, flp6, arg->lookup_data, flags); if (rt != net->ipv6.ip6_null_entry) { struct fib6_rule *r = (struct fib6_rule *)rule; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index b0778d323b6e..a5d929223820 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -629,7 +629,8 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, skb_pull(skb2, nhs); skb_reset_network_header(skb2); - rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0); + rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, + skb, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index cab95cf3b39f..2f995e9e3050 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -299,11 +299,12 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id) } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; - rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); + rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put(rt); rt = net->ipv6.ip6_null_entry; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4f150a394387..83c7766c8c75 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1053,7 +1053,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, - p->link, strict); + p->link, NULL, strict); if (!rt) return; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 869e2e6750f7..1124f310df5a 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -679,7 +679,7 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* Try to guess incoming interface */ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, - NULL, 0, 0); + NULL, 0, skb2, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; @@ -1444,7 +1444,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, - p->link, strict); + p->link, NULL, strict); if (!rt) return; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index c617ea17faa8..a482b854eeea 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -645,7 +645,7 @@ static void vti6_link_config(struct ip6_tnl *t) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, - p->link, strict); + p->link, NULL, strict); if (rt) tdev = rt->dst.dev; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index d9bb933dd5c4..d1a0cefac273 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -165,7 +165,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) if (ifindex == 0) { struct rt6_info *rt; - rt = rt6_lookup(net, addr, NULL, 0, 0); + rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; ip6_rt_put(rt); @@ -254,7 +254,7 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, struct inet6_dev *idev = NULL; if (ifindex == 0) { - struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, 0); + struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, NULL, 0); if (rt) { dev = rt->dst.dev; diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index 94deb69bbbda..910a27318f58 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -53,7 +53,7 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, lookup_flags |= RT6_LOOKUP_F_IFACE; } - rt = (void *) ip6_route_lookup(net, &fl6, lookup_flags); + rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags); if (rt->dst.error) goto out; diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index cc5174c7254c..3230b3d7b11b 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -181,7 +181,8 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, *dest = 0; again: - rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, lookup_flags); + rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb, + lookup_flags); if (rt->dst.error) goto put_rt_err; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5c89af2c54f4..d2b8368663cb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -452,6 +452,7 @@ static bool rt6_check_expired(const struct rt6_info *rt) static struct rt6_info *rt6_multipath_select(struct rt6_info *match, struct flowi6 *fl6, int oif, + const struct sk_buff *skb, int strict) { struct rt6_info *sibling, *next_sibling; @@ -460,7 +461,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, * case it will always be non-zero. Otherwise now is the time to do it. */ if (!fl6->mp_hash) - fl6->mp_hash = rt6_multipath_hash(fl6, NULL, NULL); + fl6->mp_hash = rt6_multipath_hash(fl6, skb, NULL); if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) return match; @@ -914,7 +915,9 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, - struct flowi6 *fl6, int flags) + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { struct rt6_info *rt, *rt_cache; struct fib6_node *fn; @@ -929,8 +932,8 @@ restart: rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6, - fl6->flowi6_oif, flags); + rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, + skb, flags); } if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); @@ -954,14 +957,15 @@ restart: } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, - int flags) + const struct sk_buff *skb, int flags) { - return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup); + return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); } EXPORT_SYMBOL_GPL(ip6_route_lookup); struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, - const struct in6_addr *saddr, int oif, int strict) + const struct in6_addr *saddr, int oif, + const struct sk_buff *skb, int strict) { struct flowi6 fl6 = { .flowi6_oif = oif, @@ -975,7 +979,7 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, flags |= RT6_LOOKUP_F_HAS_SADDR; } - dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup); + dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); if (dst->error == 0) return (struct rt6_info *) dst; @@ -1647,7 +1651,8 @@ void rt6_age_exceptions(struct rt6_info *rt, } struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, int flags) + int oif, struct flowi6 *fl6, + const struct sk_buff *skb, int flags) { struct fib6_node *fn, *saved_fn; struct rt6_info *rt, *rt_cache; @@ -1669,7 +1674,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, redo_rt6_select: rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) - rt = rt6_multipath_select(rt, fl6, oif, strict); + rt = rt6_multipath_select(rt, fl6, oif, skb, strict); if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) @@ -1768,20 +1773,25 @@ uncached_rt_out: } EXPORT_SYMBOL_GPL(ip6_pol_route); -static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, - struct flowi6 *fl6, int flags) +static struct rt6_info *ip6_pol_route_input(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { - return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags); + return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); } struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, - struct flowi6 *fl6, int flags) + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; - return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input); + return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); } EXPORT_SYMBOL_GPL(ip6_route_input_lookup); @@ -1876,13 +1886,17 @@ void ip6_route_input(struct sk_buff *skb) if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) fl6.mp_hash = rt6_multipath_hash(&fl6, skb, flkeys); skb_dst_drop(skb); - skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); + skb_dst_set(skb, + ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); } -static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, - struct flowi6 *fl6, int flags) +static struct rt6_info *ip6_pol_route_output(struct net *net, + struct fib6_table *table, + struct flowi6 *fl6, + const struct sk_buff *skb, + int flags) { - return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags); + return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, @@ -1910,7 +1924,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, else if (sk) flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); - return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output); + return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } EXPORT_SYMBOL_GPL(ip6_route_output_flags); @@ -2159,6 +2173,7 @@ struct ip6rd_flowi { static struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, + const struct sk_buff *skb, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; @@ -2232,8 +2247,9 @@ out: }; static struct dst_entry *ip6_route_redirect(struct net *net, - const struct flowi6 *fl6, - const struct in6_addr *gateway) + const struct flowi6 *fl6, + const struct sk_buff *skb, + const struct in6_addr *gateway) { int flags = RT6_LOOKUP_F_HAS_SADDR; struct ip6rd_flowi rdfl; @@ -2241,7 +2257,7 @@ static struct dst_entry *ip6_route_redirect(struct net *net, rdfl.fl6 = *fl6; rdfl.gateway = *gateway; - return fib6_rule_lookup(net, &rdfl.fl6, + return fib6_rule_lookup(net, &rdfl.fl6, skb, flags, __ip6_route_redirect); } @@ -2261,7 +2277,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, fl6.flowlabel = ip6_flowinfo(iph); fl6.flowi6_uid = uid; - dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr); + dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } @@ -2283,7 +2299,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, fl6.saddr = iph->daddr; fl6.flowi6_uid = sock_net_uid(net, NULL); - dst = ip6_route_redirect(net, &fl6, &iph->saddr); + dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } @@ -2485,7 +2501,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, flags |= RT6_LOOKUP_F_HAS_SADDR; flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; - rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); + rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); /* if table lookup failed, fall back to full lookup */ if (rt == net->ipv6.ip6_null_entry) { @@ -2548,7 +2564,7 @@ static int ip6_route_check_nh(struct net *net, } if (!grt) - grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); + grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); if (!grt) goto out; @@ -4613,7 +4629,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (!ipv6_addr_any(&fl6.saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; - dst = ip6_route_input_lookup(net, dev, &fl6, flags); + dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); rcu_read_unlock(); } else { diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index ba3767ef5e93..45722327375a 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -161,7 +161,7 @@ static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; if (!tbl_id) { - dst = ip6_route_input_lookup(net, skb->dev, &fl6, flags); + dst = ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags); } else { struct fib6_table *table; @@ -169,7 +169,7 @@ static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, if (!table) goto out; - rt = ip6_pol_route(net, table, 0, &fl6, flags); + rt = ip6_pol_route(net, table, 0, &fl6, skb, flags); dst = &rt->dst; } -- cgit v1.2.3-70-g09d2 From b4bac172e90ce4a93df8adf44eb70d91b9d611eb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 2 Mar 2018 08:32:18 -0800 Subject: net/ipv6: Add support for path selection using hash of 5-tuple Some operators prefer IPv6 path selection to use a standard 5-tuple hash rather than just an L3 hash with the flow the label. To that end add support to IPv6 for multipath hash policy similar to bf4e0a3db97eb ("net: ipv4: add support for ECMP hash policy choice"). The default is still L3 which covers source and destination addresses along with flow label and IPv6 protocol. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 ++++ include/net/ip6_route.h | 4 +- include/net/netevent.h | 1 + include/net/netns/ipv6.h | 1 + net/ipv6/icmp.c | 2 +- net/ipv6/route.c | 68 ++++++++++++++++++++++++++-------- net/ipv6/sysctl_net_ipv6.c | 27 ++++++++++++++ 7 files changed, 91 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index a553d4e4a0fb..783675a730e5 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1363,6 +1363,13 @@ flowlabel_reflect - BOOLEAN FALSE: disabled Default: FALSE +fib_multipath_hash_policy - INTEGER + Controls which hash policy to use for multipath routes. + Default: 0 (Layer 3) + Possible values: + 0 - Layer 3 (source and destination addresses plus flow label) + 1 - Layer 4 (standard 5-tuple) + anycast_src_echo_reply - BOOLEAN Controls the use of anycast addresses as source addresses for ICMPv6 echo reply diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9594f9317952..ce2abc0ff102 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -130,8 +130,8 @@ static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, const struct sk_buff *skb, int flags); -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, - struct flow_keys *hkeys); +u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, + const struct sk_buff *skb, struct flow_keys *hkeys); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); diff --git a/include/net/netevent.h b/include/net/netevent.h index baee605a94ab..d9918261701c 100644 --- a/include/net/netevent.h +++ b/include/net/netevent.h @@ -27,6 +27,7 @@ enum netevent_notif_type { NETEVENT_REDIRECT, /* arg is struct netevent_redirect ptr */ NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */ NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */ + NETEVENT_IPV6_MPATH_HASH_UPDATE, /* arg is struct net ptr */ }; int register_netevent_notifier(struct notifier_block *nb); diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index e286fda09fcf..5b51110435fc 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -28,6 +28,7 @@ struct netns_sysctl_ipv6 { int ip6_rt_gc_elasticity; int ip6_rt_mtu_expires; int ip6_rt_min_advmss; + int multipath_hash_policy; int flowlabel_consistency; int auto_flowlabels; int icmpv6_time; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a5d929223820..6f84668be6ea 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -522,7 +522,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); - fl6.mp_hash = rt6_multipath_hash(&fl6, skb, NULL); + fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); sk = icmpv6_xmit_lock(net); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d2b8368663cb..f0ae58424c45 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -450,7 +450,8 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -static struct rt6_info *rt6_multipath_select(struct rt6_info *match, +static struct rt6_info *rt6_multipath_select(const struct net *net, + struct rt6_info *match, struct flowi6 *fl6, int oif, const struct sk_buff *skb, int strict) @@ -461,7 +462,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, * case it will always be non-zero. Otherwise now is the time to do it. */ if (!fl6->mp_hash) - fl6->mp_hash = rt6_multipath_hash(fl6, skb, NULL); + fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound)) return match; @@ -932,7 +933,7 @@ restart: rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, + rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif, skb, flags); } if (rt == net->ipv6.ip6_null_entry) { @@ -1674,7 +1675,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, redo_rt6_select: rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) - rt = rt6_multipath_select(rt, fl6, oif, skb, strict); + rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict); if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) @@ -1839,21 +1840,56 @@ out: } /* if skb is set it will be used and fl6 can be NULL */ -u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb, - struct flow_keys *flkeys) +u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, + const struct sk_buff *skb, struct flow_keys *flkeys) { struct flow_keys hash_keys; u32 mhash; - memset(&hash_keys, 0, sizeof(hash_keys)); - hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; - if (skb) { - ip6_multipath_l3_keys(skb, &hash_keys, flkeys); - } else { - hash_keys.addrs.v6addrs.src = fl6->saddr; - hash_keys.addrs.v6addrs.dst = fl6->daddr; - hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; - hash_keys.basic.ip_proto = fl6->flowi6_proto; + switch (net->ipv6.sysctl.multipath_hash_policy) { + case 0: + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (skb) { + ip6_multipath_l3_keys(skb, &hash_keys, flkeys); + } else { + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.tags.flow_label = (__force u32)fl6->flowlabel; + hash_keys.basic.ip_proto = fl6->flowi6_proto; + } + break; + case 1: + if (skb) { + unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; + struct flow_keys keys; + + /* short-circuit if we already have L4 hash present */ + if (skb->l4_hash) + return skb_get_hash_raw(skb) >> 1; + + memset(&hash_keys, 0, sizeof(hash_keys)); + + if (!flkeys) { + skb_flow_dissect_flow_keys(skb, &keys, flag); + flkeys = &keys; + } + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; + hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; + hash_keys.ports.src = flkeys->ports.src; + hash_keys.ports.dst = flkeys->ports.dst; + hash_keys.basic.ip_proto = flkeys->basic.ip_proto; + } else { + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.ports.src = fl6->fl6_sport; + hash_keys.ports.dst = fl6->fl6_dport; + hash_keys.basic.ip_proto = fl6->flowi6_proto; + } + break; } mhash = flow_hash_from_keys(&hash_keys); @@ -1884,7 +1920,7 @@ void ip6_route_input(struct sk_buff *skb) flkeys = &_flkeys; if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) - fl6.mp_hash = rt6_multipath_hash(&fl6, skb, flkeys); + fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 262f791f1b9b..966c42af92f4 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -16,14 +16,31 @@ #include #include #include +#include #ifdef CONFIG_NETLABEL #include #endif +static int zero; static int one = 1; static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; +static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct net *net; + int ret; + + net = container_of(table->data, struct net, + ipv6.sysctl.multipath_hash_policy); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); + + return ret; +} static struct ctl_table ipv6_table_template[] = { { @@ -126,6 +143,15 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "fib_multipath_hash_policy", + .data = &init_net.ipv6.sysctl.multipath_hash_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_rt6_multipath_hash_policy, + .extra1 = &zero, + .extra2 = &one, + }, { } }; @@ -190,6 +216,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt; ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len; ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len; + ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy, ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) -- cgit v1.2.3-70-g09d2 From de7a0f871fabe74fff7481caf7d3efe03b58fe58 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 2 Mar 2018 08:32:20 -0800 Subject: net: Remove unused get_hash_from_flow functions __get_hash_from_flowi6 is still used for flowlabels, but the IPv4 variant and the wrappers to both are not used. Remove them. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/flow.h | 16 ---------------- net/core/flow_dissector.c | 16 ---------------- 2 files changed, 32 deletions(-) (limited to 'net') diff --git a/include/net/flow.h b/include/net/flow.h index 64e7ee9cb980..8ce21793094e 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -222,20 +222,4 @@ static inline unsigned int flow_key_size(u16 family) __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys); -static inline __u32 get_hash_from_flowi6(const struct flowi6 *fl6) -{ - struct flow_keys keys; - - return __get_hash_from_flowi6(fl6, &keys); -} - -__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys); - -static inline __u32 get_hash_from_flowi4(const struct flowi4 *fl4) -{ - struct flow_keys keys; - - return __get_hash_from_flowi4(fl4, &keys); -} - #endif diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 559db9ea8d86..d29f09bc5ff9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1341,22 +1341,6 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) } EXPORT_SYMBOL(__get_hash_from_flowi6); -__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys) -{ - memset(keys, 0, sizeof(*keys)); - - keys->addrs.v4addrs.src = fl4->saddr; - keys->addrs.v4addrs.dst = fl4->daddr; - keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - keys->ports.src = fl4->fl4_sport; - keys->ports.dst = fl4->fl4_dport; - keys->keyid.keyid = fl4->fl4_gre_key; - keys->basic.ip_proto = fl4->flowi4_proto; - - return flow_hash_from_keys(keys); -} -EXPORT_SYMBOL(__get_hash_from_flowi4); - static const struct flow_dissector_key flow_keys_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, -- cgit v1.2.3-70-g09d2 From 88c060549a4c555d59965801d1e811b71614c2b7 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 1 Mar 2018 02:02:27 +0100 Subject: dsa: Pass the port to get_sset_count() By passing the port, we allow different ports to have different statistics. This is useful since some ports have SERDES interfaces with their own statistic counters. Signed-off-by: Andrew Lunn Tested-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 2 +- drivers/net/dsa/b53/b53_priv.h | 2 +- drivers/net/dsa/dsa_loop.c | 2 +- drivers/net/dsa/lan9303-core.c | 2 +- drivers/net/dsa/microchip/ksz_common.c | 2 +- drivers/net/dsa/mt7530.c | 2 +- drivers/net/dsa/mv88e6xxx/chip.c | 2 +- drivers/net/dsa/qca8k.c | 2 +- include/net/dsa.h | 2 +- net/dsa/master.c | 4 ++-- net/dsa/slave.c | 2 +- 11 files changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index db830a1141d9..cd16067265dd 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -852,7 +852,7 @@ void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data) } EXPORT_SYMBOL(b53_get_ethtool_stats); -int b53_get_sset_count(struct dsa_switch *ds) +int b53_get_sset_count(struct dsa_switch *ds, int port) { struct b53_device *dev = ds->priv; diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index d954cf36ecd8..1187ebd79287 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -288,7 +288,7 @@ void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port); int b53_configure_vlan(struct dsa_switch *ds); void b53_get_strings(struct dsa_switch *ds, int port, uint8_t *data); void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data); -int b53_get_sset_count(struct dsa_switch *ds); +int b53_get_sset_count(struct dsa_switch *ds, int port); int b53_br_join(struct dsa_switch *ds, int port, struct net_device *bridge); void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *bridge); void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state); diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index 7aa84ee4e771..f77be9f85cb3 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -86,7 +86,7 @@ static int dsa_loop_setup(struct dsa_switch *ds) return 0; } -static int dsa_loop_get_sset_count(struct dsa_switch *ds) +static int dsa_loop_get_sset_count(struct dsa_switch *ds, int port) { return __DSA_LOOP_CNT_MAX; } diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c index 6171c0853ff1..fefa454f3e56 100644 --- a/drivers/net/dsa/lan9303-core.c +++ b/drivers/net/dsa/lan9303-core.c @@ -1007,7 +1007,7 @@ static void lan9303_get_ethtool_stats(struct dsa_switch *ds, int port, } } -static int lan9303_get_sset_count(struct dsa_switch *ds) +static int lan9303_get_sset_count(struct dsa_switch *ds, int port) { return ARRAY_SIZE(lan9303_mib); } diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 663b0d5b982b..bcb3e6c734f2 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -439,7 +439,7 @@ static void ksz_disable_port(struct dsa_switch *ds, int port, ksz_port_cfg(dev, port, REG_PORT_CTRL_0, PORT_MAC_LOOPBACK, true); } -static int ksz_sset_count(struct dsa_switch *ds) +static int ksz_sset_count(struct dsa_switch *ds, int port) { return TOTAL_SWITCH_COUNTER_NUM; } diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 8a0bb000d056..511ca134f13f 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -604,7 +604,7 @@ mt7530_get_ethtool_stats(struct dsa_switch *ds, int port, } static int -mt7530_get_sset_count(struct dsa_switch *ds) +mt7530_get_sset_count(struct dsa_switch *ds, int port) { return ARRAY_SIZE(mt7530_mib); } diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 24486f96dd39..8c9a30d1b06f 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -754,7 +754,7 @@ static int mv88e6320_stats_get_sset_count(struct mv88e6xxx_chip *chip) STATS_TYPE_BANK1); } -static int mv88e6xxx_get_sset_count(struct dsa_switch *ds) +static int mv88e6xxx_get_sset_count(struct dsa_switch *ds, int port) { struct mv88e6xxx_chip *chip = ds->priv; diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c index 9df22ebee822..600d5ad1fbde 100644 --- a/drivers/net/dsa/qca8k.c +++ b/drivers/net/dsa/qca8k.c @@ -631,7 +631,7 @@ qca8k_get_ethtool_stats(struct dsa_switch *ds, int port, } static int -qca8k_get_sset_count(struct dsa_switch *ds) +qca8k_get_sset_count(struct dsa_switch *ds, int port) { return ARRAY_SIZE(ar8327_mib); } diff --git a/include/net/dsa.h b/include/net/dsa.h index 0ad17b63684d..60fb4ec8ba61 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -359,7 +359,7 @@ struct dsa_switch_ops { void (*get_strings)(struct dsa_switch *ds, int port, uint8_t *data); void (*get_ethtool_stats)(struct dsa_switch *ds, int port, uint64_t *data); - int (*get_sset_count)(struct dsa_switch *ds); + int (*get_sset_count)(struct dsa_switch *ds, int port); /* * ethtool Wake-on-LAN diff --git a/net/dsa/master.c b/net/dsa/master.c index 00589147f042..90e6df0351eb 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -42,7 +42,7 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset) count += ops->get_sset_count(dev, sset); if (sset == ETH_SS_STATS && ds->ops->get_sset_count) - count += ds->ops->get_sset_count(ds); + count += ds->ops->get_sset_count(ds, cpu_dp->index); return count; } @@ -76,7 +76,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset, * constructed earlier */ ds->ops->get_strings(ds, port, ndata); - count = ds->ops->get_sset_count(ds); + count = ds->ops->get_sset_count(ds, port); for (i = 0; i < count; i++) { memmove(ndata + (i * len + sizeof(pfx)), ndata + i * len, len - sizeof(pfx)); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 3376dad6dcfd..18561af7a8f1 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -605,7 +605,7 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) count = 4; if (ds->ops->get_sset_count) - count += ds->ops->get_sset_count(ds); + count += ds->ops->get_sset_count(ds, dp->index); return count; } -- cgit v1.2.3-70-g09d2 From 6b2536039f653e35c96e7461a7456d2246331462 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Sun, 4 Mar 2018 21:02:18 +0100 Subject: batman-adv: Avoid redundant multicast TT entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a node signals that it wants all traffic for a specific protocol family then there is no need to announce individual multicast addresses via TT. Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/multicast.c | 56 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 6eaffe50335a..c7a1305ca7e7 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -101,8 +101,37 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) return upper; } +/** + * batadv_mcast_addr_is_ipv4() - check if multicast MAC is IPv4 + * @addr: the MAC address to check + * + * Return: True, if MAC address is one reserved for IPv4 multicast, false + * otherwise. + */ +static bool batadv_mcast_addr_is_ipv4(const u8 *addr) +{ + static const u8 prefix[] = {0x01, 0x00, 0x5E}; + + return memcmp(prefix, addr, sizeof(prefix)) == 0; +} + +/** + * batadv_mcast_addr_is_ipv6() - check if multicast MAC is IPv6 + * @addr: the MAC address to check + * + * Return: True, if MAC address is one reserved for IPv6 multicast, false + * otherwise. + */ +static bool batadv_mcast_addr_is_ipv6(const u8 *addr) +{ + static const u8 prefix[] = {0x33, 0x33}; + + return memcmp(prefix, addr, sizeof(prefix)) == 0; +} + /** * batadv_mcast_mla_softif_get() - get softif multicast listeners + * @bat_priv: the bat priv with all the soft interface information * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @@ -119,9 +148,12 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ -static int batadv_mcast_mla_softif_get(struct net_device *dev, +static int batadv_mcast_mla_softif_get(struct batadv_priv *bat_priv, + struct net_device *dev, struct hlist_head *mcast_list) { + bool all_ipv4 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV4; + bool all_ipv6 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV6; struct net_device *bridge = batadv_mcast_get_bridge(dev); struct netdev_hw_addr *mc_list_entry; struct batadv_hw_addr *new; @@ -129,6 +161,12 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev, netif_addr_lock_bh(bridge ? bridge : dev); netdev_for_each_mc_addr(mc_list_entry, bridge ? bridge : dev) { + if (all_ipv4 && batadv_mcast_addr_is_ipv4(mc_list_entry->addr)) + continue; + + if (all_ipv6 && batadv_mcast_addr_is_ipv6(mc_list_entry->addr)) + continue; + new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; @@ -193,6 +231,7 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) /** * batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners + * @bat_priv: the bat priv with all the soft interface information * @dev: a bridge slave whose bridge to collect multicast addresses from * @mcast_list: a list to put found addresses into * @@ -204,10 +243,13 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ -static int batadv_mcast_mla_bridge_get(struct net_device *dev, +static int batadv_mcast_mla_bridge_get(struct batadv_priv *bat_priv, + struct net_device *dev, struct hlist_head *mcast_list) { struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); + bool all_ipv4 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV4; + bool all_ipv6 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV6; struct br_ip_list *br_ip_entry, *tmp; struct batadv_hw_addr *new; u8 mcast_addr[ETH_ALEN]; @@ -221,6 +263,12 @@ static int batadv_mcast_mla_bridge_get(struct net_device *dev, goto out; list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) { + if (all_ipv4 && br_ip_entry->addr.proto == htons(ETH_P_IP)) + continue; + + if (all_ipv6 && br_ip_entry->addr.proto == htons(ETH_P_IPV6)) + continue; + batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; @@ -568,11 +616,11 @@ static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv) if (!batadv_mcast_mla_tvlv_update(bat_priv)) goto update; - ret = batadv_mcast_mla_softif_get(soft_iface, &mcast_list); + ret = batadv_mcast_mla_softif_get(bat_priv, soft_iface, &mcast_list); if (ret < 0) goto out; - ret = batadv_mcast_mla_bridge_get(soft_iface, &mcast_list); + ret = batadv_mcast_mla_bridge_get(bat_priv, soft_iface, &mcast_list); if (ret < 0) goto out; -- cgit v1.2.3-70-g09d2 From 77a5196a804e34ce5e215ef84d5e1de332e9c529 Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 1 Mar 2018 13:49:57 -0800 Subject: gre: add sequence number for collect md mode. Currently GRE sequence number can only be used in native tunnel mode. This patch adds sequence number support for gre collect metadata mode. RFC2890 defines GRE sequence number to be specific to the traffic flow identified by the key. However, this patch does not implement per-key seqno. The sequence number is shared in the same tunnel device. That is, different tunnel keys using the same collect_md tunnel share single sequence number. Signed-off-by: William Tu Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 4 +++- net/ipv4/ip_gre.c | 7 +++++-- net/ipv6/ip6_gre.c | 13 ++++++++----- 4 files changed, 17 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index db6bdc375126..2a66769e5875 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -800,6 +800,7 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key flags. */ #define BPF_F_ZERO_CSUM_TX (1ULL << 1) #define BPF_F_DONT_FRAGMENT (1ULL << 2) +#define BPF_F_SEQ_NUMBER (1ULL << 3) /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. diff --git a/net/core/filter.c b/net/core/filter.c index 0c121adbdbaa..33edfa8372fd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2991,7 +2991,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | - BPF_F_DONT_FRAGMENT))) + BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { @@ -3025,6 +3025,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; if (flags & BPF_F_ZERO_CSUM_TX) info->key.tun_flags &= ~TUNNEL_CSUM; + if (flags & BPF_F_SEQ_NUMBER) + info->key.tun_flags |= TUNNEL_SEQ; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 0fe1d69b5df4..95fd225f402e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -522,6 +522,7 @@ err_free_skb: static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, __be16 proto) { + struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; struct rtable *rt = NULL; @@ -545,9 +546,11 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) goto err_free_rt; - flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + flags = tun_info->key.tun_flags & + (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); gre_build_header(skb, tunnel_hlen, flags, proto, - tunnel_id_to_key32(tun_info->key.tun_id), 0); + tunnel_id_to_key32(tun_info->key.tun_id), + (flags | TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 83c7766c8c75..18a3dfbd0300 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -695,9 +695,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, else fl6->daddr = tunnel->parms.raddr; - if (tunnel->parms.o_flags & TUNNEL_SEQ) - tunnel->o_seqno++; - /* Push GRE header. */ protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; @@ -720,14 +717,20 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = key->tos; - flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + flags = key->tun_flags & + (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); tunnel->tun_hlen = gre_calc_hlen(flags); gre_build_header(skb, tunnel->tun_hlen, flags, protocol, - tunnel_id_to_key32(tun_info->key.tun_id), 0); + tunnel_id_to_key32(tun_info->key.tun_id), + (flags | TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) + : 0); } else { + if (tunnel->parms.o_flags & TUNNEL_SEQ) + tunnel->o_seqno++; + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); -- cgit v1.2.3-70-g09d2 From d143b9e3055358e160d8d5975d889ce5c149625a Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Fri, 2 Mar 2018 20:52:01 -0500 Subject: net sched actions: corrected extack message Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- net/sched/act_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 1f65d6ada9ff..a54fa7b8c217 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1083,7 +1083,7 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_NEWACTION, 0, 0) <= 0) { - NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while deleting TC action"); + NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; } -- cgit v1.2.3-70-g09d2 From e6c6a92905210484a84a0cae5b013570b7a67b6f Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 4 Mar 2018 14:12:04 +0200 Subject: net: Make RX-FCS and LRO mutually exclusive LRO and RX-FCS offloads cannot be enabled at the same time since it is not clear what should happen to the FCS of each coalesced packet. The FCS is not really part of the TCP payload, hence cannot be merged into one big packet. On the other hand, providing one big LRO packet with one FCS contradicts the RX-FCS feature goal. Use the fix features mechanism in order to prevent intersection of the features and drop LRO in case RX-FCS is requested. Enabling RX-FCS while LRO is enabled will result in: $ ethtool -K ens6 rx-fcs on Actual changes: large-receive-offload: off [requested on] rx-fcs: on Signed-off-by: Gal Pressman Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 40fb3aed5df2..8b51f923ce99 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7542,6 +7542,12 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } + /* LRO feature cannot be combined with RX-FCS */ + if ((features & NETIF_F_LRO) && (features & NETIF_F_RXFCS)) { + netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); + features &= ~NETIF_F_LRO; + } + return features; } -- cgit v1.2.3-70-g09d2 From 87ecc95d81d951b0984f2eb9c5c118cb68d0dce8 Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Sun, 4 Mar 2018 10:38:35 -0800 Subject: tcp: add send queue size stat in SCM_TIMESTAMPING_OPT_STATS This patch adds TCP_NLA_SENDQ_SIZE stat into SCM_TIMESTAMPING_OPT_STATS. It reports no. of bytes present in send queue, when timestamp is generated. Signed-off-by: Priyaranjan Jha Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index b4a4f64635fa..93bad2128ef6 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -241,6 +241,7 @@ enum { TCP_NLA_MIN_RTT, /* minimum RTT */ TCP_NLA_RECUR_RETRANS, /* Recurring retransmits for the current pkt */ TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */ + TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a33539798bf6..162ba4227446 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3031,7 +3031,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u32 rate; stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 3 * nla_total_size(sizeof(u32)) + + 4 * nla_total_size(sizeof(u32)) + 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3061,6 +3061,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); + + nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); return stats; } -- cgit v1.2.3-70-g09d2 From be631892948060f44b1ceee3132be1266932071e Mon Sep 17 00:00:00 2001 From: Priyaranjan Jha Date: Sun, 4 Mar 2018 10:38:36 -0800 Subject: tcp: add ca_state stat in SCM_TIMESTAMPING_OPT_STATS This patch adds TCP_NLA_CA_STATE stat into SCM_TIMESTAMPING_OPT_STATS. It reports ca_state of socket, when timestamp is generated. Signed-off-by: Priyaranjan Jha Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 93bad2128ef6..4c0ae0faf7ca 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -242,6 +242,7 @@ enum { TCP_NLA_RECUR_RETRANS, /* Recurring retransmits for the current pkt */ TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */ TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ + TCP_NLA_CA_STATE, /* ca_state of socket */ }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 162ba4227446..fb350f740f69 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3032,7 +3032,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + 4 * nla_total_size(sizeof(u32)) + - 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC); + 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3063,6 +3063,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); + nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); return stats; } -- cgit v1.2.3-70-g09d2 From 955dc68cb9b23b42999cafe6df3684309bc686c6 Mon Sep 17 00:00:00 2001 From: Samuel Mendoza-Jonas Date: Mon, 5 Mar 2018 11:39:05 +1100 Subject: net/ncsi: Add generic netlink family Add a generic netlink family for NCSI. This supports three commands; NCSI_CMD_PKG_INFO which returns information on packages and their associated channels, NCSI_CMD_SET_INTERFACE which allows a specific package or package/channel combination to be set as the preferred choice, and NCSI_CMD_CLEAR_INTERFACE which clears any preferred setting. Signed-off-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- include/uapi/linux/ncsi.h | 115 +++++++++++++ net/ncsi/Makefile | 2 +- net/ncsi/internal.h | 3 + net/ncsi/ncsi-manage.c | 30 +++- net/ncsi/ncsi-netlink.c | 421 ++++++++++++++++++++++++++++++++++++++++++++++ net/ncsi/ncsi-netlink.h | 20 +++ 6 files changed, 586 insertions(+), 5 deletions(-) create mode 100644 include/uapi/linux/ncsi.h create mode 100644 net/ncsi/ncsi-netlink.c create mode 100644 net/ncsi/ncsi-netlink.h (limited to 'net') diff --git a/include/uapi/linux/ncsi.h b/include/uapi/linux/ncsi.h new file mode 100644 index 000000000000..4c292ecbb748 --- /dev/null +++ b/include/uapi/linux/ncsi.h @@ -0,0 +1,115 @@ +/* + * Copyright Samuel Mendoza-Jonas, IBM Corporation 2018. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __UAPI_NCSI_NETLINK_H__ +#define __UAPI_NCSI_NETLINK_H__ + +/** + * enum ncsi_nl_commands - supported NCSI commands + * + * @NCSI_CMD_UNSPEC: unspecified command to catch errors + * @NCSI_CMD_PKG_INFO: list package and channel attributes. Requires + * NCSI_ATTR_IFINDEX. If NCSI_ATTR_PACKAGE_ID is specified returns the + * specific package and its channels - otherwise a dump request returns + * all packages and their associated channels. + * @NCSI_CMD_SET_INTERFACE: set preferred package and channel combination. + * Requires NCSI_ATTR_IFINDEX and the preferred NCSI_ATTR_PACKAGE_ID and + * optionally the preferred NCSI_ATTR_CHANNEL_ID. + * @NCSI_CMD_CLEAR_INTERFACE: clear any preferred package/channel combination. + * Requires NCSI_ATTR_IFINDEX. + * @NCSI_CMD_MAX: highest command number + */ +enum ncsi_nl_commands { + NCSI_CMD_UNSPEC, + NCSI_CMD_PKG_INFO, + NCSI_CMD_SET_INTERFACE, + NCSI_CMD_CLEAR_INTERFACE, + + __NCSI_CMD_AFTER_LAST, + NCSI_CMD_MAX = __NCSI_CMD_AFTER_LAST - 1 +}; + +/** + * enum ncsi_nl_attrs - General NCSI netlink attributes + * + * @NCSI_ATTR_UNSPEC: unspecified attributes to catch errors + * @NCSI_ATTR_IFINDEX: ifindex of network device using NCSI + * @NCSI_ATTR_PACKAGE_LIST: nested array of NCSI_PKG_ATTR attributes + * @NCSI_ATTR_PACKAGE_ID: package ID + * @NCSI_ATTR_CHANNEL_ID: channel ID + * @NCSI_ATTR_MAX: highest attribute number + */ +enum ncsi_nl_attrs { + NCSI_ATTR_UNSPEC, + NCSI_ATTR_IFINDEX, + NCSI_ATTR_PACKAGE_LIST, + NCSI_ATTR_PACKAGE_ID, + NCSI_ATTR_CHANNEL_ID, + + __NCSI_ATTR_AFTER_LAST, + NCSI_ATTR_MAX = __NCSI_ATTR_AFTER_LAST - 1 +}; + +/** + * enum ncsi_nl_pkg_attrs - NCSI netlink package-specific attributes + * + * @NCSI_PKG_ATTR_UNSPEC: unspecified attributes to catch errors + * @NCSI_PKG_ATTR: nested array of package attributes + * @NCSI_PKG_ATTR_ID: package ID + * @NCSI_PKG_ATTR_FORCED: flag signifying a package has been set as preferred + * @NCSI_PKG_ATTR_CHANNEL_LIST: nested array of NCSI_CHANNEL_ATTR attributes + * @NCSI_PKG_ATTR_MAX: highest attribute number + */ +enum ncsi_nl_pkg_attrs { + NCSI_PKG_ATTR_UNSPEC, + NCSI_PKG_ATTR, + NCSI_PKG_ATTR_ID, + NCSI_PKG_ATTR_FORCED, + NCSI_PKG_ATTR_CHANNEL_LIST, + + __NCSI_PKG_ATTR_AFTER_LAST, + NCSI_PKG_ATTR_MAX = __NCSI_PKG_ATTR_AFTER_LAST - 1 +}; + +/** + * enum ncsi_nl_channel_attrs - NCSI netlink channel-specific attributes + * + * @NCSI_CHANNEL_ATTR_UNSPEC: unspecified attributes to catch errors + * @NCSI_CHANNEL_ATTR: nested array of channel attributes + * @NCSI_CHANNEL_ATTR_ID: channel ID + * @NCSI_CHANNEL_ATTR_VERSION_MAJOR: channel major version number + * @NCSI_CHANNEL_ATTR_VERSION_MINOR: channel minor version number + * @NCSI_CHANNEL_ATTR_VERSION_STR: channel version string + * @NCSI_CHANNEL_ATTR_LINK_STATE: channel link state flags + * @NCSI_CHANNEL_ATTR_ACTIVE: channels with this flag are in + * NCSI_CHANNEL_ACTIVE state + * @NCSI_CHANNEL_ATTR_FORCED: flag signifying a channel has been set as + * preferred + * @NCSI_CHANNEL_ATTR_VLAN_LIST: nested array of NCSI_CHANNEL_ATTR_VLAN_IDs + * @NCSI_CHANNEL_ATTR_VLAN_ID: VLAN ID being filtered on this channel + * @NCSI_CHANNEL_ATTR_MAX: highest attribute number + */ +enum ncsi_nl_channel_attrs { + NCSI_CHANNEL_ATTR_UNSPEC, + NCSI_CHANNEL_ATTR, + NCSI_CHANNEL_ATTR_ID, + NCSI_CHANNEL_ATTR_VERSION_MAJOR, + NCSI_CHANNEL_ATTR_VERSION_MINOR, + NCSI_CHANNEL_ATTR_VERSION_STR, + NCSI_CHANNEL_ATTR_LINK_STATE, + NCSI_CHANNEL_ATTR_ACTIVE, + NCSI_CHANNEL_ATTR_FORCED, + NCSI_CHANNEL_ATTR_VLAN_LIST, + NCSI_CHANNEL_ATTR_VLAN_ID, + + __NCSI_CHANNEL_ATTR_AFTER_LAST, + NCSI_CHANNEL_ATTR_MAX = __NCSI_CHANNEL_ATTR_AFTER_LAST - 1 +}; + +#endif /* __UAPI_NCSI_NETLINK_H__ */ diff --git a/net/ncsi/Makefile b/net/ncsi/Makefile index dd12b564f2e7..436ef68331f2 100644 --- a/net/ncsi/Makefile +++ b/net/ncsi/Makefile @@ -1,4 +1,4 @@ # # Makefile for NCSI API # -obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-rsp.o ncsi-aen.o ncsi-manage.o +obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-rsp.o ncsi-aen.o ncsi-manage.o ncsi-netlink.o diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index d30f7bd741d0..8da84312cd3b 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -276,6 +276,8 @@ struct ncsi_dev_priv { unsigned int package_num; /* Number of packages */ struct list_head packages; /* List of packages */ struct ncsi_channel *hot_channel; /* Channel was ever active */ + struct ncsi_package *force_package; /* Force a specific package */ + struct ncsi_channel *force_channel; /* Force a specific channel */ struct ncsi_request requests[256]; /* Request table */ unsigned int request_id; /* Last used request ID */ #define NCSI_REQ_START_IDX 1 @@ -318,6 +320,7 @@ extern spinlock_t ncsi_dev_lock; list_for_each_entry_rcu(nc, &np->channels, node) /* Resources */ +u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index); int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data); int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data); int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index c989211bbabc..c3695ba0cf94 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -23,6 +22,7 @@ #include "internal.h" #include "ncsi-pkt.h" +#include "ncsi-netlink.h" LIST_HEAD(ncsi_dev_list); DEFINE_SPINLOCK(ncsi_dev_lock); @@ -38,7 +38,7 @@ static inline int ncsi_filter_size(int table) return sizes[table]; } -static u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index) +u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index) { struct ncsi_channel_filter *ncf; int size; @@ -965,20 +965,37 @@ error: static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) { - struct ncsi_package *np; - struct ncsi_channel *nc, *found, *hot_nc; + struct ncsi_package *np, *force_package; + struct ncsi_channel *nc, *found, *hot_nc, *force_channel; struct ncsi_channel_mode *ncm; unsigned long flags; spin_lock_irqsave(&ndp->lock, flags); hot_nc = ndp->hot_channel; + force_channel = ndp->force_channel; + force_package = ndp->force_package; spin_unlock_irqrestore(&ndp->lock, flags); + /* Force a specific channel whether or not it has link if we have been + * configured to do so + */ + if (force_package && force_channel) { + found = force_channel; + ncm = &found->modes[NCSI_MODE_LINK]; + if (!(ncm->data[2] & 0x1)) + netdev_info(ndp->ndev.dev, + "NCSI: Channel %u forced, but it is link down\n", + found->id); + goto out; + } + /* The search is done once an inactive channel with up * link is found. */ found = NULL; NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (ndp->force_package && np != ndp->force_package) + continue; NCSI_FOR_EACH_CHANNEL(np, nc) { spin_lock_irqsave(&nc->lock, flags); @@ -1594,6 +1611,9 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev, ndp->ptype.dev = dev; dev_add_pack(&ndp->ptype); + /* Set up generic netlink interface */ + ncsi_init_netlink(dev); + return nd; } EXPORT_SYMBOL_GPL(ncsi_register_dev); @@ -1673,6 +1693,8 @@ void ncsi_unregister_dev(struct ncsi_dev *nd) #endif spin_unlock_irqrestore(&ncsi_dev_lock, flags); + ncsi_unregister_netlink(nd->dev); + kfree(ndp); } EXPORT_SYMBOL_GPL(ncsi_unregister_dev); diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c new file mode 100644 index 000000000000..d4201665a580 --- /dev/null +++ b/net/ncsi/ncsi-netlink.c @@ -0,0 +1,421 @@ +/* + * Copyright Samuel Mendoza-Jonas, IBM Corporation 2018. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "ncsi-netlink.h" + +static struct genl_family ncsi_genl_family; + +static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = { + [NCSI_ATTR_IFINDEX] = { .type = NLA_U32 }, + [NCSI_ATTR_PACKAGE_LIST] = { .type = NLA_NESTED }, + [NCSI_ATTR_PACKAGE_ID] = { .type = NLA_U32 }, + [NCSI_ATTR_CHANNEL_ID] = { .type = NLA_U32 }, +}; + +static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex) +{ + struct ncsi_dev_priv *ndp; + struct net_device *dev; + struct ncsi_dev *nd; + struct ncsi_dev; + + if (!net) + return NULL; + + dev = dev_get_by_index(net, ifindex); + if (!dev) { + pr_err("NCSI netlink: No device for ifindex %u\n", ifindex); + return NULL; + } + + nd = ncsi_find_dev(dev); + ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; + + dev_put(dev); + return ndp; +} + +static int ncsi_write_channel_info(struct sk_buff *skb, + struct ncsi_dev_priv *ndp, + struct ncsi_channel *nc) +{ + struct nlattr *vid_nest; + struct ncsi_channel_filter *ncf; + struct ncsi_channel_mode *m; + u32 *data; + int i; + + nla_put_u32(skb, NCSI_CHANNEL_ATTR_ID, nc->id); + m = &nc->modes[NCSI_MODE_LINK]; + nla_put_u32(skb, NCSI_CHANNEL_ATTR_LINK_STATE, m->data[2]); + if (nc->state == NCSI_CHANNEL_ACTIVE) + nla_put_flag(skb, NCSI_CHANNEL_ATTR_ACTIVE); + if (ndp->force_channel == nc) + nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED); + + nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.version); + nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MINOR, nc->version.alpha2); + nla_put_string(skb, NCSI_CHANNEL_ATTR_VERSION_STR, nc->version.fw_name); + + vid_nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR_VLAN_LIST); + if (!vid_nest) + return -ENOMEM; + ncf = nc->filters[NCSI_FILTER_VLAN]; + i = -1; + if (ncf) { + while ((i = find_next_bit((void *)&ncf->bitmap, ncf->total, + i + 1)) < ncf->total) { + data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, i); + /* Uninitialised channels will have 'zero' vlan ids */ + if (!data || !*data) + continue; + nla_put_u16(skb, NCSI_CHANNEL_ATTR_VLAN_ID, + *(u16 *)data); + } + } + nla_nest_end(skb, vid_nest); + + return 0; +} + +static int ncsi_write_package_info(struct sk_buff *skb, + struct ncsi_dev_priv *ndp, unsigned int id) +{ + struct nlattr *pnest, *cnest, *nest; + struct ncsi_package *np; + struct ncsi_channel *nc; + bool found; + int rc; + + if (id > ndp->package_num) { + netdev_info(ndp->ndev.dev, "NCSI: No package with id %u\n", id); + return -ENODEV; + } + + found = false; + NCSI_FOR_EACH_PACKAGE(ndp, np) { + if (np->id != id) + continue; + pnest = nla_nest_start(skb, NCSI_PKG_ATTR); + if (!pnest) + return -ENOMEM; + nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id); + if (ndp->force_package == np) + nla_put_flag(skb, NCSI_PKG_ATTR_FORCED); + cnest = nla_nest_start(skb, NCSI_PKG_ATTR_CHANNEL_LIST); + if (!cnest) { + nla_nest_cancel(skb, pnest); + return -ENOMEM; + } + NCSI_FOR_EACH_CHANNEL(np, nc) { + nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR); + if (!nest) { + nla_nest_cancel(skb, cnest); + nla_nest_cancel(skb, pnest); + return -ENOMEM; + } + rc = ncsi_write_channel_info(skb, ndp, nc); + if (rc) { + nla_nest_cancel(skb, nest); + nla_nest_cancel(skb, cnest); + nla_nest_cancel(skb, pnest); + return rc; + } + nla_nest_end(skb, nest); + } + nla_nest_end(skb, cnest); + nla_nest_end(skb, pnest); + found = true; + } + + if (!found) + return -ENODEV; + + return 0; +} + +static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info) +{ + struct ncsi_dev_priv *ndp; + unsigned int package_id; + struct sk_buff *skb; + struct nlattr *attr; + void *hdr; + int rc; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) + return -EINVAL; + + ndp = ndp_from_ifindex(genl_info_net(info), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &ncsi_genl_family, 0, NCSI_CMD_PKG_INFO); + if (!hdr) { + kfree(skb); + return -EMSGSIZE; + } + + package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); + + attr = nla_nest_start(skb, NCSI_ATTR_PACKAGE_LIST); + rc = ncsi_write_package_info(skb, ndp, package_id); + + if (rc) { + nla_nest_cancel(skb, attr); + goto err; + } + + nla_nest_end(skb, attr); + + genlmsg_end(skb, hdr); + return genlmsg_reply(skb, info); + +err: + genlmsg_cancel(skb, hdr); + kfree(skb); + return rc; +} + +static int ncsi_pkg_info_all_nl(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *attrs[NCSI_ATTR_MAX]; + struct ncsi_package *np, *package; + struct ncsi_dev_priv *ndp; + unsigned int package_id; + struct nlattr *attr; + void *hdr; + int rc; + + rc = genlmsg_parse(cb->nlh, &ncsi_genl_family, attrs, NCSI_ATTR_MAX, + ncsi_genl_policy, NULL); + if (rc) + return rc; + + if (!attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(skb->sk)), + nla_get_u32(attrs[NCSI_ATTR_IFINDEX])); + + if (!ndp) + return -ENODEV; + + package_id = cb->args[0]; + package = NULL; + NCSI_FOR_EACH_PACKAGE(ndp, np) + if (np->id == package_id) + package = np; + + if (!package) + return 0; /* done */ + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &ncsi_genl_family, 0, NCSI_CMD_PKG_INFO); + if (!hdr) { + rc = -EMSGSIZE; + goto err; + } + + attr = nla_nest_start(skb, NCSI_ATTR_PACKAGE_LIST); + rc = ncsi_write_package_info(skb, ndp, package->id); + if (rc) { + nla_nest_cancel(skb, attr); + goto err; + } + + nla_nest_end(skb, attr); + genlmsg_end(skb, hdr); + + cb->args[0] = package_id + 1; + + return skb->len; +err: + genlmsg_cancel(skb, hdr); + return rc; +} + +static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info) +{ + struct ncsi_package *np, *package; + struct ncsi_channel *nc, *channel; + u32 package_id, channel_id; + struct ncsi_dev_priv *ndp; + unsigned long flags; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); + package = NULL; + + spin_lock_irqsave(&ndp->lock, flags); + + NCSI_FOR_EACH_PACKAGE(ndp, np) + if (np->id == package_id) + package = np; + if (!package) { + /* The user has set a package that does not exist */ + return -ERANGE; + } + + channel = NULL; + if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) { + /* Allow any channel */ + channel_id = NCSI_RESERVED_CHANNEL; + } else { + channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); + NCSI_FOR_EACH_CHANNEL(package, nc) + if (nc->id == channel_id) + channel = nc; + } + + if (channel_id != NCSI_RESERVED_CHANNEL && !channel) { + /* The user has set a channel that does not exist on this + * package + */ + netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n", + channel_id); + return -ERANGE; + } + + ndp->force_package = package; + ndp->force_channel = channel; + spin_unlock_irqrestore(&ndp->lock, flags); + + netdev_info(ndp->ndev.dev, "Set package 0x%x, channel 0x%x%s as preferred\n", + package_id, channel_id, + channel_id == NCSI_RESERVED_CHANNEL ? " (any)" : ""); + + /* Bounce the NCSI channel to set changes */ + ncsi_stop_dev(&ndp->ndev); + ncsi_start_dev(&ndp->ndev); + + return 0; +} + +static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info) +{ + struct ncsi_dev_priv *ndp; + unsigned long flags; + + if (!info || !info->attrs) + return -EINVAL; + + if (!info->attrs[NCSI_ATTR_IFINDEX]) + return -EINVAL; + + ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) + return -ENODEV; + + /* Clear any override */ + spin_lock_irqsave(&ndp->lock, flags); + ndp->force_package = NULL; + ndp->force_channel = NULL; + spin_unlock_irqrestore(&ndp->lock, flags); + netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n"); + + /* Bounce the NCSI channel to set changes */ + ncsi_stop_dev(&ndp->ndev); + ncsi_start_dev(&ndp->ndev); + + return 0; +} + +static const struct genl_ops ncsi_ops[] = { + { + .cmd = NCSI_CMD_PKG_INFO, + .policy = ncsi_genl_policy, + .doit = ncsi_pkg_info_nl, + .dumpit = ncsi_pkg_info_all_nl, + .flags = 0, + }, + { + .cmd = NCSI_CMD_SET_INTERFACE, + .policy = ncsi_genl_policy, + .doit = ncsi_set_interface_nl, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = NCSI_CMD_CLEAR_INTERFACE, + .policy = ncsi_genl_policy, + .doit = ncsi_clear_interface_nl, + .flags = GENL_ADMIN_PERM, + }, +}; + +static struct genl_family ncsi_genl_family __ro_after_init = { + .name = "NCSI", + .version = 0, + .maxattr = NCSI_ATTR_MAX, + .module = THIS_MODULE, + .ops = ncsi_ops, + .n_ops = ARRAY_SIZE(ncsi_ops), +}; + +int ncsi_init_netlink(struct net_device *dev) +{ + int rc; + + rc = genl_register_family(&ncsi_genl_family); + if (rc) + netdev_err(dev, "ncsi: failed to register netlink family\n"); + + return rc; +} + +int ncsi_unregister_netlink(struct net_device *dev) +{ + int rc; + + rc = genl_unregister_family(&ncsi_genl_family); + if (rc) + netdev_err(dev, "ncsi: failed to unregister netlink family\n"); + + return rc; +} diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h new file mode 100644 index 000000000000..91a5c256f8c4 --- /dev/null +++ b/net/ncsi/ncsi-netlink.h @@ -0,0 +1,20 @@ +/* + * Copyright Samuel Mendoza-Jonas, IBM Corporation 2018. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __NCSI_NETLINK_H__ +#define __NCSI_NETLINK_H__ + +#include + +#include "internal.h" + +int ncsi_init_netlink(struct net_device *dev); +int ncsi_unregister_netlink(struct net_device *dev); + +#endif /* __NCSI_NETLINK_H__ */ -- cgit v1.2.3-70-g09d2 From ec012f3b85159102662264bb5c3436c849d2b1b9 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 5 Mar 2018 14:30:41 +0300 Subject: net: Convert broute_net_ops, frame_filter_net_ops and frame_nat_net_ops These pernet_operations use ebt_register_table() and ebt_unregister_table() to act on the tables, which are used as argument in ebt_do_table(), called from ebtables hooks. Since there are no net-related bridge packets in-flight, when the init and exit methods are called, these pernet_operations are safe to be executed in parallel with any other. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/bridge/netfilter/ebtable_broute.c | 1 + net/bridge/netfilter/ebtable_filter.c | 1 + net/bridge/netfilter/ebtable_nat.c | 1 + 3 files changed, 3 insertions(+) (limited to 'net') diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 276b60262981..f070b5e5b9dd 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -77,6 +77,7 @@ static void __net_exit broute_net_exit(struct net *net) static struct pernet_operations broute_net_ops = { .init = broute_net_init, .exit = broute_net_exit, + .async = true, }; static int __init ebtable_broute_init(void) diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index c41da5fac84f..4151afc8efcc 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -105,6 +105,7 @@ static void __net_exit frame_filter_net_exit(struct net *net) static struct pernet_operations frame_filter_net_ops = { .init = frame_filter_net_init, .exit = frame_filter_net_exit, + .async = true, }; static int __init ebtable_filter_init(void) diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 08df7406ecb3..b8da2dfe2ec5 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -105,6 +105,7 @@ static void __net_exit frame_nat_net_exit(struct net *net) static struct pernet_operations frame_nat_net_ops = { .init = frame_nat_net_init, .exit = frame_nat_net_exit, + .async = true, }; static int __init ebtable_nat_init(void) -- cgit v1.2.3-70-g09d2 From 3822034569ac8ae39556e50fa165dc7d4276f452 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 5 Mar 2018 14:30:50 +0300 Subject: net: Convert log pernet_operations These pernet_operations use nf_log_set() and nf_log_unset() in their methods: nf_log_bridge_net_ops nf_log_arp_net_ops nf_log_ipv4_net_ops nf_log_ipv6_net_ops nf_log_netdev_net_ops Nobody can send such a packet to a net before it's became registered, nobody can send a packet after all netdevices are unregistered. So, these pernet_operations are able to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/bridge/netfilter/nf_log_bridge.c | 1 + net/ipv4/netfilter/nf_log_arp.c | 1 + net/ipv4/netfilter/nf_log_ipv4.c | 1 + net/ipv6/netfilter/nf_log_ipv6.c | 1 + net/netfilter/nf_log_netdev.c | 1 + 5 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c index bd2b3c78f59b..91bfc2ac055a 100644 --- a/net/bridge/netfilter/nf_log_bridge.c +++ b/net/bridge/netfilter/nf_log_bridge.c @@ -48,6 +48,7 @@ static void __net_exit nf_log_bridge_net_exit(struct net *net) static struct pernet_operations nf_log_bridge_net_ops = { .init = nf_log_bridge_net_init, .exit = nf_log_bridge_net_exit, + .async = true, }; static int __init nf_log_bridge_init(void) diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index df5c2a2061a4..162293469ac2 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -122,6 +122,7 @@ static void __net_exit nf_log_arp_net_exit(struct net *net) static struct pernet_operations nf_log_arp_net_ops = { .init = nf_log_arp_net_init, .exit = nf_log_arp_net_exit, + .async = true, }; static int __init nf_log_arp_init(void) diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 4388de0e5380..7a06de140f3c 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -358,6 +358,7 @@ static void __net_exit nf_log_ipv4_net_exit(struct net *net) static struct pernet_operations nf_log_ipv4_net_ops = { .init = nf_log_ipv4_net_init, .exit = nf_log_ipv4_net_exit, + .async = true, }; static int __init nf_log_ipv4_init(void) diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index b397a8fe88b9..0220e584589c 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -390,6 +390,7 @@ static void __net_exit nf_log_ipv6_net_exit(struct net *net) static struct pernet_operations nf_log_ipv6_net_ops = { .init = nf_log_ipv6_net_init, .exit = nf_log_ipv6_net_exit, + .async = true, }; static int __init nf_log_ipv6_init(void) diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c index 350eb147754d..254c2c6bde48 100644 --- a/net/netfilter/nf_log_netdev.c +++ b/net/netfilter/nf_log_netdev.c @@ -47,6 +47,7 @@ static void __net_exit nf_log_netdev_net_exit(struct net *net) static struct pernet_operations nf_log_netdev_net_ops = { .init = nf_log_netdev_net_init, .exit = nf_log_netdev_net_exit, + .async = true, }; static int __init nf_log_netdev_init(void) -- cgit v1.2.3-70-g09d2 From c60a246cd366b5da5a12cc972a25e41bc536706f Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 5 Mar 2018 14:31:00 +0300 Subject: net: Convert arp_tables_net_ops and ip6_tables_net_ops These pernet_operations call xt_proto_init() and xt_proto_fini(), which just register and unregister /proc entries. They are safe to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/netfilter/arp_tables.c | 1 + net/ipv6/netfilter/ip6_tables.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e3e420f3ba7b..c36ffce3c812 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1635,6 +1635,7 @@ static void __net_exit arp_tables_net_exit(struct net *net) static struct pernet_operations arp_tables_net_ops = { .init = arp_tables_net_init, .exit = arp_tables_net_exit, + .async = true, }; static int __init arp_tables_init(void) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 62358b93bbac..4de8ac1e5af4 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1928,6 +1928,7 @@ static void __net_exit ip6_tables_net_exit(struct net *net) static struct pernet_operations ip6_tables_net_ops = { .init = ip6_tables_net_init, .exit = ip6_tables_net_exit, + .async = true, }; static int __init ip6_tables_init(void) -- cgit v1.2.3-70-g09d2 From d217472410a033cdef5b8fba4393b466ee3c3bbc Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 5 Mar 2018 14:31:10 +0300 Subject: net: Convert caif_net_ops Init method just allocates memory for new cfg, and assigns net_generic(net, caif_net_id). Despite there is synchronize_rcu() on error path in cfcnfg_create(), in real this function does not use global lists, so it looks like this synchronize_rcu() is some legacy inheritance. Exit method removes caif devices under rtnl_lock(). There could be a problem, if someone from foreign net pernet_operations dereference caif_net_id of this net. It's dereferenced in get_cfcnfg() and caif_device_list(). get_cfcnfg() is used from netdevice notifiers, where they are called under rtnl_lock(). The notifiers can't be called from foreign nets pernet_operations. Also, it's used from caif_disconnect_client() and from caif_connect_client(). The both of the functions work with caif socket, and there is the only possibility to have a socket, when the net is dead. This may happen only of the socket was created as kern using sk_alloc(). Grep by PF_CAIF shows we do not create kern caif sockets, so get_cfcnfg() is safe. caif_device_list() is used in netdevice notifiers and exit method under rtnl lock. Also, from caif_get() used in the netdev notifiers and in caif_flow_cb(). The last item is skb destructor. Since there are no kernel caif sockets nobody can send net a packet in parallel with init/exit, so this is also safe. So, these pernet_operations are safe to be async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/caif/caif_dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index e0adcd123f48..7a78268cc572 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -544,6 +544,7 @@ static struct pernet_operations caif_net_ops = { .exit = caif_exit_net, .id = &caif_net_id, .size = sizeof(struct caif_net), + .async = true, }; /* Initialize Caif devices list */ -- cgit v1.2.3-70-g09d2 From 111da7adc1276d44d20c3973f4b39d62f83df056 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 5 Mar 2018 14:31:19 +0300 Subject: net: Convert cangw_pernet_ops These pernet_operations have a deal with cgw_list, and the rest of accesses are made under rtnl_lock(). The only exception is cgw_dump_jobs(), which is accessed under rcu_read_lock(). cgw_dump_jobs() is called on netlink request, and it does not seem, foreign pernet_operations want to send a net such the messages. So, we mark them as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/can/gw.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/can/gw.c b/net/can/gw.c index 398dd0395ad9..08e97668d5cf 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1010,6 +1010,7 @@ static void __net_exit cangw_pernet_exit(struct net *net) static struct pernet_operations cangw_pernet_ops = { .init = cangw_pernet_init, .exit = cangw_pernet_exit, + .async = true, }; static __init int cgw_module_init(void) -- cgit v1.2.3-70-g09d2 From 5368bd72cd7aacd0e3f80f61e9b2a096fc81cc10 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 5 Mar 2018 14:31:28 +0300 Subject: net: Convert dccp_v4_ops These pernet_operations create and destroy net::dccp::v4_ctl_sk. It looks like another pernet_operations don't want to send dccp packets to dying or creating net. Batch method similar to ipv4/ipv6 sockets and it has to be safe to be executed in parallel with anything else. So, we mark them as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index e65fcb45c3f6..13ad28ab1e79 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1031,6 +1031,7 @@ static struct pernet_operations dccp_v4_ops = { .init = dccp_v4_init_net, .exit = dccp_v4_exit_net, .exit_batch = dccp_v4_exit_batch, + .async = true, }; static int __init dccp_v4_init(void) -- cgit v1.2.3-70-g09d2 From 16b0c0c4d9a799979b0d3f387821c93bcc301f79 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 5 Mar 2018 14:31:37 +0300 Subject: net: Convert dccp_v6_ops These pernet_operations looks similar to dccp_v4_ops, and they are also safe to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/dccp/ipv6.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 5df7857fc0f3..2f48c020f8c3 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1116,6 +1116,7 @@ static struct pernet_operations dccp_v6_ops = { .init = dccp_v6_init_net, .exit = dccp_v6_exit_net, .exit_batch = dccp_v6_exit_batch, + .async = true, }; static int __init dccp_v6_init(void) -- cgit v1.2.3-70-g09d2 From 6c6c566e6dd80bec6f5bbb9c36e397ed9b109cdb Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 5 Mar 2018 14:31:47 +0300 Subject: net: Convert fou_net_ops These pernet_operations initialize and destroy pernet net_generic(net, fou_net_id) list. The rest of net_generic(net, fou_net_id) accesses may happen after netlink message, and in-tree pernet_operations do not send FOU_GENL_NAME messages. So, these pernet_operations are safe to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/fou.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 1540db65241a..d3e1a9af478b 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -1081,6 +1081,7 @@ static struct pernet_operations fou_net_ops = { .exit = fou_exit_net, .id = &fou_net_id, .size = sizeof(struct fou_net), + .async = true, }; static int __init fou_init(void) -- cgit v1.2.3-70-g09d2 From a5a179b6dff15ae0b89cb601b0a0242cff60b8e3 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 5 Mar 2018 14:31:55 +0300 Subject: net: Convert ip_set_net_ops These pernet_operations initialize and destroy net_generic(net, ip_set_net_id)-related data. Since ip_set is under CONFIG_IP_SET, it's easy to watch drivers, which depend on this config. All of them are in net/netfilter/ipset directory, except of net/netfilter/xt_set.c. There are no more drivers, which use ip_set, and all of the above don't register another pernet_operations. Also, there are is no indirect users, as header file include/linux/netfilter/ipset/ip_set.h does not define indirect users by something like this: #ifdef CONFIG_IP_SET extern func(void); #else static inline func(void); #endif So, there are no more pernet operations, dereferencing net_generic(net, ip_set_net_id). ip_set_net_ops are OK to be executed in parallel for several net, so we mark them as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/ipset/ip_set_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 975a85a48d39..2523ebe2b3cc 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -2094,7 +2094,8 @@ static struct pernet_operations ip_set_net_ops = { .init = ip_set_net_init, .exit = ip_set_net_exit, .id = &ip_set_net_id, - .size = sizeof(struct ip_set_net) + .size = sizeof(struct ip_set_net), + .async = true, }; static int __init -- cgit v1.2.3-70-g09d2 From 467d14b3073960645902ea0d18c1cbe645013638 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 5 Mar 2018 14:32:06 +0300 Subject: net: Convert nf_conntrack_net_ops These pernet_operations register and unregister sysctl and /proc entries. Exit batch method also waits till all per-net conntracks are dead. Thus, they are safe to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_standalone.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9123fdec5e14..3cdce391362e 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -705,6 +705,7 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) static struct pernet_operations nf_conntrack_net_ops = { .init = nf_conntrack_pernet_init, .exit_batch = nf_conntrack_pernet_exit, + .async = true, }; static int __init nf_conntrack_standalone_init(void) -- cgit v1.2.3-70-g09d2 From b04a3d098c4ca176849ee579880c63c052ce6776 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 5 Mar 2018 14:32:15 +0300 Subject: net: Convert ctnetlink_net_ops These pernet_operations register and unregister two conntrack notifiers, and they seem to be safe to be executed in parallel. General/not related to async pernet_operations JFI: ctnetlink_net_exit_batch() actions are grouped in batch, and this could look like there is synchronize_rcu() is forgotten. But there is synchronize_rcu() on module exit patch (in ctnetlink_exit()), so this batch may be reworked as simple .exit method. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index dd177ebee9aa..8884d302d33a 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3417,6 +3417,7 @@ static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list) static struct pernet_operations ctnetlink_net_ops = { .init = ctnetlink_net_init, .exit_batch = ctnetlink_net_exit_batch, + .async = true, }; static int __init ctnetlink_init(void) -- cgit v1.2.3-70-g09d2 From c29babb7fe26e1507e8c236abbfed6e944e42651 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 5 Mar 2018 14:32:23 +0300 Subject: net: Convert proto_gre_net_ops These pernet_operations register and unregister sysctl. nf_conntrack_l4proto_gre4->init_net is simple memory initializer. Also, exit method removes gre keymap_list, which is per-net. This looks safe to be executed in parallel with other pernet_operations. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_proto_gre.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index d049ea5a3770..9bcd72fe91f9 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -406,6 +406,7 @@ static struct pernet_operations proto_gre_net_ops = { .exit = proto_gre_net_exit, .id = &proto_gre_net_id, .size = sizeof(struct netns_proto_gre), + .async = true, }; static int __init nf_ct_proto_gre_init(void) -- cgit v1.2.3-70-g09d2 From 4c1342d967cb556ea1c0f34271b125deeb25f0f8 Mon Sep 17 00:00:00 2001 From: Jonathan Neuschäfer Date: Sun, 4 Mar 2018 03:29:52 +0100 Subject: net: core: dst_cache_set_ip6: Rename 'addr' parameter to 'saddr' for consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The other dst_cache_{get,set}_ip{4,6} functions, and the doc comment for dst_cache_set_ip6 use 'saddr' for their source address parameter. Rename the parameter to increase consistency. This fixes the following kernel-doc warnings: ./include/net/dst_cache.h:58: warning: Function parameter or member 'addr' not described in 'dst_cache_set_ip6' ./include/net/dst_cache.h:58: warning: Excess function parameter 'saddr' description in 'dst_cache_set_ip6' Fixes: 911362c70df5 ("net: add dst_cache support") Signed-off-by: Jonathan Neuschäfer Signed-off-by: David S. Miller --- include/net/dst_cache.h | 2 +- net/core/dst_cache.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h index 844906fbf8c9..67634675e919 100644 --- a/include/net/dst_cache.h +++ b/include/net/dst_cache.h @@ -54,7 +54,7 @@ void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, * local BH must be disabled. */ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, - const struct in6_addr *addr); + const struct in6_addr *saddr); /** * dst_cache_get_ip6 - perform cache lookup and fetch ipv6 source address diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 554d36449231..64cef977484a 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -107,7 +107,7 @@ EXPORT_SYMBOL_GPL(dst_cache_set_ip4); #if IS_ENABLED(CONFIG_IPV6) void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, - const struct in6_addr *addr) + const struct in6_addr *saddr) { struct dst_cache_pcpu *idst; @@ -117,7 +117,7 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, rt6_get_cookie((struct rt6_info *)dst)); - idst->in6_saddr = *addr; + idst->in6_saddr = *saddr; } EXPORT_SYMBOL_GPL(dst_cache_set_ip6); -- cgit v1.2.3-70-g09d2 From ae0662f84b105776734cb089703a7bf834bac195 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Sat, 20 Jan 2018 04:27:58 +0800 Subject: netfilter: nf_tables: nf_tables_obj_lookup_byhandle() can be static Fixes: 3ecbfd65f50e ("netfilter: nf_tables: allocate handle and delete objects via handle") Signed-off-by: Fengguang Wu Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8b9fe30de0cd..8cc7fc970f0c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4328,9 +4328,9 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table, } EXPORT_SYMBOL_GPL(nf_tables_obj_lookup); -struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table, - const struct nlattr *nla, - u32 objtype, u8 genmask) +static struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table, + const struct nlattr *nla, + u32 objtype, u8 genmask) { struct nft_object *obj; @@ -4850,7 +4850,7 @@ struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table, } EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup); -struct nft_flowtable * +static struct nft_flowtable * nf_tables_flowtable_lookup_byhandle(const struct nft_table *table, const struct nlattr *nla, u8 genmask) { -- cgit v1.2.3-70-g09d2 From cceae76ef3a1181242e4f7b559a7bfc904a9855c Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 11 Feb 2018 19:17:20 +0900 Subject: netfilter: nfnetlink_acct: remove useless parameter parameter skb in nfnl_acct_overquota is not used anywhere. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink_acct.h | 3 +-- net/netfilter/nfnetlink_acct.c | 3 +-- net/netfilter/xt_nfacct.c | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/nfnetlink_acct.h b/include/linux/netfilter/nfnetlink_acct.h index b4d741195c28..beee8bffe49e 100644 --- a/include/linux/netfilter/nfnetlink_acct.h +++ b/include/linux/netfilter/nfnetlink_acct.h @@ -16,6 +16,5 @@ struct nf_acct; struct nf_acct *nfnl_acct_find_get(struct net *net, const char *filter_name); void nfnl_acct_put(struct nf_acct *acct); void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct); -int nfnl_acct_overquota(struct net *net, const struct sk_buff *skb, - struct nf_acct *nfacct); +int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct); #endif /* _NFNL_ACCT_H */ diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 88d427f9f9e6..b9505bcd3827 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -467,8 +467,7 @@ static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct) GFP_ATOMIC); } -int nfnl_acct_overquota(struct net *net, const struct sk_buff *skb, - struct nf_acct *nfacct) +int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct) { u64 now; u64 *quota; diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index c8674deed4eb..6b56f4170860 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -28,7 +28,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par) nfnl_acct_update(skb, info->nfacct); - overquota = nfnl_acct_overquota(xt_net(par), skb, info->nfacct); + overquota = nfnl_acct_overquota(xt_net(par), info->nfacct); return overquota == NFACCT_UNDERQUOTA ? false : true; } -- cgit v1.2.3-70-g09d2 From 580c7d9e4cc69802189b872ad2df0d704c649441 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 11 Feb 2018 22:57:29 +0900 Subject: netfilter: xt_cluster: get rid of xt_cluster_ipv6_is_multicast If use the ipv6_addr_is_multicast instead of xt_cluster_ipv6_is_multicast, then we can reduce code size. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_cluster.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c index 0068688995c8..dfbdbb2fc0ed 100644 --- a/net/netfilter/xt_cluster.c +++ b/net/netfilter/xt_cluster.c @@ -59,13 +59,6 @@ xt_cluster_hash(const struct nf_conn *ct, return reciprocal_scale(hash, info->total_nodes); } -static inline bool -xt_cluster_ipv6_is_multicast(const struct in6_addr *addr) -{ - __be32 st = addr->s6_addr32[0]; - return ((st & htonl(0xFF000000)) == htonl(0xFF000000)); -} - static inline bool xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family) { @@ -76,8 +69,7 @@ xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family) is_multicast = ipv4_is_multicast(ip_hdr(skb)->daddr); break; case NFPROTO_IPV6: - is_multicast = - xt_cluster_ipv6_is_multicast(&ipv6_hdr(skb)->daddr); + is_multicast = ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr); break; default: WARN_ON(1); -- cgit v1.2.3-70-g09d2 From 433029ecc62788296cacca50ceb24db90c17a4a2 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 11 Feb 2018 23:28:18 +0900 Subject: netfilter: nf_conntrack_broadcast: remove useless parameter parameter protoff in nf_conntrack_broadcast_help is not used anywhere. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 3 +-- net/netfilter/nf_conntrack_broadcast.c | 1 - net/netfilter/nf_conntrack_netbios_ns.c | 5 +++-- net/netfilter/nf_conntrack_snmp.c | 5 +++-- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index fc39bbaf107c..32c2a94a219d 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -132,8 +132,7 @@ void nf_conntrack_helper_pernet_fini(struct net *net); int nf_conntrack_helper_init(void); void nf_conntrack_helper_fini(void); -int nf_conntrack_broadcast_help(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, +int nf_conntrack_broadcast_help(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int timeout); diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index ecc3ab784633..a1086bdec242 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -20,7 +20,6 @@ #include int nf_conntrack_broadcast_help(struct sk_buff *skb, - unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int timeout) diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index 496ce173f0c1..a4a59dc7cf17 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -41,9 +41,10 @@ static struct nf_conntrack_expect_policy exp_policy = { }; static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo) + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) { - return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); + return nf_conntrack_broadcast_help(skb, ct, ctinfo, timeout); } static struct nf_conntrack_helper helper __read_mostly = { diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c index 87b95a2c270c..2d0f8e010821 100644 --- a/net/netfilter/nf_conntrack_snmp.c +++ b/net/netfilter/nf_conntrack_snmp.c @@ -36,11 +36,12 @@ int (*nf_nat_snmp_hook)(struct sk_buff *skb, EXPORT_SYMBOL_GPL(nf_nat_snmp_hook); static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo) + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) { typeof(nf_nat_snmp_hook) nf_nat_snmp; - nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); + nf_conntrack_broadcast_help(skb, ct, ctinfo, timeout); nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook); if (nf_nat_snmp && ct->status & IPS_NAT_MASK) -- cgit v1.2.3-70-g09d2 From 2db3fec507bd822ed81c031221b97701238949dc Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 13 Feb 2018 08:25:57 -0600 Subject: netfilter: ipt_ah: return boolean instead of integer Return statements in functions returning bool should use true/false instead of 1/0. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ipt_ah.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c index a787d07f6cb7..7c6c20eaf4db 100644 --- a/net/ipv4/netfilter/ipt_ah.c +++ b/net/ipv4/netfilter/ipt_ah.c @@ -47,7 +47,7 @@ static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par) */ pr_debug("Dropping evil AH tinygram.\n"); par->hotdrop = true; - return 0; + return false; } return spi_match(ahinfo->spis[0], ahinfo->spis[1], -- cgit v1.2.3-70-g09d2 From f31e5f1a891f989f107e8caa6b49dd4df0e12265 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 16 Feb 2018 18:04:56 +0800 Subject: netfilter: unlock xt_table earlier in __do_replace Now it's doing cleanup_entry for oldinfo under the xt_table lock, but it's not really necessary. After the replacement job is done in xt_replace_table, oldinfo is not used elsewhere any more, and it can be freed without xt_table lock safely. The important thing is that rtnl_lock is called in some xt_target destroy, which means rtnl_lock, a big lock is used in xt_table lock, a smaller one. It usually could be the reason why a dead lock may happen. Besides, all xt_target/match checkentry is called out of xt_table lock. It's better also to move all cleanup_entry calling out of xt_table lock, just as do_replace_finish does for ebtables. Signed-off-by: Xin Long Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 3 ++- net/ipv4/netfilter/ip_tables.c | 3 ++- net/ipv6/netfilter/ip6_tables.c | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c36ffce3c812..a0c7ce76879c 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -925,6 +925,8 @@ static int __do_replace(struct net *net, const char *name, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); + xt_table_unlock(t); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ @@ -939,7 +941,6 @@ static int __do_replace(struct net *net, const char *name, net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n"); } vfree(counters); - xt_table_unlock(t); return ret; put_module: diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d4f7584d2dbe..4f7153e25e0b 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1087,6 +1087,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); + xt_table_unlock(t); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ @@ -1100,7 +1102,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); } vfree(counters); - xt_table_unlock(t); return ret; put_module: diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 4de8ac1e5af4..6c44033decab 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1105,6 +1105,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); + xt_table_unlock(t); + get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ @@ -1118,7 +1120,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n"); } vfree(counters); - xt_table_unlock(t); return ret; put_module: -- cgit v1.2.3-70-g09d2 From 07a9da51b4b6aece8bc71e0b1b601fc4c3eb8b64 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2018 19:42:27 +0100 Subject: netfilter: x_tables: check standard verdicts in core Userspace must provide a valid verdict to the standard target. The verdict can be either a jump (signed int > 0), or a return code. Allowed return codes are either RETURN (pop from stack), NF_ACCEPT, DROP and QUEUE (latter is allowed for legacy reasons). Jump offsets (verdict > 0) are checked in more detail later on when loop-detection is performed. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 5 ----- net/ipv4/netfilter/ip_tables.c | 5 ----- net/ipv6/netfilter/ip6_tables.c | 5 ----- net/netfilter/x_tables.c | 49 ++++++++++++++++++++++++++++++++++++----- 4 files changed, 43 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index a0c7ce76879c..c9ffa884a4ee 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -334,11 +334,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo, t->verdict < 0) || visited) { unsigned int oldpos, size; - if ((strcmp(t->target.u.user.name, - XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) - return 0; - /* Return: backtrack through the last * big jump. */ diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4f7153e25e0b..c9b57a6bf96a 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -402,11 +402,6 @@ mark_source_chains(const struct xt_table_info *newinfo, t->verdict < 0) || visited) { unsigned int oldpos, size; - if ((strcmp(t->target.u.user.name, - XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) - return 0; - /* Return: backtrack through the last big jump. */ do { diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 6c44033decab..f46954221933 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -420,11 +420,6 @@ mark_source_chains(const struct xt_table_info *newinfo, t->verdict < 0) || visited) { unsigned int oldpos, size; - if ((strcmp(t->target.u.user.name, - XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) - return 0; - /* Return: backtrack through the last big jump. */ do { diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index d9deebe599ec..2e4d423e58e6 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -654,6 +654,31 @@ struct compat_xt_standard_target { compat_uint_t verdict; }; +static bool verdict_ok(int verdict) +{ + if (verdict > 0) + return true; + + if (verdict < 0) { + int v = -verdict - 1; + + if (verdict == XT_RETURN) + return true; + + switch (v) { + case NF_ACCEPT: return true; + case NF_DROP: return true; + case NF_QUEUE: return true; + default: + break; + } + + return false; + } + + return false; +} + int xt_compat_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset) @@ -675,9 +700,15 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems, if (target_offset + t->u.target_size > next_offset) return -EINVAL; - if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 && - COMPAT_XT_ALIGN(target_offset + sizeof(struct compat_xt_standard_target)) != next_offset) - return -EINVAL; + if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0) { + const struct compat_xt_standard_target *st = (const void *)t; + + if (COMPAT_XT_ALIGN(target_offset + sizeof(*st)) != next_offset) + return -EINVAL; + + if (!verdict_ok(st->verdict)) + return -EINVAL; + } /* compat_xt_entry match has less strict alignment requirements, * otherwise they are identical. In case of padding differences @@ -757,9 +788,15 @@ int xt_check_entry_offsets(const void *base, if (target_offset + t->u.target_size > next_offset) return -EINVAL; - if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 && - XT_ALIGN(target_offset + sizeof(struct xt_standard_target)) != next_offset) - return -EINVAL; + if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0) { + const struct xt_standard_target *st = (const void *)t; + + if (XT_ALIGN(target_offset + sizeof(*st)) != next_offset) + return -EINVAL; + + if (!verdict_ok(st->verdict)) + return -EINVAL; + } return xt_check_entry_match(elems, base + target_offset, __alignof__(struct xt_entry_match)); -- cgit v1.2.3-70-g09d2 From 472ebdcd15ebdb8ebe20474ef1ce09abcb241e7d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2018 19:42:28 +0100 Subject: netfilter: x_tables: check error target size too Check that userspace ERROR target (custom user-defined chains) match expected format, and the chain name is null terminated. This is irrelevant for kernel, but iptables itself relies on sane input when it dumps rules from kernel. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 2e4d423e58e6..f045bb4f7063 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -654,6 +654,11 @@ struct compat_xt_standard_target { compat_uint_t verdict; }; +struct compat_xt_error_target { + struct compat_xt_entry_target t; + char errorname[XT_FUNCTION_MAXNAMELEN]; +}; + static bool verdict_ok(int verdict) { if (verdict > 0) @@ -679,6 +684,12 @@ static bool verdict_ok(int verdict) return false; } +static bool error_tg_ok(unsigned int usersize, unsigned int kernsize, + const char *msg, unsigned int msglen) +{ + return usersize == kernsize && strnlen(msg, msglen) < msglen; +} + int xt_compat_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset) @@ -708,6 +719,12 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems, if (!verdict_ok(st->verdict)) return -EINVAL; + } else if (strcmp(t->u.user.name, XT_ERROR_TARGET) == 0) { + const struct compat_xt_error_target *et = (const void *)t; + + if (!error_tg_ok(t->u.target_size, sizeof(*et), + et->errorname, sizeof(et->errorname))) + return -EINVAL; } /* compat_xt_entry match has less strict alignment requirements, @@ -796,6 +813,12 @@ int xt_check_entry_offsets(const void *base, if (!verdict_ok(st->verdict)) return -EINVAL; + } else if (strcmp(t->u.user.name, XT_ERROR_TARGET) == 0) { + const struct xt_error_target *et = (const void *)t; + + if (!error_tg_ok(t->u.target_size, sizeof(*et), + et->errorname, sizeof(et->errorname))) + return -EINVAL; } return xt_check_entry_match(elems, base + target_offset, -- cgit v1.2.3-70-g09d2 From 1b293e30f759b03f246baae862bdf35e57b2c39e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2018 19:42:29 +0100 Subject: netfilter: x_tables: move hook entry checks into core Allow followup patch to change on location instead of three. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 ++ net/ipv4/netfilter/arp_tables.c | 13 +++---------- net/ipv4/netfilter/ip_tables.c | 13 +++---------- net/ipv6/netfilter/ip6_tables.c | 13 +++---------- net/netfilter/x_tables.c | 29 +++++++++++++++++++++++++++++ 5 files changed, 40 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 1313b35c3ab7..fa0c19c328f1 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -281,6 +281,8 @@ int xt_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); +int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks); + unsigned int *xt_alloc_entry_offsets(unsigned int size); bool xt_find_jump_offset(const unsigned int *offsets, unsigned int target, unsigned int size); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c9ffa884a4ee..be5821215ea0 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -555,16 +555,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, if (i != repl->num_entries) goto out_free; - /* Check hooks all assigned */ - for (i = 0; i < NF_ARP_NUMHOOKS; i++) { - /* Only hooks which are valid */ - if (!(repl->valid_hooks & (1 << i))) - continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) - goto out_free; - if (newinfo->underflow[i] == 0xFFFFFFFF) - goto out_free; - } + ret = xt_check_table_hooks(newinfo, repl->valid_hooks); + if (ret) + goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c9b57a6bf96a..29bda9484a33 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -702,16 +702,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (i != repl->num_entries) goto out_free; - /* Check hooks all assigned */ - for (i = 0; i < NF_INET_NUMHOOKS; i++) { - /* Only hooks which are valid */ - if (!(repl->valid_hooks & (1 << i))) - continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) - goto out_free; - if (newinfo->underflow[i] == 0xFFFFFFFF) - goto out_free; - } + ret = xt_check_table_hooks(newinfo, repl->valid_hooks); + if (ret) + goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index f46954221933..ba3776a4d305 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -720,16 +720,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (i != repl->num_entries) goto out_free; - /* Check hooks all assigned */ - for (i = 0; i < NF_INET_NUMHOOKS; i++) { - /* Only hooks which are valid */ - if (!(repl->valid_hooks & (1 << i))) - continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) - goto out_free; - if (newinfo->underflow[i] == 0xFFFFFFFF) - goto out_free; - } + ret = xt_check_table_hooks(newinfo, repl->valid_hooks); + if (ret) + goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index f045bb4f7063..5d8ba89a8da8 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -518,6 +518,35 @@ static int xt_check_entry_match(const char *match, const char *target, return 0; } +/** xt_check_table_hooks - check hook entry points are sane + * + * @info xt_table_info to check + * @valid_hooks - hook entry points that we can enter from + * + * Validates that the hook entry and underflows points are set up. + * + * Return: 0 on success, negative errno on failure. + */ +int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks) +{ + unsigned int i; + + BUILD_BUG_ON(ARRAY_SIZE(info->hook_entry) != ARRAY_SIZE(info->underflow)); + + for (i = 0; i < ARRAY_SIZE(info->hook_entry); i++) { + if (!(valid_hooks & (1 << i))) + continue; + + if (info->hook_entry[i] == 0xFFFFFFFF) + return -EINVAL; + if (info->underflow[i] == 0xFFFFFFFF) + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL(xt_check_table_hooks); + #ifdef CONFIG_COMPAT int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) { -- cgit v1.2.3-70-g09d2 From e816a2ce49e49e3906f614009c919334a0c6ba6a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2018 19:42:30 +0100 Subject: netfilter: x_tables: enforce unique and ascending entry points Harmless from kernel point of view, but iptables assumes that this is true when decoding a ruleset. iptables walks the dumped blob from kernel, and, for each entry that creates a new chain it prints out rule/chain information. Base chains (hook entry points) are thus only shown when they appear in the rule blob. One base chain that is referenced multiple times in hook blob is then only printed once. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 5d8ba89a8da8..4e6cbb38e616 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -529,10 +529,15 @@ static int xt_check_entry_match(const char *match, const char *target, */ int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks) { - unsigned int i; + const char *err = "unsorted underflow"; + unsigned int i, max_uflow, max_entry; + bool check_hooks = false; BUILD_BUG_ON(ARRAY_SIZE(info->hook_entry) != ARRAY_SIZE(info->underflow)); + max_entry = 0; + max_uflow = 0; + for (i = 0; i < ARRAY_SIZE(info->hook_entry); i++) { if (!(valid_hooks & (1 << i))) continue; @@ -541,9 +546,33 @@ int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_ho return -EINVAL; if (info->underflow[i] == 0xFFFFFFFF) return -EINVAL; + + if (check_hooks) { + if (max_uflow > info->underflow[i]) + goto error; + + if (max_uflow == info->underflow[i]) { + err = "duplicate underflow"; + goto error; + } + if (max_entry > info->hook_entry[i]) { + err = "unsorted entry"; + goto error; + } + if (max_entry == info->hook_entry[i]) { + err = "duplicate entry"; + goto error; + } + } + max_entry = info->hook_entry[i]; + max_uflow = info->underflow[i]; + check_hooks = true; } return 0; +error: + pr_err_ratelimited("%s at hook %d\n", err, i); + return -EINVAL; } EXPORT_SYMBOL(xt_check_table_hooks); -- cgit v1.2.3-70-g09d2 From 19926968ea86a286aa6fbea16ee3f2e7442f10f0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2018 19:42:31 +0100 Subject: netfilter: x_tables: cap allocations at 512 mbyte Arbitrary limit, however, this still allows huge rulesets (> 1 million rules). This helps with automated fuzzer as it prevents oom-killer invocation. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 4e6cbb38e616..dc68ac49614a 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -40,6 +40,7 @@ MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); #define XT_PCPU_BLOCK_SIZE 4096 +#define XT_MAX_TABLE_SIZE (512 * 1024 * 1024) struct compat_delta { unsigned int offset; /* offset in kernel */ @@ -1117,7 +1118,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) struct xt_table_info *info = NULL; size_t sz = sizeof(*info) + size; - if (sz < sizeof(*info)) + if (sz < sizeof(*info) || sz >= XT_MAX_TABLE_SIZE) return NULL; /* __GFP_NORETRY is not fully supported by kvmalloc but it should -- cgit v1.2.3-70-g09d2 From 9d5c12a7c08f67999772065afd50fb222072114e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2018 19:42:32 +0100 Subject: netfilter: x_tables: limit allocation requests for blob rule heads This is a very conservative limit (134217728 rules), but good enough to not trigger frequent oom from syzkaller. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index dc68ac49614a..01f8e122e74e 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -894,6 +894,9 @@ EXPORT_SYMBOL(xt_check_entry_offsets); */ unsigned int *xt_alloc_entry_offsets(unsigned int size) { + if (size > XT_MAX_TABLE_SIZE / sizeof(unsigned int)) + return NULL; + return kvmalloc_array(size, sizeof(unsigned int), GFP_KERNEL | __GFP_ZERO); } -- cgit v1.2.3-70-g09d2 From c84ca954ac9fa67a6ce27f91f01e4451c74fd8f6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2018 19:42:33 +0100 Subject: netfilter: x_tables: add counters allocation wrapper allows to have size checks in a single spot. This is supposed to reduce oom situations when fuzz-testing xtables. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 + net/ipv4/netfilter/arp_tables.c | 2 +- net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 2 +- net/netfilter/x_tables.c | 15 +++++++++++++++ 5 files changed, 19 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index fa0c19c328f1..0bd93c589a8c 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -301,6 +301,7 @@ int xt_data_to_user(void __user *dst, const void *src, void *xt_copy_counters_from_user(const void __user *user, unsigned int len, struct xt_counters_info *info, bool compat); +struct xt_counters *xt_counters_alloc(unsigned int counters); struct xt_table *xt_register_table(struct net *net, const struct xt_table *table, diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index be5821215ea0..82ba09b50fdb 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -883,7 +883,7 @@ static int __do_replace(struct net *net, const char *name, struct arpt_entry *iter; ret = 0; - counters = vzalloc(num_counters * sizeof(struct xt_counters)); + counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 29bda9484a33..4901ca6c3e09 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1045,7 +1045,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct ipt_entry *iter; ret = 0; - counters = vzalloc(num_counters * sizeof(struct xt_counters)); + counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index ba3776a4d305..e84cec49b60f 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1063,7 +1063,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct ip6t_entry *iter; ret = 0; - counters = vzalloc(num_counters * sizeof(struct xt_counters)); + counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 01f8e122e74e..82b1f8f52ac6 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1290,6 +1290,21 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) return 0; } +struct xt_counters *xt_counters_alloc(unsigned int counters) +{ + struct xt_counters *mem; + + if (counters == 0 || counters > INT_MAX / sizeof(*mem)) + return NULL; + + counters *= sizeof(*mem); + if (counters > XT_MAX_TABLE_SIZE) + return NULL; + + return vzalloc(counters); +} +EXPORT_SYMBOL(xt_counters_alloc); + struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, -- cgit v1.2.3-70-g09d2 From 9782a11efc072faaf91d4aa60e9d23553f918029 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2018 19:42:34 +0100 Subject: netfilter: compat: prepare xt_compat_init_offsets to return errors should have no impact, function still always returns 0. This patch is only to ease review. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 +- net/bridge/netfilter/ebtables.c | 10 ++++++++-- net/ipv4/netfilter/arp_tables.c | 10 +++++++--- net/ipv4/netfilter/ip_tables.c | 8 ++++++-- net/ipv6/netfilter/ip6_tables.c | 10 +++++++--- net/netfilter/x_tables.c | 4 +++- 6 files changed, 32 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 0bd93c589a8c..7bd896dc78df 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -510,7 +510,7 @@ void xt_compat_unlock(u_int8_t af); int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta); void xt_compat_flush_offsets(u_int8_t af); -void xt_compat_init_offsets(u_int8_t af, unsigned int number); +int xt_compat_init_offsets(u8 af, unsigned int number); int xt_compat_calc_jump(u_int8_t af, unsigned int offset); int xt_compat_match_offset(const struct xt_match *match); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 02c4b409d317..217aa79f7b2a 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1819,10 +1819,14 @@ static int compat_table_info(const struct ebt_table_info *info, { unsigned int size = info->entries_size; const void *entries = info->entries; + int ret; newinfo->entries_size = size; - xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries); + ret = xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries); + if (ret) + return ret; + return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info, entries, newinfo); } @@ -2245,7 +2249,9 @@ static int compat_do_replace(struct net *net, void __user *user, xt_compat_lock(NFPROTO_BRIDGE); - xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries); + ret = xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries); + if (ret < 0) + goto out_unlock; ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state); if (ret < 0) goto out_unlock; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 82ba09b50fdb..aaafdbd15ad3 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -769,7 +769,9 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; - xt_compat_init_offsets(NFPROTO_ARP, info->number); + ret = xt_compat_init_offsets(NFPROTO_ARP, info->number); + if (ret) + return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1156,7 +1158,7 @@ static int translate_compat_table(struct xt_table_info **pinfo, struct compat_arpt_entry *iter0; struct arpt_replace repl; unsigned int size; - int ret = 0; + int ret; info = *pinfo; entry0 = *pentry0; @@ -1165,7 +1167,9 @@ static int translate_compat_table(struct xt_table_info **pinfo, j = 0; xt_compat_lock(NFPROTO_ARP); - xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries); + ret = xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries); + if (ret) + goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 4901ca6c3e09..f9063513f9d1 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -933,7 +933,9 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; - xt_compat_init_offsets(AF_INET, info->number); + ret = xt_compat_init_offsets(AF_INET, info->number); + if (ret) + return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1407,7 +1409,9 @@ translate_compat_table(struct net *net, j = 0; xt_compat_lock(AF_INET); - xt_compat_init_offsets(AF_INET, compatr->num_entries); + ret = xt_compat_init_offsets(AF_INET, compatr->num_entries); + if (ret) + goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index e84cec49b60f..3c36a4c77f29 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -950,7 +950,9 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; - xt_compat_init_offsets(AF_INET6, info->number); + ret = xt_compat_init_offsets(AF_INET6, info->number); + if (ret) + return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1414,7 +1416,7 @@ translate_compat_table(struct net *net, struct compat_ip6t_entry *iter0; struct ip6t_replace repl; unsigned int size; - int ret = 0; + int ret; info = *pinfo; entry0 = *pentry0; @@ -1423,7 +1425,9 @@ translate_compat_table(struct net *net, j = 0; xt_compat_lock(AF_INET6); - xt_compat_init_offsets(AF_INET6, compatr->num_entries); + ret = xt_compat_init_offsets(AF_INET6, compatr->num_entries); + if (ret) + goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 82b1f8f52ac6..e878c85a9268 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -632,10 +632,12 @@ int xt_compat_calc_jump(u_int8_t af, unsigned int offset) } EXPORT_SYMBOL_GPL(xt_compat_calc_jump); -void xt_compat_init_offsets(u_int8_t af, unsigned int number) +int xt_compat_init_offsets(u8 af, unsigned int number) { xt[af].number = number; xt[af].cur = 0; + + return 0; } EXPORT_SYMBOL(xt_compat_init_offsets); -- cgit v1.2.3-70-g09d2 From 7d7d7e02111e9a4dc9d0658597f528f815d820fd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2018 19:42:35 +0100 Subject: netfilter: compat: reject huge allocation requests no need to bother even trying to allocating huge compat offset arrays, such ruleset is rejected later on anyway becaus we refuse to allocate overly large rule blobs. However, compat translation happens before blob allocation, so we should add a check there too. This is supposed to help with fuzzing by avoiding oom-killer. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index e878c85a9268..33724b08b8f0 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -582,14 +582,8 @@ int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) { struct xt_af *xp = &xt[af]; - if (!xp->compat_tab) { - if (!xp->number) - return -EINVAL; - xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number); - if (!xp->compat_tab) - return -ENOMEM; - xp->cur = 0; - } + if (WARN_ON(!xp->compat_tab)) + return -ENOMEM; if (xp->cur >= xp->number) return -EINVAL; @@ -634,6 +628,22 @@ EXPORT_SYMBOL_GPL(xt_compat_calc_jump); int xt_compat_init_offsets(u8 af, unsigned int number) { + size_t mem; + + if (!number || number > (INT_MAX / sizeof(struct compat_delta))) + return -EINVAL; + + if (WARN_ON(xt[af].compat_tab)) + return -EINVAL; + + mem = sizeof(struct compat_delta) * number; + if (mem > XT_MAX_TABLE_SIZE) + return -ENOMEM; + + xt[af].compat_tab = vmalloc(mem); + if (!xt[af].compat_tab) + return -ENOMEM; + xt[af].number = number; xt[af].cur = 0; -- cgit v1.2.3-70-g09d2 From 89370860686a54fc0642c7ae68213cc1fc6d8e04 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2018 19:42:36 +0100 Subject: netfilter: x_tables: make sure compat af mutex is held Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 33724b08b8f0..7521e8a72c06 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -582,6 +582,8 @@ int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) { struct xt_af *xp = &xt[af]; + WARN_ON(!mutex_is_locked(&xt[af].compat_mutex)); + if (WARN_ON(!xp->compat_tab)) return -ENOMEM; @@ -599,6 +601,8 @@ EXPORT_SYMBOL_GPL(xt_compat_add_offset); void xt_compat_flush_offsets(u_int8_t af) { + WARN_ON(!mutex_is_locked(&xt[af].compat_mutex)); + if (xt[af].compat_tab) { vfree(xt[af].compat_tab); xt[af].compat_tab = NULL; @@ -630,6 +634,8 @@ int xt_compat_init_offsets(u8 af, unsigned int number) { size_t mem; + WARN_ON(!mutex_is_locked(&xt[af].compat_mutex)); + if (!number || number > (INT_MAX / sizeof(struct compat_delta))) return -EINVAL; -- cgit v1.2.3-70-g09d2 From 0d7df906a0e78079a02108b06d32c3ef2238ad25 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2018 19:42:37 +0100 Subject: netfilter: x_tables: ensure last rule in base chain matches underflow/policy Harmless from kernel point of view, but again iptables assumes that this is true when decoding ruleset coming from kernel. If a (syzkaller generated) ruleset doesn't have the underflow/policy stored as the last rule in the base chain, then iptables will abort() because it doesn't find the chain policy. libiptc assumes that the policy is the last rule in the basechain, which is only true for iptables-generated rulesets. Unfortunately this needs code duplication -- the functions need the struct layout of the rule head, but that is different for ip/ip6/arptables. NB: pr_warn could be pr_debug but in case this break rulesets somehow its useful to know why blob was rejected. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 17 ++++++++++++++++- net/ipv4/netfilter/ip_tables.c | 17 ++++++++++++++++- net/ipv6/netfilter/ip6_tables.c | 17 ++++++++++++++++- 3 files changed, 48 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index aaafdbd15ad3..f366ff1cfc19 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -309,10 +309,13 @@ static int mark_source_chains(const struct xt_table_info *newinfo, for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct arpt_entry *e = entry0 + pos; + unsigned int last_pos, depth; if (!(valid_hooks & (1 << hook))) continue; + depth = 0; + last_pos = pos; /* Set initial back pointer. */ e->counters.pcnt = pos; @@ -343,6 +346,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo, pos = e->counters.pcnt; e->counters.pcnt = 0; + if (depth) + --depth; /* We're at the start. */ if (pos == oldpos) goto next; @@ -367,6 +372,9 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; + + if (entry0 + newpos != arpt_next_entry(e)) + ++depth; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -377,8 +385,15 @@ static int mark_source_chains(const struct xt_table_info *newinfo, e->counters.pcnt = pos; pos = newpos; } + if (depth == 0) + last_pos = pos; + } +next: + if (last_pos != newinfo->underflow[hook]) { + pr_err_ratelimited("last base chain position %u doesn't match underflow %u (hook %u)\n", + last_pos, newinfo->underflow[hook], hook); + return 0; } -next: ; } return 1; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f9063513f9d1..2362ca2c9e0c 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -378,10 +378,13 @@ mark_source_chains(const struct xt_table_info *newinfo, for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ipt_entry *e = entry0 + pos; + unsigned int last_pos, depth; if (!(valid_hooks & (1 << hook))) continue; + depth = 0; + last_pos = pos; /* Set initial back pointer. */ e->counters.pcnt = pos; @@ -410,6 +413,8 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = e->counters.pcnt; e->counters.pcnt = 0; + if (depth) + --depth; /* We're at the start. */ if (pos == oldpos) goto next; @@ -434,6 +439,9 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; + + if (entry0 + newpos != ipt_next_entry(e)) + ++depth; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -444,8 +452,15 @@ mark_source_chains(const struct xt_table_info *newinfo, e->counters.pcnt = pos; pos = newpos; } + if (depth == 0) + last_pos = pos; + } +next: + if (last_pos != newinfo->underflow[hook]) { + pr_err_ratelimited("last base chain position %u doesn't match underflow %u (hook %u)\n", + last_pos, newinfo->underflow[hook], hook); + return 0; } -next: ; } return 1; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 3c36a4c77f29..004508753abc 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -396,10 +396,13 @@ mark_source_chains(const struct xt_table_info *newinfo, for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ip6t_entry *e = entry0 + pos; + unsigned int last_pos, depth; if (!(valid_hooks & (1 << hook))) continue; + depth = 0; + last_pos = pos; /* Set initial back pointer. */ e->counters.pcnt = pos; @@ -428,6 +431,8 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = e->counters.pcnt; e->counters.pcnt = 0; + if (depth) + --depth; /* We're at the start. */ if (pos == oldpos) goto next; @@ -452,6 +457,9 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; + + if (entry0 + newpos != ip6t_next_entry(e)) + ++depth; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -462,8 +470,15 @@ mark_source_chains(const struct xt_table_info *newinfo, e->counters.pcnt = pos; pos = newpos; } + if (depth == 0) + last_pos = pos; + } +next: + if (last_pos != newinfo->underflow[hook]) { + pr_err_ratelimited("last base chain position %u doesn't match underflow %u (hook %u)\n", + last_pos, newinfo->underflow[hook], hook); + return 0; } -next: ; } return 1; } -- cgit v1.2.3-70-g09d2 From 3427b2ab63faccafe774ea997fc2da7faf690c5a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 1 Mar 2018 18:58:38 -0800 Subject: netfilter: make xt_rateest hash table per net As suggested by Eric, we need to make the xt_rateest hash table and its lock per netns to reduce lock contentions. Cc: Florian Westphal Cc: Eric Dumazet Cc: Pablo Neira Ayuso Signed-off-by: Cong Wang Reviewed-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/xt_rateest.h | 4 +- net/netfilter/xt_RATEEST.c | 91 +++++++++++++++++++++++++++----------- net/netfilter/xt_rateest.c | 10 ++--- 3 files changed, 72 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h index b1db13772554..832ab69efda5 100644 --- a/include/net/netfilter/xt_rateest.h +++ b/include/net/netfilter/xt_rateest.h @@ -21,7 +21,7 @@ struct xt_rateest { struct net_rate_estimator __rcu *rate_est; }; -struct xt_rateest *xt_rateest_lookup(const char *name); -void xt_rateest_put(struct xt_rateest *est); +struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name); +void xt_rateest_put(struct net *net, struct xt_rateest *est); #endif /* _XT_RATEEST_H */ diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 141c295191f6..dec843cadf46 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -14,15 +14,21 @@ #include #include #include +#include #include #include #include -static DEFINE_MUTEX(xt_rateest_mutex); - #define RATEEST_HSIZE 16 -static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly; + +struct xt_rateest_net { + struct mutex hash_lock; + struct hlist_head hash[RATEEST_HSIZE]; +}; + +static unsigned int xt_rateest_id; + static unsigned int jhash_rnd __read_mostly; static unsigned int xt_rateest_hash(const char *name) @@ -31,21 +37,23 @@ static unsigned int xt_rateest_hash(const char *name) (RATEEST_HSIZE - 1); } -static void xt_rateest_hash_insert(struct xt_rateest *est) +static void xt_rateest_hash_insert(struct xt_rateest_net *xn, + struct xt_rateest *est) { unsigned int h; h = xt_rateest_hash(est->name); - hlist_add_head(&est->list, &rateest_hash[h]); + hlist_add_head(&est->list, &xn->hash[h]); } -static struct xt_rateest *__xt_rateest_lookup(const char *name) +static struct xt_rateest *__xt_rateest_lookup(struct xt_rateest_net *xn, + const char *name) { struct xt_rateest *est; unsigned int h; h = xt_rateest_hash(name); - hlist_for_each_entry(est, &rateest_hash[h], list) { + hlist_for_each_entry(est, &xn->hash[h], list) { if (strcmp(est->name, name) == 0) { est->refcnt++; return est; @@ -55,20 +63,23 @@ static struct xt_rateest *__xt_rateest_lookup(const char *name) return NULL; } -struct xt_rateest *xt_rateest_lookup(const char *name) +struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name) { + struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); struct xt_rateest *est; - mutex_lock(&xt_rateest_mutex); - est = __xt_rateest_lookup(name); - mutex_unlock(&xt_rateest_mutex); + mutex_lock(&xn->hash_lock); + est = __xt_rateest_lookup(xn, name); + mutex_unlock(&xn->hash_lock); return est; } EXPORT_SYMBOL_GPL(xt_rateest_lookup); -void xt_rateest_put(struct xt_rateest *est) +void xt_rateest_put(struct net *net, struct xt_rateest *est) { - mutex_lock(&xt_rateest_mutex); + struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); + + mutex_lock(&xn->hash_lock); if (--est->refcnt == 0) { hlist_del(&est->list); gen_kill_estimator(&est->rate_est); @@ -78,7 +89,7 @@ void xt_rateest_put(struct xt_rateest *est) */ kfree_rcu(est, rcu); } - mutex_unlock(&xt_rateest_mutex); + mutex_unlock(&xn->hash_lock); } EXPORT_SYMBOL_GPL(xt_rateest_put); @@ -98,6 +109,7 @@ xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) { + struct xt_rateest_net *xn = net_generic(par->net, xt_rateest_id); struct xt_rateest_target_info *info = par->targinfo; struct xt_rateest *est; struct { @@ -108,10 +120,10 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); - mutex_lock(&xt_rateest_mutex); - est = __xt_rateest_lookup(info->name); + mutex_lock(&xn->hash_lock); + est = __xt_rateest_lookup(xn, info->name); if (est) { - mutex_unlock(&xt_rateest_mutex); + mutex_unlock(&xn->hash_lock); /* * If estimator parameters are specified, they must match the * existing estimator. @@ -119,7 +131,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) if ((!info->interval && !info->ewma_log) || (info->interval != est->params.interval || info->ewma_log != est->params.ewma_log)) { - xt_rateest_put(est); + xt_rateest_put(par->net, est); return -EINVAL; } info->est = est; @@ -148,14 +160,14 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) goto err2; info->est = est; - xt_rateest_hash_insert(est); - mutex_unlock(&xt_rateest_mutex); + xt_rateest_hash_insert(xn, est); + mutex_unlock(&xn->hash_lock); return 0; err2: kfree(est); err1: - mutex_unlock(&xt_rateest_mutex); + mutex_unlock(&xn->hash_lock); return ret; } @@ -163,7 +175,7 @@ static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) { struct xt_rateest_target_info *info = par->targinfo; - xt_rateest_put(info->est); + xt_rateest_put(par->net, info->est); } static struct xt_target xt_rateest_tg_reg __read_mostly = { @@ -178,19 +190,46 @@ static struct xt_target xt_rateest_tg_reg __read_mostly = { .me = THIS_MODULE, }; -static int __init xt_rateest_tg_init(void) +static __net_init int xt_rateest_net_init(struct net *net) +{ + struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); + int i; + + mutex_init(&xn->hash_lock); + for (i = 0; i < ARRAY_SIZE(xn->hash); i++) + INIT_HLIST_HEAD(&xn->hash[i]); + return 0; +} + +static void __net_exit xt_rateest_net_exit(struct net *net) { - unsigned int i; + struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); + int i; + + for (i = 0; i < ARRAY_SIZE(xn->hash); i++) + WARN_ON_ONCE(!hlist_empty(&xn->hash[i])); +} - for (i = 0; i < ARRAY_SIZE(rateest_hash); i++) - INIT_HLIST_HEAD(&rateest_hash[i]); +static struct pernet_operations xt_rateest_net_ops = { + .init = xt_rateest_net_init, + .exit = xt_rateest_net_exit, + .id = &xt_rateest_id, + .size = sizeof(struct xt_rateest_net), +}; + +static int __init xt_rateest_tg_init(void) +{ + int err = register_pernet_subsys(&xt_rateest_net_ops); + if (err) + return err; return xt_register_target(&xt_rateest_tg_reg); } static void __exit xt_rateest_tg_fini(void) { xt_unregister_target(&xt_rateest_tg_reg); + unregister_pernet_subsys(&xt_rateest_net_ops); } diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index 755d2f6693a2..bf77326861af 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -95,13 +95,13 @@ static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par) } ret = -ENOENT; - est1 = xt_rateest_lookup(info->name1); + est1 = xt_rateest_lookup(par->net, info->name1); if (!est1) goto err1; est2 = NULL; if (info->flags & XT_RATEEST_MATCH_REL) { - est2 = xt_rateest_lookup(info->name2); + est2 = xt_rateest_lookup(par->net, info->name2); if (!est2) goto err2; } @@ -111,7 +111,7 @@ static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par) return 0; err2: - xt_rateest_put(est1); + xt_rateest_put(par->net, est1); err1: return ret; } @@ -120,9 +120,9 @@ static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par) { struct xt_rateest_match_info *info = par->matchinfo; - xt_rateest_put(info->est1); + xt_rateest_put(par->net, info->est1); if (info->est2) - xt_rateest_put(info->est2); + xt_rateest_put(par->net, info->est2); } static struct xt_match xt_rateest_mt_reg __read_mostly = { -- cgit v1.2.3-70-g09d2 From 010eacd968a73ddcb8592b14c1607e1004120ede Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 2 Mar 2018 14:59:54 +0100 Subject: netfilter: xt_limit: Spelling s/maxmum/maximum/ Signed-off-by: Geert Uytterhoeven Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_limit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 55d18cd67635..9f098ecb2449 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -46,7 +46,7 @@ MODULE_ALIAS("ip6t_limit"); See Alexey's formal explanation in net/sched/sch_tbf.c. - To get the maxmum range, we multiply by this factor (ie. you get N + To get the maximum range, we multiply by this factor (ie. you get N credits per jiffy). We want to allow a rate as low as 1 per day (slowest userspace tool allows), which means CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */ -- cgit v1.2.3-70-g09d2 From 415a13296a1a49639cabf9d2fe92267810caa47a Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 5 Mar 2018 15:49:59 -0600 Subject: xfrm_policy: use true and false for boolean values Assign true or false to boolean variables instead of an integer value. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 12bd415d349e..2b7babb66175 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1740,7 +1740,7 @@ static void xfrm_pcpu_work_fn(struct work_struct *work) void xfrm_policy_cache_flush(void) { struct xfrm_dst *old; - bool found = 0; + bool found = false; int cpu; might_sleep(); -- cgit v1.2.3-70-g09d2 From 30855ffc29b972d3316e2adcf8029c00c36fad60 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 1 Mar 2018 15:23:28 +0300 Subject: net: Make account struct net to memcg The patch adds SLAB_ACCOUNT to flags of net_cachep cache, which enables accounting of struct net memory to memcg kmem. Since number of net_namespaces may be significant, user want to know, how much there were consumed, and control. Note, that we do not account net_generic to the same memcg, where net was accounted, moreover, we don't do this at all (*). We do not want the situation, when single memcg memory deficit prevents us to register new pernet_operations. (*)Even despite there is !current process accounting already available in linux-next. See kmalloc_memcg() there for the details. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/core/net_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 690e78c6af45..c340d5cfbdec 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -882,7 +882,7 @@ static int __init net_ns_init(void) #ifdef CONFIG_NET_NS net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), SMP_CACHE_BYTES, - SLAB_PANIC, NULL); + SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Create workqueue for cleanup */ netns_wq = create_singlethread_workqueue("netns"); -- cgit v1.2.3-70-g09d2 From ed63afb8a318f6b3558d76afba7809daee4f28e5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 5 Mar 2018 20:44:18 +0800 Subject: sctp: add support for PR-SCTP Information for sendmsg This patch is to add support for PR-SCTP Information for sendmsg, as described in section 5.3.7 of RFC6458. With this option, you can specify pr_policy and pr_value for user data in sendmsg. It's also a necessary send info for sctp_sendv. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + include/uapi/linux/sctp.h | 15 +++++++++++++++ net/sctp/socket.c | 31 ++++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 03e92dda1813..d40a2a329888 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2112,6 +2112,7 @@ struct sctp_cmsgs { struct sctp_initmsg *init; struct sctp_sndrcvinfo *srinfo; struct sctp_sndinfo *sinfo; + struct sctp_prinfo *prinfo; }; /* Structure for tracking memory objects */ diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 4c4db14786bd..0dd1f82a4fa8 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -260,6 +260,19 @@ struct sctp_nxtinfo { sctp_assoc_t nxt_assoc_id; }; +/* 5.3.7 SCTP PR-SCTP Information Structure (SCTP_PRINFO) + * + * This cmsghdr structure specifies SCTP options for sendmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ ------------------- + * IPPROTO_SCTP SCTP_PRINFO struct sctp_prinfo + */ +struct sctp_prinfo { + __u16 pr_policy; + __u32 pr_value; +}; + /* * sinfo_flags: 16 bits (unsigned integer) * @@ -293,6 +306,8 @@ typedef enum sctp_cmsg_type { #define SCTP_RCVINFO SCTP_RCVINFO SCTP_NXTINFO, /* 5.3.6 SCTP Next Receive Information Structure */ #define SCTP_NXTINFO SCTP_NXTINFO + SCTP_PRINFO, /* 5.3.7 SCTP PR-SCTP Information Structure */ +#define SCTP_PRINFO SCTP_PRINFO } sctp_cmsg_t; /* diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 7fa76031bb08..fdde697b37e7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1644,6 +1644,12 @@ static int sctp_sendmsg_parse(struct sock *sk, struct sctp_cmsgs *cmsgs, srinfo->sinfo_assoc_id = cmsgs->sinfo->snd_assoc_id; } + if (cmsgs->prinfo) { + srinfo->sinfo_timetolive = cmsgs->prinfo->pr_value; + SCTP_PR_SET_POLICY(srinfo->sinfo_flags, + cmsgs->prinfo->pr_policy); + } + sflags = srinfo->sinfo_flags; if (!sflags && msg_len) return 0; @@ -1901,9 +1907,12 @@ static void sctp_sendmsg_update_sinfo(struct sctp_association *asoc, sinfo->sinfo_ppid = asoc->default_ppid; sinfo->sinfo_context = asoc->default_context; sinfo->sinfo_assoc_id = sctp_assoc2id(asoc); + + if (!cmsgs->prinfo) + sinfo->sinfo_flags = asoc->default_flags; } - if (!cmsgs->srinfo) + if (!cmsgs->srinfo && !cmsgs->prinfo) sinfo->sinfo_timetolive = asoc->default_timetolive; } @@ -7749,6 +7758,26 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; + case SCTP_PRINFO: + /* SCTP Socket API Extension + * 5.3.7 SCTP PR-SCTP Information Structure (SCTP_PRINFO) + * + * This cmsghdr structure specifies SCTP options for sendmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_PRINFO struct sctp_prinfo + */ + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_prinfo))) + return -EINVAL; + + cmsgs->prinfo = CMSG_DATA(cmsg); + if (cmsgs->prinfo->pr_policy & ~SCTP_PR_SCTP_MASK) + return -EINVAL; + + if (cmsgs->prinfo->pr_policy == SCTP_PR_SCTP_NONE) + cmsgs->prinfo->pr_value = 0; + break; default: return -EINVAL; } -- cgit v1.2.3-70-g09d2 From 2c0dbaa0c43d04d8d6daf52adb724c5789676b15 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 5 Mar 2018 20:44:19 +0800 Subject: sctp: add support for SCTP_DSTADDRV4/6 Information for sendmsg This patch is to add support for Destination IPv4/6 Address options for sendmsg, as described in section 5.3.9/10 of RFC6458. With this option, you can provide more than one destination addrs to sendmsg when creating asoc, like sctp_connectx. It's also a necessary send info for sctp_sendv. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + include/uapi/linux/sctp.h | 6 ++++ net/sctp/socket.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index d40a2a329888..ec6e46b7e119 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2113,6 +2113,7 @@ struct sctp_cmsgs { struct sctp_sndrcvinfo *srinfo; struct sctp_sndinfo *sinfo; struct sctp_prinfo *prinfo; + struct msghdr *addrs_msg; }; /* Structure for tracking memory objects */ diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 0dd1f82a4fa8..a1bc35098033 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -308,6 +308,12 @@ typedef enum sctp_cmsg_type { #define SCTP_NXTINFO SCTP_NXTINFO SCTP_PRINFO, /* 5.3.7 SCTP PR-SCTP Information Structure */ #define SCTP_PRINFO SCTP_PRINFO + SCTP_AUTHINFO, /* 5.3.8 SCTP AUTH Information Structure (RESERVED) */ +#define SCTP_AUTHINFO SCTP_AUTHINFO + SCTP_DSTADDRV4, /* 5.3.9 SCTP Destination IPv4 Address Structure */ +#define SCTP_DSTADDRV4 SCTP_DSTADDRV4 + SCTP_DSTADDRV6, /* 5.3.10 SCTP Destination IPv6 Address Structure */ +#define SCTP_DSTADDRV6 SCTP_DSTADDRV6 } sctp_cmsg_t; /* diff --git a/net/sctp/socket.c b/net/sctp/socket.c index fdde697b37e7..067b57a330b1 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1676,6 +1676,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, struct net *net = sock_net(sk); struct sctp_association *asoc; enum sctp_scope scope; + struct cmsghdr *cmsg; int err = -EINVAL; *tp = NULL; @@ -1741,6 +1742,67 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, goto free; } + if (!cmsgs->addrs_msg) + return 0; + + /* sendv addr list parse */ + for_each_cmsghdr(cmsg, cmsgs->addrs_msg) { + struct sctp_transport *transport; + struct sctp_association *old; + union sctp_addr _daddr; + int dlen; + + if (cmsg->cmsg_level != IPPROTO_SCTP || + (cmsg->cmsg_type != SCTP_DSTADDRV4 && + cmsg->cmsg_type != SCTP_DSTADDRV6)) + continue; + + daddr = &_daddr; + memset(daddr, 0, sizeof(*daddr)); + dlen = cmsg->cmsg_len - sizeof(struct cmsghdr); + if (cmsg->cmsg_type == SCTP_DSTADDRV4) { + if (dlen < sizeof(struct in_addr)) + goto free; + + dlen = sizeof(struct in_addr); + daddr->v4.sin_family = AF_INET; + daddr->v4.sin_port = htons(asoc->peer.port); + memcpy(&daddr->v4.sin_addr, CMSG_DATA(cmsg), dlen); + } else { + if (dlen < sizeof(struct in6_addr)) + goto free; + + dlen = sizeof(struct in6_addr); + daddr->v6.sin6_family = AF_INET6; + daddr->v6.sin6_port = htons(asoc->peer.port); + memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen); + } + err = sctp_verify_addr(sk, daddr, sizeof(*daddr)); + if (err) + goto free; + + old = sctp_endpoint_lookup_assoc(ep, daddr, &transport); + if (old && old != asoc) { + if (old->state >= SCTP_STATE_ESTABLISHED) + err = -EISCONN; + else + err = -EALREADY; + goto free; + } + + if (sctp_endpoint_is_peeled_off(ep, daddr)) { + err = -EADDRNOTAVAIL; + goto free; + } + + transport = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, + SCTP_UNKNOWN); + if (!transport) { + err = -ENOMEM; + goto free; + } + } + return 0; free: @@ -7778,6 +7840,21 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) if (cmsgs->prinfo->pr_policy == SCTP_PR_SCTP_NONE) cmsgs->prinfo->pr_value = 0; break; + case SCTP_DSTADDRV4: + case SCTP_DSTADDRV6: + /* SCTP Socket API Extension + * 5.3.9/10 SCTP Destination IPv4/6 Address Structure (SCTP_DSTADDRV4/6) + * + * This cmsghdr structure specifies SCTP options for sendmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_DSTADDRV4 struct in_addr + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_DSTADDRV6 struct in6_addr + */ + cmsgs->addrs_msg = my_msg; + break; default: return -EINVAL; } -- cgit v1.2.3-70-g09d2 From 4910280503f3af2857d5aa77e35b22d93a8960a8 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 5 Mar 2018 20:44:20 +0800 Subject: sctp: add support for snd flag SCTP_SENDALL process in sendmsg This patch is to add support for snd flag SCTP_SENDALL process in sendmsg, as described in section 5.3.4 of RFC6458. With this flag, you can send the same data to all the asocs of this sk once. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 2 ++ net/sctp/socket.c | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index a1bc35098033..e94b6d297ad9 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -284,6 +284,8 @@ enum sctp_sinfo_flags { SCTP_ADDR_OVER = (1 << 1), /* Override the primary destination. */ SCTP_ABORT = (1 << 2), /* Send an ABORT message to the peer. */ SCTP_SACK_IMMEDIATELY = (1 << 3), /* SACK should be sent without delay. */ + /* 2 bits here have been used by SCTP_PR_SCTP_MASK */ + SCTP_SENDALL = (1 << 6), SCTP_NOTIFICATION = MSG_NOTIFICATION, /* Next message is not user msg but notification. */ SCTP_EOF = MSG_FIN, /* Initiate graceful shutdown process. */ }; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 067b57a330b1..7d3476a4860d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1820,6 +1820,10 @@ static int sctp_sendmsg_check_sflags(struct sctp_association *asoc, if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP)) return -EPIPE; + if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP) && + !sctp_state(asoc, ESTABLISHED)) + return 0; + if (sflags & SCTP_EOF) { pr_debug("%s: shutting down association:%p\n", __func__, asoc); sctp_primitive_SHUTDOWN(net, asoc, NULL); @@ -2007,6 +2011,29 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) lock_sock(sk); + /* SCTP_SENDALL process */ + if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP)) { + list_for_each_entry(asoc, &ep->asocs, asocs) { + err = sctp_sendmsg_check_sflags(asoc, sflags, msg, + msg_len); + if (err == 0) + continue; + if (err < 0) + goto out_unlock; + + sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs); + + err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, + NULL, sinfo); + if (err < 0) + goto out_unlock; + + iov_iter_revert(&msg->msg_iter, err); + } + + goto out_unlock; + } + /* Get and check or create asoc */ if (daddr) { asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport); @@ -7792,8 +7819,8 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) if (cmsgs->srinfo->sinfo_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | - SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK | - SCTP_ABORT | SCTP_EOF)) + SCTP_SACK_IMMEDIATELY | SCTP_SENDALL | + SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; @@ -7816,8 +7843,8 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) if (cmsgs->sinfo->snd_flags & ~(SCTP_UNORDERED | SCTP_ADDR_OVER | - SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK | - SCTP_ABORT | SCTP_EOF)) + SCTP_SACK_IMMEDIATELY | SCTP_SENDALL | + SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF)) return -EINVAL; break; case SCTP_PRINFO: -- cgit v1.2.3-70-g09d2 From c9efb15a9981400c797665dc4844ab562ed3c424 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 5 Mar 2018 15:56:14 -0600 Subject: tipc: bcast: use true and false for boolean values Assign true or false to boolean variables instead of an integer value. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Acked-by: Ying Xue Signed-off-by: David S. Miller --- net/tipc/bcast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 37892b3909af..f3711176be45 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -574,5 +574,5 @@ void tipc_nlist_purge(struct tipc_nlist *nl) { tipc_dest_list_purge(&nl->list); nl->remote = 0; - nl->local = 0; + nl->local = false; } -- cgit v1.2.3-70-g09d2 From 9a21ac943240d0353b726495d08e377a257ace52 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 5 Mar 2018 16:11:54 -0600 Subject: ipv6: ndisc: use true and false for boolean values Assign true or false to boolean variables instead of an integer value. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 0a19ce3a6f7f..8af5eef464c1 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -527,7 +527,7 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, } if (!dev->addr_len) - inc_opt = 0; + inc_opt = false; if (inc_opt) optlen += ndisc_opt_addr_space(dev, NDISC_NEIGHBOUR_ADVERTISEMENT); -- cgit v1.2.3-70-g09d2 From 334e6413134bf8335ec93a9e60213cbc61ef7a62 Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Wed, 7 Mar 2018 09:40:57 -0800 Subject: sock: Fix SO_ZEROCOPY switch case Fix the SO_ZEROCOPY switch case on sock_setsockopt() avoiding the ret values to be overwritten by the one set on the default case. Fixes: 28190752c7092 ("sock: permit SO_ZEROCOPY on PF_RDS socket") Signed-off-by: Jesus Sanchez-Palencia Acked-by: Willem de Bruijn Acked-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/core/sock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 507d8c6c4319..27f218bba43f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1062,8 +1062,9 @@ set_rcvbuf: ret = -EINVAL; else sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); - break; } + break; + default: ret = -ENOPROTOOPT; break; -- cgit v1.2.3-70-g09d2 From d40a126b16ea851f858f97b0e214d97c8355cecb Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Tue, 6 Mar 2018 07:22:33 -0800 Subject: rds: refactor zcopy code into rds_message_zcopy_from_user Move the large block of code predicated on zcopy from rds_message_copy_from_user into a new function, rds_message_zcopy_from_user() Signed-off-by: Sowmini Varadhan Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/rds/message.c | 108 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 60 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index 116cf87ccb89..c36edbb92c34 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -333,14 +333,14 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in return rm; } -int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, - bool zcopy) +int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from) { - unsigned long to_copy, nbytes; unsigned long sg_off; struct scatterlist *sg; int ret = 0; int length = iov_iter_count(from); + int total_copied = 0; + struct sk_buff *skb; rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); @@ -350,54 +350,66 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, sg = rm->data.op_sg; sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ - if (zcopy) { - int total_copied = 0; - struct sk_buff *skb; - - skb = alloc_skb(0, GFP_KERNEL); - if (!skb) - return -ENOMEM; - BUILD_BUG_ON(sizeof(skb->cb) < - max_t(int, sizeof(struct rds_znotifier), - sizeof(struct rds_zcopy_cookies))); - rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb); - if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp, - length)) { - ret = -ENOMEM; + skb = alloc_skb(0, GFP_KERNEL); + if (!skb) + return -ENOMEM; + BUILD_BUG_ON(sizeof(skb->cb) < max_t(int, sizeof(struct rds_znotifier), + sizeof(struct rds_zcopy_cookies))); + rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb); + if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp, + length)) { + ret = -ENOMEM; + goto err; + } + while (iov_iter_count(from)) { + struct page *pages; + size_t start; + ssize_t copied; + + copied = iov_iter_get_pages(from, &pages, PAGE_SIZE, + 1, &start); + if (copied < 0) { + struct mmpin *mmp; + int i; + + for (i = 0; i < rm->data.op_nents; i++) + put_page(sg_page(&rm->data.op_sg[i])); + mmp = &rm->data.op_mmp_znotifier->z_mmp; + mm_unaccount_pinned_pages(mmp); + ret = -EFAULT; goto err; } - while (iov_iter_count(from)) { - struct page *pages; - size_t start; - ssize_t copied; - - copied = iov_iter_get_pages(from, &pages, PAGE_SIZE, - 1, &start); - if (copied < 0) { - struct mmpin *mmp; - int i; - - for (i = 0; i < rm->data.op_nents; i++) - put_page(sg_page(&rm->data.op_sg[i])); - mmp = &rm->data.op_mmp_znotifier->z_mmp; - mm_unaccount_pinned_pages(mmp); - ret = -EFAULT; - goto err; - } - total_copied += copied; - iov_iter_advance(from, copied); - length -= copied; - sg_set_page(sg, pages, copied, start); - rm->data.op_nents++; - sg++; - } - WARN_ON_ONCE(length != 0); - return ret; + total_copied += copied; + iov_iter_advance(from, copied); + length -= copied; + sg_set_page(sg, pages, copied, start); + rm->data.op_nents++; + sg++; + } + WARN_ON_ONCE(length != 0); + return ret; err: - consume_skb(skb); - rm->data.op_mmp_znotifier = NULL; - return ret; - } /* zcopy */ + consume_skb(skb); + rm->data.op_mmp_znotifier = NULL; + return ret; +} + +int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, + bool zcopy) +{ + unsigned long to_copy, nbytes; + unsigned long sg_off; + struct scatterlist *sg; + int ret = 0; + + rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); + + /* now allocate and copy in the data payload. */ + sg = rm->data.op_sg; + sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ + + if (zcopy) + return rds_message_zcopy_from_user(rm, from); while (iov_iter_count(from)) { if (!sg_page(sg)) { -- cgit v1.2.3-70-g09d2 From 9426bbc6de99b8649d897b94e8f5916b58195643 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Tue, 6 Mar 2018 07:22:34 -0800 Subject: rds: use list structure to track information for zerocopy completion notification Commit 401910db4cd4 ("rds: deliver zerocopy completion notification with data") removes support fo r zerocopy completion notification on the sk_error_queue, thus we no longer need to track the cookie information in sk_buff structures. This commit removes the struct sk_buff_head rs_zcookie_queue by a simpler list that results in a smaller memory footprint as well as more efficient memory_allocation time. Signed-off-by: Sowmini Varadhan Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/rds/af_rds.c | 6 ++--- net/rds/message.c | 77 +++++++++++++++++++++++++++++++++++-------------------- net/rds/rds.h | 23 ++++++++++++----- net/rds/recv.c | 23 ++++++++++++----- 4 files changed, 85 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index f7126108a811..ab751a150f70 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -77,7 +77,7 @@ static int rds_release(struct socket *sock) rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); - __skb_queue_purge(&rs->rs_zcookie_queue); + rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue); spin_lock_bh(&rds_sock_lock); list_del_init(&rs->rs_item); @@ -180,7 +180,7 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, } if (!list_empty(&rs->rs_recv_queue) || !list_empty(&rs->rs_notify_queue) || - !skb_queue_empty(&rs->rs_zcookie_queue)) + !list_empty(&rs->rs_zcookie_queue.zcookie_head)) mask |= (EPOLLIN | EPOLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); @@ -515,7 +515,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) INIT_LIST_HEAD(&rs->rs_recv_queue); INIT_LIST_HEAD(&rs->rs_notify_queue); INIT_LIST_HEAD(&rs->rs_cong_list); - skb_queue_head_init(&rs->rs_zcookie_queue); + rds_message_zcopy_queue_init(&rs->rs_zcookie_queue); spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; rs->rs_rx_traces = 0; diff --git a/net/rds/message.c b/net/rds/message.c index c36edbb92c34..90dcdcfe9f62 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -48,7 +48,6 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_GEN_NUM] = sizeof(u32), }; - void rds_message_addref(struct rds_message *rm) { rdsdebug("addref rm %p ref %d\n", rm, refcount_read(&rm->m_refcount)); @@ -56,9 +55,9 @@ void rds_message_addref(struct rds_message *rm) } EXPORT_SYMBOL_GPL(rds_message_addref); -static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie) +static inline bool rds_zcookie_add(struct rds_msg_zcopy_info *info, u32 cookie) { - struct rds_zcopy_cookies *ck = (struct rds_zcopy_cookies *)skb->cb; + struct rds_zcopy_cookies *ck = &info->zcookies; int ncookies = ck->num; if (ncookies == RDS_MAX_ZCOOKIES) @@ -68,38 +67,61 @@ static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie) return true; } +struct rds_msg_zcopy_info *rds_info_from_znotifier(struct rds_znotifier *znotif) +{ + return container_of(znotif, struct rds_msg_zcopy_info, znotif); +} + +void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *q) +{ + unsigned long flags; + LIST_HEAD(copy); + struct rds_msg_zcopy_info *info, *tmp; + + spin_lock_irqsave(&q->lock, flags); + list_splice(&q->zcookie_head, ©); + INIT_LIST_HEAD(&q->zcookie_head); + spin_unlock_irqrestore(&q->lock, flags); + + list_for_each_entry_safe(info, tmp, ©, rs_zcookie_next) { + list_del(&info->rs_zcookie_next); + kfree(info); + } +} + static void rds_rm_zerocopy_callback(struct rds_sock *rs, struct rds_znotifier *znotif) { - struct sk_buff *skb, *tail; - unsigned long flags; - struct sk_buff_head *q; + struct rds_msg_zcopy_info *info; + struct rds_msg_zcopy_queue *q; u32 cookie = znotif->z_cookie; struct rds_zcopy_cookies *ck; + struct list_head *head; + unsigned long flags; + mm_unaccount_pinned_pages(&znotif->z_mmp); q = &rs->rs_zcookie_queue; spin_lock_irqsave(&q->lock, flags); - tail = skb_peek_tail(q); - - if (tail && skb_zcookie_add(tail, cookie)) { - spin_unlock_irqrestore(&q->lock, flags); - mm_unaccount_pinned_pages(&znotif->z_mmp); - consume_skb(rds_skb_from_znotifier(znotif)); - /* caller invokes rds_wake_sk_sleep() */ - return; + head = &q->zcookie_head; + if (!list_empty(head)) { + info = list_entry(head, struct rds_msg_zcopy_info, + rs_zcookie_next); + if (info && rds_zcookie_add(info, cookie)) { + spin_unlock_irqrestore(&q->lock, flags); + kfree(rds_info_from_znotifier(znotif)); + /* caller invokes rds_wake_sk_sleep() */ + return; + } } - skb = rds_skb_from_znotifier(znotif); - ck = (struct rds_zcopy_cookies *)skb->cb; + info = rds_info_from_znotifier(znotif); + ck = &info->zcookies; memset(ck, 0, sizeof(*ck)); - WARN_ON(!skb_zcookie_add(skb, cookie)); - - __skb_queue_tail(q, skb); + WARN_ON(!rds_zcookie_add(info, cookie)); + list_add_tail(&q->zcookie_head, &info->rs_zcookie_next); spin_unlock_irqrestore(&q->lock, flags); /* caller invokes rds_wake_sk_sleep() */ - - mm_unaccount_pinned_pages(&znotif->z_mmp); } /* @@ -340,7 +362,7 @@ int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from) int ret = 0; int length = iov_iter_count(from); int total_copied = 0; - struct sk_buff *skb; + struct rds_msg_zcopy_info *info; rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); @@ -350,12 +372,11 @@ int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from) sg = rm->data.op_sg; sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ - skb = alloc_skb(0, GFP_KERNEL); - if (!skb) + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) return -ENOMEM; - BUILD_BUG_ON(sizeof(skb->cb) < max_t(int, sizeof(struct rds_znotifier), - sizeof(struct rds_zcopy_cookies))); - rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb); + INIT_LIST_HEAD(&info->rs_zcookie_next); + rm->data.op_mmp_znotifier = &info->znotif; if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp, length)) { ret = -ENOMEM; @@ -389,7 +410,7 @@ int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from) WARN_ON_ONCE(length != 0); return ret; err: - consume_skb(skb); + kfree(info); rm->data.op_mmp_znotifier = NULL; return ret; } diff --git a/net/rds/rds.h b/net/rds/rds.h index 33b16353d8f3..74cd27c661de 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -357,16 +357,27 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) #define RDS_MSG_FLUSH 8 struct rds_znotifier { - struct list_head z_list; struct mmpin z_mmp; u32 z_cookie; }; -#define RDS_ZCOPY_SKB(__skb) ((struct rds_znotifier *)&((__skb)->cb[0])) +struct rds_msg_zcopy_info { + struct list_head rs_zcookie_next; + union { + struct rds_znotifier znotif; + struct rds_zcopy_cookies zcookies; + }; +}; -static inline struct sk_buff *rds_skb_from_znotifier(struct rds_znotifier *z) +struct rds_msg_zcopy_queue { + struct list_head zcookie_head; + spinlock_t lock; /* protects zcookie_head queue */ +}; + +static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q) { - return container_of((void *)z, struct sk_buff, cb); + spin_lock_init(&q->lock); + INIT_LIST_HEAD(&q->zcookie_head); } struct rds_message { @@ -603,8 +614,7 @@ struct rds_sock { /* Socket receive path trace points*/ u8 rs_rx_traces; u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; - - struct sk_buff_head rs_zcookie_queue; + struct rds_msg_zcopy_queue rs_zcookie_queue; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) @@ -803,6 +813,7 @@ void rds_message_addref(struct rds_message *rm); void rds_message_put(struct rds_message *rm); void rds_message_wait(struct rds_message *rm); void rds_message_unmapped(struct rds_message *rm); +void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *info); static inline void rds_message_make_checksum(struct rds_header *hdr) { diff --git a/net/rds/recv.c b/net/rds/recv.c index d50747725221..de50e2126e40 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -579,9 +579,10 @@ out: static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg) { - struct sk_buff *skb; - struct sk_buff_head *q = &rs->rs_zcookie_queue; + struct rds_msg_zcopy_queue *q = &rs->rs_zcookie_queue; + struct rds_msg_zcopy_info *info = NULL; struct rds_zcopy_cookies *done; + unsigned long flags; if (!msg->msg_control) return false; @@ -590,16 +591,24 @@ static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg) msg->msg_controllen < CMSG_SPACE(sizeof(*done))) return false; - skb = skb_dequeue(q); - if (!skb) + spin_lock_irqsave(&q->lock, flags); + if (!list_empty(&q->zcookie_head)) { + info = list_entry(q->zcookie_head.next, + struct rds_msg_zcopy_info, rs_zcookie_next); + list_del(&info->rs_zcookie_next); + } + spin_unlock_irqrestore(&q->lock, flags); + if (!info) return false; - done = (struct rds_zcopy_cookies *)skb->cb; + done = &info->zcookies; if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done), done)) { - skb_queue_head(q, skb); + spin_lock_irqsave(&q->lock, flags); + list_add(&info->rs_zcookie_next, &q->zcookie_head); + spin_unlock_irqrestore(&q->lock, flags); return false; } - consume_skb(skb); + kfree(info); return true; } -- cgit v1.2.3-70-g09d2 From a366e300ae9fc466d333e6d8f2bc5d58ed248041 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Mar 2018 08:43:19 -0800 Subject: ip6mr: remove synchronize_rcu() in favor of SOCK_RCU_FREE Kirill found that recently added synchronize_rcu() call in ip6mr_sk_done() was slowing down netns dismantle and posted a patch to use it only if the socket was found. I instead suggested to get rid of this call, and use instead SOCK_RCU_FREE We might later change IPv4 side to use the same technique and unify both stacks. IPv4 does not use synchronize_rcu() but has a call_rcu() that could be replaced by SOCK_RCU_FREE. Tested: time for i in {1..1000}; do unshare -n /bin/false;done Before : real 7m18.911s After : real 10.187s Fixes: 8571ab479a6e ("ip6mr: Make mroute_sk rcu-based") Signed-off-by: Eric Dumazet Reported-by: Kirill Tkhai Cc: Yuval Mintz Reviewed-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/ip6mr.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 2a38f9de45d3..7345bd6c4b7d 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1443,6 +1443,7 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) err = -EADDRINUSE; } else { rcu_assign_pointer(mrt->mroute_sk, sk); + sock_set_flag(sk, SOCK_RCU_FREE); net->ipv6.devconf_all->mc_forwarding++; } write_unlock_bh(&mrt_lock); @@ -1472,6 +1473,10 @@ int ip6mr_sk_done(struct sock *sk) if (sk == rtnl_dereference(mrt->mroute_sk)) { write_lock_bh(&mrt_lock); RCU_INIT_POINTER(mrt->mroute_sk, NULL); + /* Note that mroute_sk had SOCK_RCU_FREE set, + * so the RCU grace period before sk freeing + * is guaranteed by sk_destruct() + */ net->ipv6.devconf_all->mc_forwarding--; write_unlock_bh(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, @@ -1485,7 +1490,6 @@ int ip6mr_sk_done(struct sock *sk) } } rtnl_unlock(); - synchronize_rcu(); return err; } -- cgit v1.2.3-70-g09d2 From 67ae686b3e1422a819500f35152cdd74f6dab6ce Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 8 Mar 2018 12:52:25 +0200 Subject: devlink: Change dpipe/resource get privileges Let dpipe/resource be retrieved by unprivileged users. Signed-off-by: Arkadi Sharshevsky Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/devlink.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 1b5bf0d1cee9..f23e5ed7c90f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2744,22 +2744,22 @@ static const struct genl_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, .doit = devlink_nl_cmd_dpipe_table_get, .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET, .doit = devlink_nl_cmd_dpipe_entries_get, .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET, .doit = devlink_nl_cmd_dpipe_headers_get, .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET, @@ -2779,8 +2779,8 @@ static const struct genl_ops devlink_nl_ops[] = { .cmd = DEVLINK_CMD_RESOURCE_DUMP, .doit = devlink_nl_cmd_resource_dump, .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ }, { .cmd = DEVLINK_CMD_RELOAD, -- cgit v1.2.3-70-g09d2 From 459d153d9916ea48b1550bbb6f2959dc03bff011 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Tue, 6 Mar 2018 18:11:14 +0100 Subject: net/sched: cls_flower: Add support to handle first frag as match field Allow setting firstfrag as matching option in tc flower classifier. # tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 \ ip_flags firstfrag action mirred egress redirect dev eth1 Signed-off-by: Pieter Jansen van Vuuren Signed-off-by: Simon Horman Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 1 + net/sched/cls_flower.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 7cafb26df555..be05e66c167b 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -475,6 +475,7 @@ enum { enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), }; /* Match-all classifier */ diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 7d0ce2c40f93..d964e60c730e 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -511,6 +511,9 @@ static int fl_set_key_flags(struct nlattr **tb, fl_set_key_flag(key, mask, flags_key, flags_mask, TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); + fl_set_key_flag(key, mask, flags_key, flags_mask, + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST, + FLOW_DIS_FIRST_FRAG); return 0; } @@ -1130,6 +1133,9 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) fl_get_key_flag(flags_key, flags_mask, &key, &mask, TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); + fl_get_key_flag(flags_key, flags_mask, &key, &mask, + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST, + FLOW_DIS_FIRST_FRAG); _key = cpu_to_be32(key); _mask = cpu_to_be32(mask); -- cgit v1.2.3-70-g09d2 From 997266a4a02de882de11c7abecad0a3a42ac84b3 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:39:06 +0300 Subject: net: Convert ip6 tables pernet_operations The pernet_operations: ip6table_filter_net_ops ip6table_mangle_net_ops ip6table_nat_net_ops ip6table_raw_net_ops ip6table_security_net_ops have exit methods, which call ip6t_unregister_table(). ip6table_filter_net_ops has init method registering filter table. Since there must not be in-flight ipv6 packets at the time of pernet_operations execution and since pernet_operations don't send ipv6 packets each other, these pernet_operations are safe to be async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/netfilter/ip6table_filter.c | 1 + net/ipv6/netfilter/ip6table_mangle.c | 1 + net/ipv6/netfilter/ip6table_nat.c | 1 + net/ipv6/netfilter/ip6table_raw.c | 1 + net/ipv6/netfilter/ip6table_security.c | 1 + 5 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 1343077dde93..06561c84c0bc 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -87,6 +87,7 @@ static void __net_exit ip6table_filter_net_exit(struct net *net) static struct pernet_operations ip6table_filter_net_ops = { .init = ip6table_filter_net_init, .exit = ip6table_filter_net_exit, + .async = true, }; static int __init ip6table_filter_init(void) diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index b0524b18c4fb..a11e25936b45 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -107,6 +107,7 @@ static void __net_exit ip6table_mangle_net_exit(struct net *net) static struct pernet_operations ip6table_mangle_net_ops = { .exit = ip6table_mangle_net_exit, + .async = true, }; static int __init ip6table_mangle_init(void) diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 47306e45a80a..4475fd300bb6 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -131,6 +131,7 @@ static void __net_exit ip6table_nat_net_exit(struct net *net) static struct pernet_operations ip6table_nat_net_ops = { .exit = ip6table_nat_net_exit, + .async = true, }; static int __init ip6table_nat_init(void) diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 710fa0806c37..a88f3b1995b1 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -75,6 +75,7 @@ static void __net_exit ip6table_raw_net_exit(struct net *net) static struct pernet_operations ip6table_raw_net_ops = { .exit = ip6table_raw_net_exit, + .async = true, }; static int __init ip6table_raw_init(void) diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index cf26ccb04056..320048c008dc 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -74,6 +74,7 @@ static void __net_exit ip6table_security_net_exit(struct net *net) static struct pernet_operations ip6table_security_net_ops = { .exit = ip6table_security_net_exit, + .async = true, }; static int __init ip6table_security_init(void) -- cgit v1.2.3-70-g09d2 From 649b9826cc733fe410207d28d94984354e023b21 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:39:14 +0300 Subject: net: Convert xfrm_user_net_ops These pernet_operations create and destroy net::xfrm::nlsk socket of NETLINK_XFRM. There is only entry point, where it's dereferenced, it's xfrm_user_rcv_msg(). There is no in-kernel senders to this socket. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 7f52b8eb177d..aff2e84ec761 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3258,6 +3258,7 @@ static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list) static struct pernet_operations xfrm_user_net_ops = { .init = xfrm_user_net_init, .exit_batch = xfrm_user_net_exit, + .async = true, }; static int __init xfrm_user_init(void) -- cgit v1.2.3-70-g09d2 From c7c5e435e44e585f84472f88b323553c794844c4 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:39:23 +0300 Subject: net: Convert nf_tables_net_ops These pernet_operations looks nicely separated per-net. Exit method unregisters net's nf tables objects. We allow them be executed in parallel. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/nf_tables_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 558593e6a0a3..8e19c86d1aa6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6596,6 +6596,7 @@ static void __net_exit nf_tables_exit_net(struct net *net) static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .exit = nf_tables_exit_net, + .async = true, }; static int __init nf_tables_module_init(void) -- cgit v1.2.3-70-g09d2 From 5a8e9be69d163fcd3442c4ed8fbaaaf0c7f464e6 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:39:33 +0300 Subject: net: Convert nfnetlink_net_ops These pernet_operations create and destroy net::nfnl socket of NETLINK_NETFILTER code. There are no other places, where such type the socket is created, except these pernet_operations. It seem other pernet_operations depending on CONFIG_NETFILTER_NETLINK send messages to this socket. So, we mark it async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/nfnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 03ead8a9e90c..84fc4954862d 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -566,6 +566,7 @@ static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list) static struct pernet_operations nfnetlink_net_ops = { .init = nfnetlink_net_init, .exit_batch = nfnetlink_net_exit_batch, + .async = true, }; static int __init nfnetlink_init(void) -- cgit v1.2.3-70-g09d2 From cf51503a03f7ff89d8152a3132843330be90729e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:39:42 +0300 Subject: net: Convert nfnl_acct_ops These pernet_operations look closed in themself, and there are no other users of net::nfnl_acct_list outside. They are safe to be executed for several net namespaces in parallel. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/nfnetlink_acct.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 88d427f9f9e6..8d9f18bb8840 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -515,6 +515,7 @@ static void __net_exit nfnl_acct_net_exit(struct net *net) static struct pernet_operations nfnl_acct_ops = { .init = nfnl_acct_net_init, .exit = nfnl_acct_net_exit, + .async = true, }; static int __init nfnl_acct_init(void) -- cgit v1.2.3-70-g09d2 From ffdf72bc1eed9a56aa266bde2d425f85b4d4ca18 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:39:51 +0300 Subject: net: Convert cttimeout_ops These pernet_operations also look closed in themself. Exit method touch only per-net structures, so it's safe to execute them for several net namespaces in parallel. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/nfnetlink_cttimeout.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 95b04702a655..6819300f7fb7 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -586,6 +586,7 @@ static void __net_exit cttimeout_net_exit(struct net *net) static struct pernet_operations cttimeout_ops = { .init = cttimeout_net_init, .exit = cttimeout_net_exit, + .async = true, }; static int __init cttimeout_init(void) -- cgit v1.2.3-70-g09d2 From 74f26bbf505a8e1bb9a6ad9726d5bbe625607783 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:40:00 +0300 Subject: net: Convert nfnl_log_net_ops These pernet_operations create and destroy /proc entries. Also, exit method unsets nfulnl_logger. The logger is not set by default, and it becomes bound via userspace request. So, they look safe to be made async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/nfnetlink_log.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 7b46aa4c478d..b21ef79849a1 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -1108,6 +1108,7 @@ static struct pernet_operations nfnl_log_net_ops = { .exit = nfnl_log_net_exit, .id = &nfnl_log_net_id, .size = sizeof(struct nfnl_log_net), + .async = true, }; static int __init nfnetlink_log_init(void) -- cgit v1.2.3-70-g09d2 From bd54dce0796522b669f5be45b96e02fa94738de9 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:40:09 +0300 Subject: net: Convert nfnl_queue_net_ops These pernet_operations register and unregister net::nf::queue_handler and /proc entry. The handler is accessed only under RCU, so this looks safe to convert them. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/nfnetlink_queue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 8bba23160a68..59e2833c17f1 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1528,6 +1528,7 @@ static struct pernet_operations nfnl_queue_net_ops = { .exit_batch = nfnl_queue_net_exit_batch, .id = &nfnl_queue_net_id, .size = sizeof(struct nfnl_queue_net), + .async = true, }; static int __init nfnetlink_queue_init(void) -- cgit v1.2.3-70-g09d2 From 59d269731e2bcfb72d29f2e0281d6631aef1ff8e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:40:19 +0300 Subject: net: Convert pg_net_ops These pernet_operations create per-net pktgen threads and /proc entries. These pernet subsys looks closed in itself, and there are no pernet_operations outside this file, which are interested in the threads. Init and/or exit methods look safe to be executed in parallel. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/core/pktgen.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b8ab5c829511..d81bddd1bb80 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3851,6 +3851,7 @@ static struct pernet_operations pg_net_ops = { .exit = pg_net_exit, .id = &pg_net_id, .size = sizeof(struct pktgen_net), + .async = true, }; static int __init pg_init(void) -- cgit v1.2.3-70-g09d2 From 93623f2b00297f0eb4437fc2e47ad0d4fd58a3ad Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:40:28 +0300 Subject: net: Convert arptable_filter_net_ops These pernet_operations unregister net::ipv4::arptable_filter. Another net/pernet_operations do not send arp packets to foreign net namespaces. So, we mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/netfilter/arptable_filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 8f8713b4388f..49c2490193ae 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -65,6 +65,7 @@ static void __net_exit arptable_filter_net_exit(struct net *net) static struct pernet_operations arptable_filter_net_ops = { .exit = arptable_filter_net_exit, + .async = true, }; static int __init arptable_filter_init(void) -- cgit v1.2.3-70-g09d2 From 7ba81869d1f6f985e64031e161b4c009cc15be99 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:40:36 +0300 Subject: net: Convert iptable_mangle_net_ops These pernet_operations unregister net::ipv4::iptable_mangle table. Another net/pernet_operations do not send ipv4 packets to foreign net namespaces. So, we mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/netfilter/iptable_mangle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index dea138ca8925..f6074059531a 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -113,6 +113,7 @@ static void __net_exit iptable_mangle_net_exit(struct net *net) static struct pernet_operations iptable_mangle_net_ops = { .exit = iptable_mangle_net_exit, + .async = true, }; static int __init iptable_mangle_init(void) -- cgit v1.2.3-70-g09d2 From 06a8a67b5dac661a0f0b865926ceab2e62d512cd Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:40:45 +0300 Subject: net: Convert iptable_nat_net_ops These pernet_operations unregister net::ipv4::nat_table table. Another net/pernet_operations do not send ipv4 packets to foreign net namespaces. So, we mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/netfilter/iptable_nat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index 0f7255cc65ee..b771af74be79 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -129,6 +129,7 @@ static void __net_exit iptable_nat_net_exit(struct net *net) static struct pernet_operations iptable_nat_net_ops = { .exit = iptable_nat_net_exit, + .async = true, }; static int __init iptable_nat_init(void) -- cgit v1.2.3-70-g09d2 From 65f828c35261c46d848e9d789ccf29c2fe7127a8 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:40:58 +0300 Subject: net: Convert iptable_raw_net_ops These pernet_operations unregister net::ipv4::iptable_raw table. Another net/pernet_operations do not send ipv4 packets to foreign net namespaces. So, we mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/netfilter/iptable_raw.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 960625aabf04..963753e50842 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -76,6 +76,7 @@ static void __net_exit iptable_raw_net_exit(struct net *net) static struct pernet_operations iptable_raw_net_ops = { .exit = iptable_raw_net_exit, + .async = true, }; static int __init iptable_raw_init(void) -- cgit v1.2.3-70-g09d2 From 8dbc6e2eaeccccecd23888bd14c10a2c384c280f Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:41:07 +0300 Subject: net: Convert iptable_security_net_ops These pernet_operations unregister net::ipv4::iptable_security table. Another net/pernet_operations do not send ipv4 packets to foreign net namespaces. So, we mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/netfilter/iptable_security.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index e5379fe57b64..c40d6b3d8b6a 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -76,6 +76,7 @@ static void __net_exit iptable_security_net_exit(struct net *net) static struct pernet_operations iptable_security_net_ops = { .exit = iptable_security_net_exit, + .async = true, }; static int __init iptable_security_init(void) -- cgit v1.2.3-70-g09d2 From e8a95ad46378162f22c9293bde276d7c4030f31e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:41:16 +0300 Subject: net: Convert ipv4_net_ops These pernet_operations register and unregister bunch of nf_conntrack_l4proto. Exit method unregisters related sysctl, init method calls init_net and get_net_proto. The whole builtin_l4proto4 array has pretty simple init_net and get_net_proto methods. The first one register sysctl table, the second one is just RO memory dereference. So, these pernet_operations are safe to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index b50721d9d30e..6531f69db010 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -399,6 +399,7 @@ static struct pernet_operations ipv4_net_ops = { .exit = ipv4_net_exit, .id = &conntrack4_net_id, .size = sizeof(struct conntrack4_net), + .async = true, }; static int __init nf_conntrack_l3proto_ipv4_init(void) -- cgit v1.2.3-70-g09d2 From 1fd2c55705aebac31bf8f759c0750dc3c796cf47 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 7 Mar 2018 12:41:23 +0300 Subject: net: Convet ipv6_net_ops These pernet_operations are similar to ipv4_net_ops. They are safe to be async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 663827ee3cf8..ba54bb3bd1e4 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -401,6 +401,7 @@ static struct pernet_operations ipv6_net_ops = { .exit = ipv6_net_exit, .id = &conntrack6_net_id, .size = sizeof(struct conntrack6_net), + .async = true, }; static int __init nf_conntrack_l3proto_ipv6_init(void) -- cgit v1.2.3-70-g09d2 From 46e371f0e78a82186a83cbcb4f4b8850417c7dd5 Mon Sep 17 00:00:00 2001 From: William Tu Date: Wed, 7 Mar 2018 15:38:48 -0800 Subject: openvswitch: fix vport packet length check. When sending a packet to a tunnel device, the dev's hard_header_len could be larger than the skb->len in function packet_length(). In the case of ip6gretap/erspan, hard_header_len = LL_MAX_HEADER + t_hlen, which is around 180, and an ARP packet sent to this tunnel has skb->len = 42. This causes the 'unsign int length' to become super large because it is negative value, causing the later ovs_vport_send to drop it due to over-mtu size. The patch fixes it by setting it to 0. Signed-off-by: William Tu Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/vport.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index b6c8524032a0..f81c1d0ddff4 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -464,10 +464,10 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, return 0; } -static unsigned int packet_length(const struct sk_buff *skb, - struct net_device *dev) +static int packet_length(const struct sk_buff *skb, + struct net_device *dev) { - unsigned int length = skb->len - dev->hard_header_len; + int length = skb->len - dev->hard_header_len; if (!skb_vlan_tag_present(skb) && eth_type_vlan(skb->protocol)) @@ -478,7 +478,7 @@ static unsigned int packet_length(const struct sk_buff *skb, * account for 802.1ad. e.g. is_skb_forwardable(). */ - return length; + return length > 0 ? length : 0; } void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) -- cgit v1.2.3-70-g09d2 From 50db64b090d0f457a3266fe80e2c841eba621b9d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Mar 2018 12:36:04 +0300 Subject: net/ncsi: use kfree_skb() instead of kfree() We're supposed to use kfree_skb() to free these sk_buffs. Fixes: 955dc68cb9b2 ("net/ncsi: Add generic netlink family") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/ncsi/ncsi-netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index d4201665a580..b73239b76349 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -183,7 +183,7 @@ static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info) hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &ncsi_genl_family, 0, NCSI_CMD_PKG_INFO); if (!hdr) { - kfree(skb); + kfree_skb(skb); return -EMSGSIZE; } @@ -204,7 +204,7 @@ static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info) err: genlmsg_cancel(skb, hdr); - kfree(skb); + kfree_skb(skb); return rc; } -- cgit v1.2.3-70-g09d2 From 054f34da60dfa46457ca39dd111bb2b393575dab Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Mar 2018 12:36:28 +0300 Subject: net/ncsi: unlock on error in ncsi_set_interface_nl() There are two error paths which are missing unlocks in this function. Fixes: 955dc68cb9b2 ("net/ncsi: Add generic netlink family") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/ncsi/ncsi-netlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index b73239b76349..05fcfb4fbe1d 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -299,6 +299,7 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info) package = np; if (!package) { /* The user has set a package that does not exist */ + spin_unlock_irqrestore(&ndp->lock, flags); return -ERANGE; } @@ -317,6 +318,7 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info) /* The user has set a channel that does not exist on this * package */ + spin_unlock_irqrestore(&ndp->lock, flags); netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n", channel_id); return -ERANGE; -- cgit v1.2.3-70-g09d2 From 496c7f3caed5c56ba4ab6767a9cf7e7aa8bd8f41 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Thu, 8 Mar 2018 18:56:14 +0800 Subject: rds: rds_message_zcopy_from_user() can be static Fixes: d40a126b16ea ("rds: refactor zcopy code into rds_message_zcopy_from_user") Signed-off-by: Fengguang Wu Acked-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/message.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index 90dcdcfe9f62..3c610d5fe7ae 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -355,7 +355,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in return rm; } -int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from) +static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from) { unsigned long sg_off; struct scatterlist *sg; -- cgit v1.2.3-70-g09d2 From 571e6776add4f499661e761e03e46ec0f6d66243 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Thu, 8 Mar 2018 19:37:30 +0800 Subject: rds: rds_info_from_znotifier() can be static Fixes: 9426bbc6de99 ("rds: use list structure to track information for zerocopy completion notification") Signed-off-by: Fengguang Wu Signed-off-by: David S. Miller --- net/rds/message.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index 3c610d5fe7ae..4462e4c9bd7d 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -67,7 +67,7 @@ static inline bool rds_zcookie_add(struct rds_msg_zcopy_info *info, u32 cookie) return true; } -struct rds_msg_zcopy_info *rds_info_from_znotifier(struct rds_znotifier *znotif) +static struct rds_msg_zcopy_info *rds_info_from_znotifier(struct rds_znotifier *znotif) { return container_of(znotif, struct rds_msg_zcopy_info, znotif); } -- cgit v1.2.3-70-g09d2 From 84a1d9c4820080bebcbd413a845076dcb62f45fa Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 8 Mar 2018 15:45:03 +0000 Subject: net: ethtool: extend RXNFC API to support RSS spreading of filter matches We use a two-step process to configure a filter with RSS spreading. First, the RSS context is allocated and configured using ETHTOOL_SRSSH; this returns an identifier (rss_context) which can then be passed to subsequent invocations of ETHTOOL_SRXCLSRLINS to specify that the offset from the RSS indirection table lookup should be added to the queue number (ring_cookie) when delivering the packet. Drivers for devices which can only use the indirection table entry directly (not add it to a base queue number) should reject rule insertions combining RSS with a nonzero ring_cookie. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/ethtool.h | 5 ++++ include/uapi/linux/ethtool.h | 32 +++++++++++++++++----- net/core/ethtool.c | 64 +++++++++++++++++++++++++++++++++----------- 3 files changed, 80 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 2ec41a7eb54f..ebe41811ed34 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -371,6 +371,11 @@ struct ethtool_ops { u8 *hfunc); int (*set_rxfh)(struct net_device *, const u32 *indir, const u8 *key, const u8 hfunc); + int (*get_rxfh_context)(struct net_device *, u32 *indir, u8 *key, + u8 *hfunc, u32 rss_context); + int (*set_rxfh_context)(struct net_device *, const u32 *indir, + const u8 *key, const u8 hfunc, + u32 *rss_context, bool delete); void (*get_channels)(struct net_device *, struct ethtool_channels *); int (*set_channels)(struct net_device *, struct ethtool_channels *); int (*get_dump_flag)(struct net_device *, struct ethtool_dump *); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 44a0b675a6bc..20da156aaf64 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -914,12 +914,15 @@ static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) * @flow_type: Type of flow to be affected, e.g. %TCP_V4_FLOW * @data: Command-dependent value * @fs: Flow classification rule + * @rss_context: RSS context to be affected * @rule_cnt: Number of rules to be affected * @rule_locs: Array of used rule locations * * For %ETHTOOL_GRXFH and %ETHTOOL_SRXFH, @data is a bitmask indicating * the fields included in the flow hash, e.g. %RXH_IP_SRC. The following - * structure fields must not be used. + * structure fields must not be used, except that if @flow_type includes + * the %FLOW_RSS flag, then @rss_context determines which RSS context to + * act on. * * For %ETHTOOL_GRXRINGS, @data is set to the number of RX rings/queues * on return. @@ -931,7 +934,9 @@ static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) * set in @data then special location values should not be used. * * For %ETHTOOL_GRXCLSRULE, @fs.@location specifies the location of an - * existing rule on entry and @fs contains the rule on return. + * existing rule on entry and @fs contains the rule on return; if + * @fs.@flow_type includes the %FLOW_RSS flag, then @rss_context is + * filled with the RSS context ID associated with the rule. * * For %ETHTOOL_GRXCLSRLALL, @rule_cnt specifies the array size of the * user buffer for @rule_locs on entry. On return, @data is the size @@ -942,7 +947,11 @@ static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie) * For %ETHTOOL_SRXCLSRLINS, @fs specifies the rule to add or update. * @fs.@location either specifies the location to use or is a special * location value with %RX_CLS_LOC_SPECIAL flag set. On return, - * @fs.@location is the actual rule location. + * @fs.@location is the actual rule location. If @fs.@flow_type + * includes the %FLOW_RSS flag, @rss_context is the RSS context ID to + * use for flow spreading traffic which matches this rule. The value + * from the rxfh indirection table will be added to @fs.@ring_cookie + * to choose which ring to deliver to. * * For %ETHTOOL_SRXCLSRLDEL, @fs.@location specifies the location of an * existing rule on entry. @@ -963,7 +972,10 @@ struct ethtool_rxnfc { __u32 flow_type; __u64 data; struct ethtool_rx_flow_spec fs; - __u32 rule_cnt; + union { + __u32 rule_cnt; + __u32 rss_context; + }; __u32 rule_locs[0]; }; @@ -990,7 +1002,11 @@ struct ethtool_rxfh_indir { /** * struct ethtool_rxfh - command to get/set RX flow hash indir or/and hash key. * @cmd: Specific command number - %ETHTOOL_GRSSH or %ETHTOOL_SRSSH - * @rss_context: RSS context identifier. + * @rss_context: RSS context identifier. Context 0 is the default for normal + * traffic; other contexts can be referenced as the destination for RX flow + * classification rules. %ETH_RXFH_CONTEXT_ALLOC is used with command + * %ETHTOOL_SRSSH to allocate a new RSS context; on return this field will + * contain the ID of the newly allocated context. * @indir_size: On entry, the array size of the user buffer for the * indirection table, which may be zero, or (for %ETHTOOL_SRSSH), * %ETH_RXFH_INDIR_NO_CHANGE. On return from %ETHTOOL_GRSSH, @@ -1009,7 +1025,8 @@ struct ethtool_rxfh_indir { * size should be returned. For %ETHTOOL_SRSSH, an @indir_size of * %ETH_RXFH_INDIR_NO_CHANGE means that indir table setting is not requested * and a @indir_size of zero means the indir table should be reset to default - * values. An hfunc of zero means that hash function setting is not requested. + * values (if @rss_context == 0) or that the RSS context should be deleted. + * An hfunc of zero means that hash function setting is not requested. */ struct ethtool_rxfh { __u32 cmd; @@ -1021,6 +1038,7 @@ struct ethtool_rxfh { __u32 rsvd32; __u32 rss_config[0]; }; +#define ETH_RXFH_CONTEXT_ALLOC 0xffffffff #define ETH_RXFH_INDIR_NO_CHANGE 0xffffffff /** @@ -1635,6 +1653,8 @@ static inline int ethtool_validate_duplex(__u8 duplex) /* Flag to enable additional fields in struct ethtool_rx_flow_spec */ #define FLOW_EXT 0x80000000 #define FLOW_MAC_EXT 0x40000000 +/* Flag to enable RSS spreading of traffic matching rule (nfc only) */ +#define FLOW_RSS 0x20000000 /* L3-L4 network traffic flow hash options */ #define RXH_L2DA (1 << 1) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 3f89c76d5c24..157cd9efa4be 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1022,6 +1022,15 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, if (copy_from_user(&info, useraddr, info_size)) return -EFAULT; + /* If FLOW_RSS was requested then user-space must be using the + * new definition, as FLOW_RSS is newer. + */ + if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { + info_size = sizeof(info); + if (copy_from_user(&info, useraddr, info_size)) + return -EFAULT; + } + if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) @@ -1251,9 +1260,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, user_key_size = rxfh.key_size; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || - rxfh.rsvd8[2] || rxfh.rsvd32) + if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; + /* Most drivers don't handle rss_context, check it's 0 as well */ + if (rxfh.rss_context && !ops->get_rxfh_context) + return -EOPNOTSUPP; rxfh.indir_size = dev_indir_size; rxfh.key_size = dev_key_size; @@ -1276,7 +1287,12 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (user_key_size) hkey = rss_config + indir_bytes; - ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); + if (rxfh.rss_context) + ret = dev->ethtool_ops->get_rxfh_context(dev, indir, hkey, + &dev_hfunc, + rxfh.rss_context); + else + ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); if (ret) goto out; @@ -1306,6 +1322,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, u8 *hkey = NULL; u8 *rss_config; u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); + bool delete = false; if (!ops->get_rxnfc || !ops->set_rxfh) return -EOPNOTSUPP; @@ -1319,9 +1336,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, return -EFAULT; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || - rxfh.rsvd8[2] || rxfh.rsvd32) + if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; + /* Most drivers don't handle rss_context, check it's 0 as well */ + if (rxfh.rss_context && !ops->set_rxfh_context) + return -EOPNOTSUPP; /* If either indir, hash key or function is valid, proceed further. * Must request at least one change: indir size, hash key or function. @@ -1346,7 +1365,8 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (ret) goto out; - /* rxfh.indir_size == 0 means reset the indir table to default. + /* rxfh.indir_size == 0 means reset the indir table to default (master + * context) or delete the context (other RSS contexts). * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged. */ if (rxfh.indir_size && @@ -1359,9 +1379,13 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (ret) goto out; } else if (rxfh.indir_size == 0) { - indir = (u32 *)rss_config; - for (i = 0; i < dev_indir_size; i++) - indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + if (rxfh.rss_context == 0) { + indir = (u32 *)rss_config; + for (i = 0; i < dev_indir_size; i++) + indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + } else { + delete = true; + } } if (rxfh.key_size) { @@ -1374,15 +1398,25 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } } - ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); + if (rxfh.rss_context) + ret = ops->set_rxfh_context(dev, indir, hkey, rxfh.hfunc, + &rxfh.rss_context, delete); + else + ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); if (ret) goto out; - /* indicate whether rxfh was set to default */ - if (rxfh.indir_size == 0) - dev->priv_flags &= ~IFF_RXFH_CONFIGURED; - else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) - dev->priv_flags |= IFF_RXFH_CONFIGURED; + if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context), + &rxfh.rss_context, sizeof(rxfh.rss_context))) + ret = -EFAULT; + + if (!rxfh.rss_context) { + /* indicate whether rxfh was set to default */ + if (rxfh.indir_size == 0) + dev->priv_flags &= ~IFF_RXFH_CONFIGURED; + else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) + dev->priv_flags |= IFF_RXFH_CONFIGURED; + } out: kfree(rss_config); -- cgit v1.2.3-70-g09d2 From 79134e6ce2c9d1a00eab4d98cb48f975dd2474cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Mar 2018 12:51:41 -0800 Subject: net: do not create fallback tunnels for non-default namespaces fallback tunnels (like tunl0, gre0, gretap0, erspan0, sit0, ip6tnl0, ip6gre0) are automatically created when the corresponding module is loaded. These tunnels are also automatically created when a new network namespace is created, at a great cost. In many cases, netns are used for isolation purposes, and these extra network devices are a waste of resources. We are using thousands of netns per host, and hit the netns creation/delete bottleneck a lot. (Many thanks to Kirill for recent work on this) Add a new sysctl so that we can opt-out from this automatic creation. Note that these tunnels are still created for the initial namespace, to be the least intrusive for typical setups. Tested: lpk43:~# cat add_del_unshare.sh for i in `seq 1 40` do (for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) & done wait lpk43:~# echo 0 >/proc/sys/net/core/fb_tunnels_only_for_init_net lpk43:~# time ./add_del_unshare.sh real 0m37.521s user 0m0.886s sys 7m7.084s lpk43:~# echo 1 >/proc/sys/net/core/fb_tunnels_only_for_init_net lpk43:~# time ./add_del_unshare.sh real 0m4.761s user 0m0.851s sys 1m8.343s lpk43:~# Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 12 ++++++++++++ include/linux/netdevice.h | 7 +++++++ include/net/ip_tunnels.h | 2 ++ net/core/sysctl_net_core.c | 12 ++++++++++++ net/ipv4/ip_tunnel.c | 20 ++++++++++++-------- net/ipv6/ip6_gre.c | 4 +++- net/ipv6/ip6_tunnel.c | 2 ++ net/ipv6/sit.c | 5 ++++- 8 files changed, 54 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 35c62f522754..5992602469d8 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -270,6 +270,18 @@ optmem_max Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence of struct cmsghdr structures with appended data. +fb_tunnels_only_for_init_net +---------------------------- + +Controls if fallback tunnels (like tunl0, gre0, gretap0, erspan0, +sit0, ip6tnl0, ip6gre0) are automatically created when a new +network namespace is created, if corresponding tunnel is present +in initial network namespace. +If set to 1, these devices are not automatically created, and +user space is responsible for creating them if needed. + +Default : 0 (for compatibility reasons) + 2. /proc/sys/net/unix - Parameters for Unix domain sockets ------------------------------------------------------- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 95a613a7cc1c..9711108c3916 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -585,6 +585,13 @@ struct netdev_queue { #endif } ____cacheline_aligned_in_smp; +extern int sysctl_fb_tunnels_only_for_init_net; + +static inline bool net_has_fallback_tunnels(const struct net *net) +{ + return net == &init_net || !sysctl_fb_tunnels_only_for_init_net; +} + static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) { #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index cbe5addb9293..540a4b4417bf 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -180,8 +180,10 @@ struct tnl_ptk_info { struct ip_tunnel_net { struct net_device *fb_tunnel_dev; + struct rtnl_link_ops *rtnl_link_ops; struct hlist_head tunnels[IP_TNL_HASH_SIZE]; struct ip_tunnel __rcu *collect_md_tun; + int type; }; static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d714f65782b7..4f47f92459cc 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -32,6 +32,9 @@ static int max_skb_frags = MAX_SKB_FRAGS; static int net_msg_warn; /* Unused, but still a sysctl */ +int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0; +EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); + #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -513,6 +516,15 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, + { + .procname = "fb_tunnels_only_for_init_net", + .data = &sysctl_fb_tunnels_only_for_init_net, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { } }; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 602597dfc395..5fcb17cb426b 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -347,8 +347,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, struct net_device *dev; int t_hlen; - BUG_ON(!itn->fb_tunnel_dev); - dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms); + dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms); if (IS_ERR(dev)) return ERR_CAST(dev); @@ -822,7 +821,6 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) struct net *net = t->net; struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); - BUG_ON(!itn->fb_tunnel_dev); switch (cmd) { case SIOCGETTUNNEL: if (dev == itn->fb_tunnel_dev) { @@ -847,7 +845,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) p->o_key = 0; } - t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); + t = ip_tunnel_find(itn, p, itn->type); if (cmd == SIOCADDTUNNEL) { if (!t) { @@ -991,10 +989,15 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct ip_tunnel_parm parms; unsigned int i; + itn->rtnl_link_ops = ops; for (i = 0; i < IP_TNL_HASH_SIZE; i++) INIT_HLIST_HEAD(&itn->tunnels[i]); - if (!ops) { + if (!ops || !net_has_fallback_tunnels(net)) { + struct ip_tunnel_net *it_init_net; + + it_init_net = net_generic(&init_net, ip_tnl_net_id); + itn->type = it_init_net->type; itn->fb_tunnel_dev = NULL; return 0; } @@ -1012,6 +1015,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); + itn->type = itn->fb_tunnel_dev->type; } rtnl_unlock(); @@ -1019,10 +1023,10 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); -static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, +static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, + struct list_head *head, struct rtnl_link_ops *ops) { - struct net *net = dev_net(itn->fb_tunnel_dev); struct net_device *dev, *aux; int h; @@ -1054,7 +1058,7 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { itn = net_generic(net, id); - ip_tunnel_destroy(itn, &list, ops); + ip_tunnel_destroy(net, itn, &list, ops); } unregister_netdevice_many(&list); rtnl_unlock(); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 18a3dfbd0300..7d8775c9570d 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -236,7 +236,7 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, return t; dev = ign->fb_tunnel_dev; - if (dev->flags & IFF_UP) + if (dev && dev->flags & IFF_UP) return netdev_priv(dev); return NULL; @@ -1472,6 +1472,8 @@ static int __net_init ip6gre_init_net(struct net *net) struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); int err; + if (!net_has_fallback_tunnels(net)) + return 0; ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0", NET_NAME_UNKNOWN, ip6gre_tunnel_setup); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 56c4967f1868..5c045fa407da 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2205,6 +2205,8 @@ static int __net_init ip6_tnl_init_net(struct net *net) ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; + if (!net_has_fallback_tunnels(net)) + return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", NET_NAME_UNKNOWN, ip6_tnl_dev_setup); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a9c4ac6efe22..8a4f8fddd812 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -182,7 +182,7 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel *t = netdev_priv(dev); - if (dev == sitn->fb_tunnel_dev) { + if (dev == sitn->fb_tunnel_dev || !sitn->fb_tunnel_dev) { ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0); t->ip6rd.relay_prefix = 0; t->ip6rd.prefixlen = 16; @@ -1835,6 +1835,9 @@ static int __net_init sit_init_net(struct net *net) sitn->tunnels[2] = sitn->tunnels_r; sitn->tunnels[3] = sitn->tunnels_r_l; + if (!net_has_fallback_tunnels(net)) + return 0; + sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", NET_NAME_UNKNOWN, ipip6_tunnel_setup); -- cgit v1.2.3-70-g09d2 From d04e6990c948a3315ea8eca5979ebea48cda56f4 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Thu, 8 Mar 2018 16:59:17 -0500 Subject: net sched actions: update Add/Delete action API with new argument Introduce a new function argument to carry total attributes size for correct allocation of skb in event messages. Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- include/net/act_api.h | 3 ++- net/sched/act_api.c | 21 +++++++++++++-------- net/sched/cls_api.c | 3 ++- 3 files changed, 17 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 9c2f22695025..88c1f99bae46 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -166,7 +166,8 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res); int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions, struct netlink_ext_ack *extack); + struct list_head *actions, size_t *attr_size, + struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, diff --git a/net/sched/act_api.c b/net/sched/act_api.c index a54fa7b8c217..3de0e0610200 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -741,7 +741,8 @@ static void cleanup_a(struct list_head *actions, int ovr) int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions, struct netlink_ext_ack *extack) + struct list_head *actions, size_t *attr_size, + struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; @@ -994,12 +995,13 @@ err_out: static int tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, - u32 portid, struct netlink_ext_ack *extack) + u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { int ret; struct sk_buff *skb; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size, + GFP_KERNEL); if (!skb) return -ENOBUFS; @@ -1032,6 +1034,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, int i, ret; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; + size_t attr_size = 0; LIST_HEAD(actions); ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); @@ -1059,7 +1062,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, if (event == RTM_GETACTION) ret = tcf_get_notify(net, portid, n, &actions, event, extack); else { /* delete */ - ret = tcf_del_notify(net, n, &actions, portid, extack); + ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack); if (ret) goto err; return ret; @@ -1072,12 +1075,13 @@ err: static int tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, - u32 portid, struct netlink_ext_ack *extack) + u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; int err = 0; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size, + GFP_KERNEL); if (!skb) return -ENOBUFS; @@ -1099,15 +1103,16 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, int ovr, struct netlink_ext_ack *extack) { + size_t attr_size = 0; int ret = 0; LIST_HEAD(actions); ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions, - extack); + &attr_size, extack); if (ret) return ret; - return tcf_add_notify(net, n, &actions, portid, extack); + return tcf_add_notify(net, n, &actions, portid, attr_size, extack); } static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 19f9f421d5b7..ec5fe8ec0c3e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1433,6 +1433,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, #ifdef CONFIG_NET_CLS_ACT { struct tc_action *act; + size_t attr_size = 0; if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tp, tb[exts->police], @@ -1450,7 +1451,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, NULL, ovr, TCA_ACT_BIND, - &actions, extack); + &actions, &attr_size, extack); if (err) return err; list_for_each_entry(act, &actions, list) -- cgit v1.2.3-70-g09d2 From 4e76e75d6aba83f35d55b50f66a6768af1657563 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Thu, 8 Mar 2018 16:59:19 -0500 Subject: net sched actions: calculate add/delete event message size Introduce routines to calculate size of the shared tc netlink attributes and the full message size including netlink header and tc service header. Update add/delete action logic to have the size for event messages, the size is passed to tcf_add_notify() and tcf_del_notify() where the notification message is being allocated and constructed. Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- net/sched/act_api.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 3de0e0610200..57cf37145282 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -109,6 +109,42 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) } EXPORT_SYMBOL(__tcf_idr_release); +static size_t tcf_action_shared_attrs_size(const struct tc_action *act) +{ + u32 cookie_len = 0; + + if (act->act_cookie) + cookie_len = nla_total_size(act->act_cookie->len); + + return nla_total_size(0) /* action number nested */ + + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ + + cookie_len /* TCA_ACT_COOKIE */ + + nla_total_size(0) /* TCA_ACT_STATS nested */ + /* TCA_STATS_BASIC */ + + nla_total_size_64bit(sizeof(struct gnet_stats_basic)) + /* TCA_STATS_QUEUE */ + + nla_total_size_64bit(sizeof(struct gnet_stats_queue)) + + nla_total_size(0) /* TCA_OPTIONS nested */ + + nla_total_size(sizeof(struct tcf_t)); /* TCA_GACT_TM */ +} + +static size_t tcf_action_full_attrs_size(size_t sz) +{ + return NLMSG_HDRLEN /* struct nlmsghdr */ + + sizeof(struct tcamsg) + + nla_total_size(0) /* TCA_ACT_TAB nested */ + + sz; +} + +static size_t tcf_action_fill_size(const struct tc_action *act) +{ + size_t sz = tcf_action_shared_attrs_size(act); + + if (act->ops->get_fill_size) + return act->ops->get_fill_size(act) + sz; + return sz; +} + static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct netlink_callback *cb) { @@ -746,6 +782,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; + size_t sz = 0; int err; int i; @@ -761,11 +798,14 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, goto err; } act->order = i; + sz += tcf_action_fill_size(act); if (ovr) act->tcfa_refcnt++; list_add_tail(&act->list, actions); } + *attr_size = tcf_action_full_attrs_size(sz); + /* Remove the temp refcnt which was necessary to protect against * destroying an existing action which was being replaced */ @@ -1056,9 +1096,12 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, goto err; } act->order = i; + attr_size += tcf_action_fill_size(act); list_add_tail(&act->list, &actions); } + attr_size = tcf_action_full_attrs_size(attr_size); + if (event == RTM_GETACTION) ret = tcf_get_notify(net, portid, n, &actions, event, extack); else { /* delete */ -- cgit v1.2.3-70-g09d2 From 9c5c9c573700a0a4a4afa431da3e7d21f2992627 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Thu, 8 Mar 2018 16:59:20 -0500 Subject: net sched actions: implement get_fill_size routine in act_gact Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- net/sched/act_gact.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 74563254e676..88fbb8403565 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -217,6 +217,19 @@ static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static size_t tcf_gact_get_fill_size(const struct tc_action *act) +{ + size_t sz = nla_total_size(sizeof(struct tc_gact)); /* TCA_GACT_PARMS */ + +#ifdef CONFIG_GACT_PROB + if (to_gact(act)->tcfg_ptype) + /* TCA_GACT_PROB */ + sz += nla_total_size(sizeof(struct tc_gact_p)); +#endif + + return sz; +} + static struct tc_action_ops act_gact_ops = { .kind = "gact", .type = TCA_ACT_GACT, @@ -227,6 +240,7 @@ static struct tc_action_ops act_gact_ops = { .init = tcf_gact_init, .walk = tcf_gact_walker, .lookup = tcf_gact_search, + .get_fill_size = tcf_gact_get_fill_size, .size = sizeof(struct tcf_gact), }; -- cgit v1.2.3-70-g09d2 From 35951393bbffa1ce351438bee264b48347c8cbb7 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 8 Mar 2018 23:43:40 -0600 Subject: pktgen: Remove VLA usage In preparation to enabling -Wvla, remove VLA usage and replace it with a fixed-length array instead. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d81bddd1bb80..e2d6ae3b290b 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -907,7 +907,7 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) { size_t copy = min_t(size_t, count, 1023); - char tb[copy + 1]; + char tb[1024]; if (copy_from_user(tb, user_buffer, copy)) return -EFAULT; tb[copy] = 0; -- cgit v1.2.3-70-g09d2 From f5426250a6ecfd1e9b2d5e0daf07565f664aa67d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 9 Mar 2018 10:39:24 +0100 Subject: net: introduce IFF_NO_RX_HANDLER Some network devices - notably ipvlan slave - are not compatible with any kind of rx_handler. Currently the hook can be installed but any configuration (bridge, bond, macsec, ...) is nonfunctional. This change allocates a priv_flag bit to mark such devices and explicitly forbid installing a rx_handler if such bit is set. The new bit is used by ipvlan slave device. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- drivers/net/ipvlan/ipvlan_main.c | 2 ++ include/linux/netdevice.h | 3 +++ net/core/dev.c | 3 +++ 3 files changed, 8 insertions(+) (limited to 'net') diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 23fd5ab180e8..743d37fb034a 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -604,6 +604,8 @@ int ipvlan_link_new(struct net *src_net, struct net_device *dev, */ memcpy(dev->dev_addr, phy_dev->dev_addr, ETH_ALEN); + dev->priv_flags |= IFF_NO_RX_HANDLER; + err = register_netdevice(dev); if (err < 0) return err; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9711108c3916..5fbb9f1da7fd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1397,6 +1397,7 @@ struct net_device_ops { * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external * entity (i.e. the master device for bridged veth) * @IFF_MACSEC: device is a MACsec device + * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1425,6 +1426,7 @@ enum netdev_priv_flags { IFF_RXFH_CONFIGURED = 1<<23, IFF_PHONY_HEADROOM = 1<<24, IFF_MACSEC = 1<<25, + IFF_NO_RX_HANDLER = 1<<26, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1452,6 +1454,7 @@ enum netdev_priv_flags { #define IFF_TEAM IFF_TEAM #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED #define IFF_MACSEC IFF_MACSEC +#define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER /** * struct net_device - The DEVICE structure. diff --git a/net/core/dev.c b/net/core/dev.c index e5b8d42b6410..259abb1515d0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4351,6 +4351,9 @@ int netdev_rx_handler_register(struct net_device *dev, if (netdev_is_rx_handler_busy(dev)) return -EBUSY; + if (dev->priv_flags & IFF_NO_RX_HANDLER) + return -EINVAL; + /* Note: rx_handler_data must be set before rx_handler */ rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); rcu_assign_pointer(dev->rx_handler, rx_handler); -- cgit v1.2.3-70-g09d2 From bbfa047a25c379e467536c5ed2ba00c0cb602ca1 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 12 Mar 2018 11:09:33 -0400 Subject: ipv6: Use ip6_multipath_hash_policy() in rt6_multipath_hash(). Make use of the new helper. Suggested-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f0ae58424c45..81711e3e2604 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1846,7 +1846,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, struct flow_keys hash_keys; u32 mhash; - switch (net->ipv6.sysctl.multipath_hash_policy) { + switch (ip6_multipath_hash_policy(net)) { case 0: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; -- cgit v1.2.3-70-g09d2 From bdf08fc5412045f7648a49791d98cd04f72c1c1f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 11 Mar 2018 17:27:56 +0100 Subject: rds: remove redundant variable 'sg_off' Variable sg_off is assigned a value but it is never read, hence it is redundant and can be removed. Cleans up clang warning: net/rds/message.c:373:2: warning: Value stored to 'sg_off' is never read Signed-off-by: Colin Ian King Acked-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/message.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index 4462e4c9bd7d..a35f76971984 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -357,7 +357,6 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from) { - unsigned long sg_off; struct scatterlist *sg; int ret = 0; int length = iov_iter_count(from); @@ -370,7 +369,6 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter * * now allocate and copy in the data payload. */ sg = rm->data.op_sg; - sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) -- cgit v1.2.3-70-g09d2 From 678f4bda35483473ae7ad674ece4c7c43a000888 Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Sun, 11 Mar 2018 22:12:04 +0100 Subject: net: llc: drop VLA in llc_sap_mcast() Avoid a VLA[1] by using a real constant expression instead of a variable. The compiler should be able to optimize the original code and avoid using an actual VLA. Anyway this change is useful because it will avoid a false positive with -Wvla, it might also help the compiler generating better code. [1] https://lkml.org/lkml/2018/3/7/621 Signed-off-by: Salvatore Mesoraca Signed-off-by: David S. Miller --- net/llc/llc_sap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index d90928f50226..a7f7b8ff4729 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -394,8 +394,9 @@ static void llc_sap_mcast(struct llc_sap *sap, const struct llc_addr *laddr, struct sk_buff *skb) { - int i = 0, count = 256 / sizeof(struct sock *); - struct sock *sk, *stack[count]; + int i = 0; + struct sock *sk; + struct sock *stack[256 / sizeof(struct sock *)]; struct llc_sock *llc; struct hlist_head *dev_hb = llc_sk_dev_hash(sap, skb->dev->ifindex); @@ -408,7 +409,7 @@ static void llc_sap_mcast(struct llc_sap *sap, continue; sock_hold(sk); - if (i < count) + if (i < ARRAY_SIZE(stack)) stack[i++] = sk; else { llc_do_mcast(sap, skb, stack, i); -- cgit v1.2.3-70-g09d2 From de8d5ab2ff6edc8e26822965a30b6aa4e9332025 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 12 Mar 2018 11:48:49 +0200 Subject: net: Make RX-FCS and HW GRO mutually exclusive Same as LRO, hardware GRO cannot be enabled with RX-FCS. When both are requested, hardware GRO will be dropped. Suggested-by: David Miller Signed-off-by: Gal Pressman Signed-off-by: David S. Miller --- net/core/dev.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 259abb1515d0..12a9aad0b057 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7549,10 +7549,17 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } - /* LRO feature cannot be combined with RX-FCS */ - if ((features & NETIF_F_LRO) && (features & NETIF_F_RXFCS)) { - netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); - features &= ~NETIF_F_LRO; + /* LRO/HW-GRO features cannot be combined with RX-FCS */ + if (features & NETIF_F_RXFCS) { + if (features & NETIF_F_LRO) { + netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n"); + features &= ~NETIF_F_LRO; + } + + if (features & NETIF_F_GRO_HW) { + netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n"); + features &= ~NETIF_F_GRO_HW; + } } return features; -- cgit v1.2.3-70-g09d2 From f1cb9d68b4d8789d7f48bd293772095430860d86 Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Sun, 11 Mar 2018 22:07:49 +0100 Subject: net: rds: drop VLA in rds_for_each_conn_info() Avoid VLA[1] by using an already allocated buffer passed by the caller. [1] https://lkml.org/lkml/2018/3/7/621 Signed-off-by: Salvatore Mesoraca Signed-off-by: David S. Miller --- net/rds/connection.c | 2 +- net/rds/ib.c | 3 +++ net/rds/rds.h | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 2da3176bf792..f80792c719eb 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -540,9 +540,9 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), + u64 *buffer, size_t item_len) { - uint64_t buffer[(item_len + 7) / 8]; struct hlist_head *head; struct rds_connection *conn; size_t i; diff --git a/net/rds/ib.c b/net/rds/ib.c index 50a88f3e7e39..02deee29e7f1 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -321,8 +321,11 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { + u64 buffer[(sizeof(struct rds_info_rdma_connection) + 7) / 8]; + rds_for_each_conn_info(sock, len, iter, lens, rds_ib_conn_info_visitor, + buffer, sizeof(struct rds_info_rdma_connection)); } diff --git a/net/rds/rds.h b/net/rds/rds.h index 74cd27c661de..b04c333d9d1c 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -735,6 +735,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), + u64 *buffer, size_t item_len); __printf(2, 3) -- cgit v1.2.3-70-g09d2 From b2c9272ae7d366e85ab277f528c3dddb7f268b12 Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Sun, 11 Mar 2018 22:07:50 +0100 Subject: net: rds: drop VLA in rds_walk_conn_path_info() Avoid VLA[1] by using an already allocated buffer passed by the caller. [1] https://lkml.org/lkml/2018/3/7/621 Signed-off-by: Salvatore Mesoraca Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/connection.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index f80792c719eb..abef75da89a7 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -578,9 +578,9 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_conn_path *, void *), + u64 *buffer, size_t item_len) { - u64 buffer[(item_len + 7) / 8]; struct hlist_head *head; struct rds_connection *conn; size_t i; @@ -649,8 +649,11 @@ static void rds_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { + u64 buffer[(sizeof(struct rds_info_connection) + 7) / 8]; + rds_walk_conn_path_info(sock, len, iter, lens, rds_conn_info_visitor, + buffer, sizeof(struct rds_info_connection)); } -- cgit v1.2.3-70-g09d2 From d98985dd6c2dc69e2ad5f5482a5237fb9487ed99 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 13 Mar 2018 03:03:30 +0000 Subject: sctp: fix error return code in sctp_sendmsg_new_asoc() Return error code -EINVAL in the address len check error handling case since 'err' can be overwrite to 0 by 'err = sctp_verify_addr()' in the for loop. Fixes: 2c0dbaa0c43d ("sctp: add support for SCTP_DSTADDRV4/6 Information for sendmsg") Signed-off-by: Wei Yongjun Acked-by: Neil Horman Reviewed-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 7d3476a4860d..af5cf29b0c65 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1677,7 +1677,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, struct sctp_association *asoc; enum sctp_scope scope; struct cmsghdr *cmsg; - int err = -EINVAL; + int err; *tp = NULL; @@ -1761,16 +1761,20 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags, memset(daddr, 0, sizeof(*daddr)); dlen = cmsg->cmsg_len - sizeof(struct cmsghdr); if (cmsg->cmsg_type == SCTP_DSTADDRV4) { - if (dlen < sizeof(struct in_addr)) + if (dlen < sizeof(struct in_addr)) { + err = -EINVAL; goto free; + } dlen = sizeof(struct in_addr); daddr->v4.sin_family = AF_INET; daddr->v4.sin_port = htons(asoc->peer.port); memcpy(&daddr->v4.sin_addr, CMSG_DATA(cmsg), dlen); } else { - if (dlen < sizeof(struct in6_addr)) + if (dlen < sizeof(struct in6_addr)) { + err = -EINVAL; goto free; + } dlen = sizeof(struct in6_addr); daddr->v6.sin6_family = AF_INET6; -- cgit v1.2.3-70-g09d2 From 2e01ae0ef2db96c530249563383f479f8d9ca183 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Mar 2018 13:36:51 +0300 Subject: net: Convert sctp_defaults_ops These pernet_operations have a deal with sysctl, /proc entries and statistics. Also, there are freeing of net::sctp::addr_waitq queue and net::sctp::local_addr_list in exit method. All of them look pernet-divided, and it seems these items are only interesting for sctp_defaults_ops, which are safe to be executed in parallel. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/sctp/protocol.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 91813e686c67..32be52304f98 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1330,6 +1330,7 @@ static void __net_exit sctp_defaults_exit(struct net *net) static struct pernet_operations sctp_defaults_ops = { .init = sctp_defaults_init, .exit = sctp_defaults_exit, + .async = true, }; static int __net_init sctp_ctrlsock_init(struct net *net) -- cgit v1.2.3-70-g09d2 From bfdfa38ff0e26463e418235c0be03c0f0eca4569 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Mar 2018 13:37:02 +0300 Subject: net: Convert sctp_ctrlsock_ops These pernet_operations create and destroy net::sctp::ctl_sock. Since pernet_operations do not send sctp packets each other, they look safe to be marked as async. Signed-off-by: Kirill Tkhai Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/protocol.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 32be52304f98..606361ee9e4a 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1354,6 +1354,7 @@ static void __net_init sctp_ctrlsock_exit(struct net *net) static struct pernet_operations sctp_ctrlsock_ops = { .init = sctp_ctrlsock_init, .exit = sctp_ctrlsock_exit, + .async = true, }; /* Initialize the universe into something sensible. */ -- cgit v1.2.3-70-g09d2 From afbbc374ab1281f2e5c18278a62a47fb906f0fa4 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Mar 2018 13:37:11 +0300 Subject: net: Convert tipc_net_ops TIPC looks concentrated in itself, and other pernet_operations seem not touching its entities. tipc_net_ops look pernet-divided, and they should be safe to be executed in parallel for several net the same time. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/tipc/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/core.c b/net/tipc/core.c index 0b982d048fb9..04fd91bb11d7 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -105,6 +105,7 @@ static struct pernet_operations tipc_net_ops = { .exit = tipc_exit_net, .id = &tipc_net_id, .size = sizeof(struct tipc_net), + .async = true, }; static int __init tipc_init(void) -- cgit v1.2.3-70-g09d2 From c939a5e4d597d635dee4eab2f27308d50fa606f7 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Mar 2018 13:37:21 +0300 Subject: net: Convert rds_tcp_net_ops These pernet_operations create and destroy sysctl table and listen socket. Also, exit method flushes global workqueue and work. Everything looks per-net safe, so we can mark them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/rds/tcp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 08230a145042..eb04e7fa2467 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -515,6 +515,7 @@ static struct pernet_operations rds_tcp_net_ops = { .exit = rds_tcp_exit_net, .id = &rds_tcp_netid, .size = sizeof(struct rds_tcp_net), + .async = true, }; static void rds_tcp_kill_sock(struct net *net) -- cgit v1.2.3-70-g09d2 From 72597135cdd2fe524f9a185d7f954c2c3980f3ee Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 6 Mar 2018 08:26:00 +0100 Subject: netfilter: x_tables: fix build with CONFIG_COMPAT=n I placed the helpers within CONFIG_COMPAT section, move them outside. Fixes: 472ebdcd15ebdb ("netfilter: x_tables: check error target size too") Fixes: 07a9da51b4b6ae ("netfilter: x_tables: check standard verdicts in core") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 62 ++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 7521e8a72c06..bac932f1c582 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -577,6 +577,37 @@ error: } EXPORT_SYMBOL(xt_check_table_hooks); +static bool verdict_ok(int verdict) +{ + if (verdict > 0) + return true; + + if (verdict < 0) { + int v = -verdict - 1; + + if (verdict == XT_RETURN) + return true; + + switch (v) { + case NF_ACCEPT: return true; + case NF_DROP: return true; + case NF_QUEUE: return true; + default: + break; + } + + return false; + } + + return false; +} + +static bool error_tg_ok(unsigned int usersize, unsigned int kernsize, + const char *msg, unsigned int msglen) +{ + return usersize == kernsize && strnlen(msg, msglen) < msglen; +} + #ifdef CONFIG_COMPAT int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) { @@ -736,37 +767,6 @@ struct compat_xt_error_target { char errorname[XT_FUNCTION_MAXNAMELEN]; }; -static bool verdict_ok(int verdict) -{ - if (verdict > 0) - return true; - - if (verdict < 0) { - int v = -verdict - 1; - - if (verdict == XT_RETURN) - return true; - - switch (v) { - case NF_ACCEPT: return true; - case NF_DROP: return true; - case NF_QUEUE: return true; - default: - break; - } - - return false; - } - - return false; -} - -static bool error_tg_ok(unsigned int usersize, unsigned int kernsize, - const char *msg, unsigned int msglen) -{ - return usersize == kernsize && strnlen(msg, msglen) < msglen; -} - int xt_compat_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset) -- cgit v1.2.3-70-g09d2 From a55efe1d416c345a73bef38848e1ac7109560e12 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 5 Mar 2018 15:35:57 -0600 Subject: ipvs: use true and false for boolean values Assign true or false to boolean variables instead of an integer value. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_lblc.c | 4 ++-- net/netfilter/ipvs/ip_vs_lblcr.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 6a340c94c4b8..942e835caf7f 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -238,7 +238,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc) int i; spin_lock_bh(&svc->sched_lock); - tbl->dead = 1; + tbl->dead = true; for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) { hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblc_del(en); @@ -369,7 +369,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; - tbl->dead = 0; + tbl->dead = false; tbl->svc = svc; /* diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 0627881128da..a5acab25c36b 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -404,7 +404,7 @@ static void ip_vs_lblcr_flush(struct ip_vs_service *svc) struct hlist_node *next; spin_lock_bh(&svc->sched_lock); - tbl->dead = 1; + tbl->dead = true; for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) { hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) { ip_vs_lblcr_free(en); @@ -532,7 +532,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; tbl->rover = 0; tbl->counter = 1; - tbl->dead = 0; + tbl->dead = false; tbl->svc = svc; /* -- cgit v1.2.3-70-g09d2 From a870a02cc963de35452bbed932560ed69725c4f2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 13 Mar 2018 21:58:39 +0100 Subject: pktgen: use dynamic allocation for debug print buffer After the removal of the VLA, we get a harmless warning about a large stack frame: net/core/pktgen.c: In function 'pktgen_if_write': net/core/pktgen.c:1710:1: error: the frame size of 1076 bytes is larger than 1024 bytes [-Werror=frame-larger-than=] The function was previously shown to be safe despite hitting the 1024 bye warning level. To get rid of the annoyging warning, while keeping it readable, this changes it to use strndup_user(). Obviously this is not a fast path, so the kmalloc() overhead can be disregarded. Fixes: 35951393bbff ("pktgen: Remove VLA usage") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/core/pktgen.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index e2d6ae3b290b..fd65761e1fed 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -906,13 +906,14 @@ static ssize_t pktgen_if_write(struct file *file, i += len; if (debug) { - size_t copy = min_t(size_t, count, 1023); - char tb[1024]; - if (copy_from_user(tb, user_buffer, copy)) - return -EFAULT; - tb[copy] = 0; - pr_debug("%s,%lu buffer -:%s:-\n", - name, (unsigned long)count, tb); + size_t copy = min_t(size_t, count + 1, 1024); + char *tp = strndup_user(user_buffer, copy); + + if (IS_ERR(tp)) + return PTR_ERR(tp); + + pr_debug("%s,%zu buffer -:%s:-\n", name, count, tp); + kfree(buf); } if (!strcmp(name, "min_pkt_size")) { -- cgit v1.2.3-70-g09d2 From 41aeefcc38a2643a62db46818a70a781efb40d99 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 13 Mar 2018 11:41:12 +0100 Subject: batman-adv: add DAT cache netlink support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dump the list of DAT cache entries via the netlink socket. Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 20 +++++ net/batman-adv/distributed-arp-table.c | 152 +++++++++++++++++++++++++++++++++ net/batman-adv/distributed-arp-table.h | 8 ++ net/batman-adv/netlink.c | 76 ++++++++++------- 4 files changed, 223 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 56ae28934070..95ab5dbd09fa 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -272,6 +272,21 @@ enum batadv_nl_attrs { */ BATADV_ATTR_BLA_CRC, + /** + * @BATADV_ATTR_DAT_CACHE_IP4ADDRESS: Client IPv4 address + */ + BATADV_ATTR_DAT_CACHE_IP4ADDRESS, + + /** + * @BATADV_ATTR_DAT_CACHE_HWADDRESS: Client MAC address + */ + BATADV_ATTR_DAT_CACHE_HWADDRESS, + + /** + * @BATADV_ATTR_DAT_CACHE_VID: VLAN ID + */ + BATADV_ATTR_DAT_CACHE_VID, + /* add attributes above here, update the policy in netlink.c */ /** @@ -361,6 +376,11 @@ enum batadv_nl_commands { */ BATADV_CMD_GET_BLA_BACKBONE, + /** + * @BATADV_CMD_GET_DAT_CACHE: Query list of DAT cache entries + */ + BATADV_CMD_GET_DAT_CACHE, + /* add new commands above here */ /** diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 4469dcc1558f..75dda9454ccf 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -43,13 +44,19 @@ #include #include #include +#include +#include +#include +#include #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" +#include "netlink.h" #include "originator.h" #include "send.h" +#include "soft-interface.h" #include "translation-table.h" #include "tvlv.h" @@ -851,6 +858,151 @@ out: } #endif +/** + * batadv_dat_cache_dump_entry() - dump one entry of the DAT cache table to a + * netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @dat_entry: entry to dump + * + * Return: 0 or error code. + */ +static int +batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_dat_entry *dat_entry) +{ + int msecs; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_DAT_CACHE); + if (!hdr) + return -ENOBUFS; + + msecs = jiffies_to_msecs(jiffies - dat_entry->last_update); + + if (nla_put_in_addr(msg, BATADV_ATTR_DAT_CACHE_IP4ADDRESS, + dat_entry->ip) || + nla_put(msg, BATADV_ATTR_DAT_CACHE_HWADDRESS, ETH_ALEN, + dat_entry->mac_addr) || + nla_put_u16(msg, BATADV_ATTR_DAT_CACHE_VID, dat_entry->vid) || + nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) { + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; + } + + genlmsg_end(msg, hdr); + return 0; +} + +/** + * batadv_dat_cache_dump_bucket() - dump one bucket of the DAT cache table to + * a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @head: bucket to dump + * @idx_skip: How many entries to skip + * + * Return: 0 or error code. + */ +static int +batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct hlist_head *head, int *idx_skip) +{ + struct batadv_dat_entry *dat_entry; + int idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(dat_entry, head, hash_entry) { + if (idx < *idx_skip) + goto skip; + + if (batadv_dat_cache_dump_entry(msg, portid, seq, + dat_entry)) { + rcu_read_unlock(); + *idx_skip = idx; + + return -EMSGSIZE; + } + +skip: + idx++; + } + rcu_read_unlock(); + + return 0; +} + +/** + * batadv_dat_cache_dump() - dump DAT cache table to a netlink socket + * @msg: buffer for the message + * @cb: callback structure containing arguments + * + * Return: message length. + */ +int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct batadv_hard_iface *primary_if = NULL; + int portid = NETLINK_CB(cb->skb).portid; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_hashtable *hash; + struct batadv_priv *bat_priv; + int bucket = cb->args[0]; + struct hlist_head *head; + int idx = cb->args[1]; + int ifindex; + int ret = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + hash = bat_priv->dat.hash; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + + while (bucket < hash->size) { + head = &hash->table[bucket]; + + if (batadv_dat_cache_dump_bucket(msg, portid, + cb->nlh->nlmsg_seq, head, + &idx)) + break; + + bucket++; + idx = 0; + } + + cb->args[0] = bucket; + cb->args[1] = idx; + + ret = msg->len; + +out: + if (primary_if) + batadv_hardif_put(primary_if); + + if (soft_iface) + dev_put(soft_iface); + + return ret; +} + /** * batadv_arp_get_type() - parse an ARP packet and gets the type * @bat_priv: the bat priv with all the soft interface information diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index e24aa9601c52..a04596028337 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -28,6 +28,7 @@ #include "originator.h" +struct netlink_callback; struct seq_file; struct sk_buff; @@ -81,6 +82,7 @@ batadv_dat_init_own_addr(struct batadv_priv *bat_priv, int batadv_dat_init(struct batadv_priv *bat_priv); void batadv_dat_free(struct batadv_priv *bat_priv); int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset); +int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb); /** * batadv_dat_inc_counter() - increment the correct DAT packet counter @@ -169,6 +171,12 @@ static inline void batadv_dat_free(struct batadv_priv *bat_priv) { } +static inline int +batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv, u8 subtype) { diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 129af56b944d..2b3e7c3f87fa 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -45,6 +45,7 @@ #include "bat_algo.h" #include "bridge_loop_avoidance.h" +#include "distributed-arp-table.h" #include "gateway_client.h" #include "hard-interface.h" #include "originator.h" @@ -64,39 +65,42 @@ static const struct genl_multicast_group batadv_netlink_mcgrps[] = { }; static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { - [BATADV_ATTR_VERSION] = { .type = NLA_STRING }, - [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING }, - [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 }, - [BATADV_ATTR_MESH_IFNAME] = { .type = NLA_STRING }, - [BATADV_ATTR_MESH_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_HARD_IFINDEX] = { .type = NLA_U32 }, - [BATADV_ATTR_HARD_IFNAME] = { .type = NLA_STRING }, - [BATADV_ATTR_HARD_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_ORIG_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_TPMETER_RESULT] = { .type = NLA_U8 }, - [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, - [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, - [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, - [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG }, - [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_TT_TTVN] = { .type = NLA_U8 }, - [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 }, - [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 }, - [BATADV_ATTR_TT_VID] = { .type = NLA_U16 }, - [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 }, - [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG }, - [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 }, - [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_TQ] = { .type = NLA_U8 }, - [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 }, - [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 }, - [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 }, - [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN }, - [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG }, - [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN }, - [BATADV_ATTR_BLA_VID] = { .type = NLA_U16 }, - [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN }, - [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 }, + [BATADV_ATTR_VERSION] = { .type = NLA_STRING }, + [BATADV_ATTR_ALGO_NAME] = { .type = NLA_STRING }, + [BATADV_ATTR_MESH_IFINDEX] = { .type = NLA_U32 }, + [BATADV_ATTR_MESH_IFNAME] = { .type = NLA_STRING }, + [BATADV_ATTR_MESH_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_HARD_IFINDEX] = { .type = NLA_U32 }, + [BATADV_ATTR_HARD_IFNAME] = { .type = NLA_STRING }, + [BATADV_ATTR_HARD_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_ORIG_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TPMETER_RESULT] = { .type = NLA_U8 }, + [BATADV_ATTR_TPMETER_TEST_TIME] = { .type = NLA_U32 }, + [BATADV_ATTR_TPMETER_BYTES] = { .type = NLA_U64 }, + [BATADV_ATTR_TPMETER_COOKIE] = { .type = NLA_U32 }, + [BATADV_ATTR_ACTIVE] = { .type = NLA_FLAG }, + [BATADV_ATTR_TT_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TT_TTVN] = { .type = NLA_U8 }, + [BATADV_ATTR_TT_LAST_TTVN] = { .type = NLA_U8 }, + [BATADV_ATTR_TT_CRC32] = { .type = NLA_U32 }, + [BATADV_ATTR_TT_VID] = { .type = NLA_U16 }, + [BATADV_ATTR_TT_FLAGS] = { .type = NLA_U32 }, + [BATADV_ATTR_FLAG_BEST] = { .type = NLA_FLAG }, + [BATADV_ATTR_LAST_SEEN_MSECS] = { .type = NLA_U32 }, + [BATADV_ATTR_NEIGH_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_TQ] = { .type = NLA_U8 }, + [BATADV_ATTR_THROUGHPUT] = { .type = NLA_U32 }, + [BATADV_ATTR_BANDWIDTH_UP] = { .type = NLA_U32 }, + [BATADV_ATTR_BANDWIDTH_DOWN] = { .type = NLA_U32 }, + [BATADV_ATTR_ROUTER] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_OWN] = { .type = NLA_FLAG }, + [BATADV_ATTR_BLA_ADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_VID] = { .type = NLA_U16 }, + [BATADV_ATTR_BLA_BACKBONE] = { .len = ETH_ALEN }, + [BATADV_ATTR_BLA_CRC] = { .type = NLA_U16 }, + [BATADV_ATTR_DAT_CACHE_IP4ADDRESS] = { .type = NLA_U32 }, + [BATADV_ATTR_DAT_CACHE_HWADDRESS] = { .len = ETH_ALEN }, + [BATADV_ATTR_DAT_CACHE_VID] = { .type = NLA_U16 }, }; /** @@ -604,6 +608,12 @@ static const struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .dumpit = batadv_bla_backbone_dump, }, + { + .cmd = BATADV_CMD_GET_DAT_CACHE, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_dat_cache_dump, + }, }; -- cgit v1.2.3-70-g09d2 From 53dd9a68ba683986ec90497586f94b941bb748a0 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 13 Mar 2018 11:41:13 +0100 Subject: batman-adv: add multicast flags netlink support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dump the list of multicast flags entries via the netlink socket. Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 62 +++++++++++ net/batman-adv/multicast.c | 237 ++++++++++++++++++++++++++++++++++++++++ net/batman-adv/multicast.h | 18 +++ net/batman-adv/netlink.c | 12 ++ 4 files changed, 329 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 95ab5dbd09fa..324a0e1143e7 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -91,6 +91,53 @@ enum batadv_tt_client_flags { BATADV_TT_CLIENT_TEMP = (1 << 11), }; +/** + * enum batadv_mcast_flags_priv - Private, own multicast flags + * + * These are internal, multicast related flags. Currently they describe certain + * multicast related attributes of the segment this originator bridges into the + * mesh. + * + * Those attributes are used to determine the public multicast flags this + * originator is going to announce via TT. + * + * For netlink, if BATADV_MCAST_FLAGS_BRIDGED is unset then all querier + * related flags are undefined. + */ +enum batadv_mcast_flags_priv { + /** + * @BATADV_MCAST_FLAGS_BRIDGED: There is a bridge on top of the mesh + * interface. + */ + BATADV_MCAST_FLAGS_BRIDGED = (1 << 0), + + /** + * @BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS: Whether an IGMP querier + * exists in the mesh + */ + BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS = (1 << 1), + + /** + * @BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS: Whether an MLD querier + * exists in the mesh + */ + BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS = (1 << 2), + + /** + * @BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING: If an IGMP querier + * exists, whether it is potentially shadowing multicast listeners + * (i.e. querier is behind our own bridge segment) + */ + BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING = (1 << 3), + + /** + * @BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING: If an MLD querier + * exists, whether it is potentially shadowing multicast listeners + * (i.e. querier is behind our own bridge segment) + */ + BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING = (1 << 4), +}; + /** * enum batadv_nl_attrs - batman-adv netlink attributes */ @@ -287,6 +334,16 @@ enum batadv_nl_attrs { */ BATADV_ATTR_DAT_CACHE_VID, + /** + * @BATADV_ATTR_MCAST_FLAGS: Per originator multicast flags + */ + BATADV_ATTR_MCAST_FLAGS, + + /** + * @BATADV_ATTR_MCAST_FLAGS_PRIV: Private, own multicast flags + */ + BATADV_ATTR_MCAST_FLAGS_PRIV, + /* add attributes above here, update the policy in netlink.c */ /** @@ -381,6 +438,11 @@ enum batadv_nl_commands { */ BATADV_CMD_GET_DAT_CACHE, + /** + * @BATADV_CMD_GET_MCAST_FLAGS: Query list of multicast flags + */ + BATADV_CMD_GET_MCAST_FLAGS, + /* add new commands above here */ /** diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index c7a1305ca7e7..5615b6abea6f 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -52,14 +53,20 @@ #include #include #include +#include #include #include #include +#include +#include #include +#include #include "hard-interface.h" #include "hash.h" #include "log.h" +#include "netlink.h" +#include "soft-interface.h" #include "translation-table.h" #include "tvlv.h" @@ -1333,6 +1340,236 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) } #endif +/** + * batadv_mcast_mesh_info_put() - put multicast info into a netlink message + * @msg: buffer for the message + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 or error code. + */ +int batadv_mcast_mesh_info_put(struct sk_buff *msg, + struct batadv_priv *bat_priv) +{ + u32 flags = bat_priv->mcast.flags; + u32 flags_priv = BATADV_NO_FLAGS; + + if (bat_priv->mcast.bridged) { + flags_priv |= BATADV_MCAST_FLAGS_BRIDGED; + + if (bat_priv->mcast.querier_ipv4.exists) + flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS; + if (bat_priv->mcast.querier_ipv6.exists) + flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS; + if (bat_priv->mcast.querier_ipv4.shadowing) + flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING; + if (bat_priv->mcast.querier_ipv6.shadowing) + flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING; + } + + if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, flags) || + nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS_PRIV, flags_priv)) + return -EMSGSIZE; + + return 0; +} + +/** + * batadv_mcast_flags_dump_entry() - dump one entry of the multicast flags table + * to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @orig_node: originator to dump the multicast flags of + * + * Return: 0 or error code. + */ +static int +batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_orig_node *orig_node) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, + NLM_F_MULTI, BATADV_CMD_GET_MCAST_FLAGS); + if (!hdr) + return -ENOBUFS; + + if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, + orig_node->orig)) { + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; + } + + if (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, + &orig_node->capabilities)) { + if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, + orig_node->mcast_flags)) { + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; + } + } + + genlmsg_end(msg, hdr); + return 0; +} + +/** + * batadv_mcast_flags_dump_bucket() - dump one bucket of the multicast flags + * table to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @head: bucket to dump + * @idx_skip: How many entries to skip + * + * Return: 0 or error code. + */ +static int +batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, + struct hlist_head *head, long *idx_skip) +{ + struct batadv_orig_node *orig_node; + long idx = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(orig_node, head, hash_entry) { + if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, + &orig_node->capa_initialized)) + continue; + + if (idx < *idx_skip) + goto skip; + + if (batadv_mcast_flags_dump_entry(msg, portid, seq, + orig_node)) { + rcu_read_unlock(); + *idx_skip = idx; + + return -EMSGSIZE; + } + +skip: + idx++; + } + rcu_read_unlock(); + + return 0; +} + +/** + * __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket + * @msg: buffer for the message + * @portid: netlink port + * @seq: Sequence number of netlink message + * @bat_priv: the bat priv with all the soft interface information + * @bucket: current bucket to dump + * @idx: index in current bucket to the next entry to dump + * + * Return: 0 or error code. + */ +static int +__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, u32 seq, + struct batadv_priv *bat_priv, long *bucket, long *idx) +{ + struct batadv_hashtable *hash = bat_priv->orig_hash; + long bucket_tmp = *bucket; + struct hlist_head *head; + long idx_tmp = *idx; + + while (bucket_tmp < hash->size) { + head = &hash->table[bucket_tmp]; + + if (batadv_mcast_flags_dump_bucket(msg, portid, seq, head, + &idx_tmp)) + break; + + bucket_tmp++; + idx_tmp = 0; + } + + *bucket = bucket_tmp; + *idx = idx_tmp; + + return msg->len; +} + +/** + * batadv_mcast_netlink_get_primary() - get primary interface from netlink + * callback + * @cb: netlink callback structure + * @primary_if: the primary interface pointer to return the result in + * + * Return: 0 or error code. + */ +static int +batadv_mcast_netlink_get_primary(struct netlink_callback *cb, + struct batadv_hard_iface **primary_if) +{ + struct batadv_hard_iface *hard_iface = NULL; + struct net *net = sock_net(cb->skb->sk); + struct net_device *soft_iface; + struct batadv_priv *bat_priv; + int ifindex; + int ret = 0; + + ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return -EINVAL; + + soft_iface = dev_get_by_index(net, ifindex); + if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { + ret = -ENODEV; + goto out; + } + + bat_priv = netdev_priv(soft_iface); + + hard_iface = batadv_primary_if_get_selected(bat_priv); + if (!hard_iface || hard_iface->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out; + } + +out: + if (soft_iface) + dev_put(soft_iface); + + if (!ret && primary_if) + *primary_if = hard_iface; + else + batadv_hardif_put(hard_iface); + + return ret; +} + +/** + * batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket + * @msg: buffer for the message + * @cb: callback structure containing arguments + * + * Return: message length. + */ +int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb) +{ + struct batadv_hard_iface *primary_if = NULL; + int portid = NETLINK_CB(cb->skb).portid; + struct batadv_priv *bat_priv; + long *bucket = &cb->args[0]; + long *idx = &cb->args[1]; + int ret; + + ret = batadv_mcast_netlink_get_primary(cb, &primary_if); + if (ret) + return ret; + + bat_priv = netdev_priv(primary_if->soft_iface); + ret = __batadv_mcast_flags_dump(msg, portid, cb->nlh->nlmsg_seq, + bat_priv, bucket, idx); + + batadv_hardif_put(primary_if); + return ret; +} + /** * batadv_mcast_free() - free the multicast optimizations structures * @bat_priv: the bat priv with all the soft interface information diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 6b8594e23da3..3b04ab13f0eb 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -21,6 +21,7 @@ #include "main.h" +struct netlink_callback; struct seq_file; struct sk_buff; @@ -54,6 +55,11 @@ void batadv_mcast_init(struct batadv_priv *bat_priv); int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset); +int batadv_mcast_mesh_info_put(struct sk_buff *msg, + struct batadv_priv *bat_priv); + +int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb); + void batadv_mcast_free(struct batadv_priv *bat_priv); void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node); @@ -72,6 +78,18 @@ static inline int batadv_mcast_init(struct batadv_priv *bat_priv) return 0; } +static inline int +batadv_mcast_mesh_info_put(struct sk_buff *msg, struct batadv_priv *bat_priv) +{ + return 0; +} + +static inline int batadv_mcast_flags_dump(struct sk_buff *msg, + struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + static inline void batadv_mcast_free(struct batadv_priv *bat_priv) { } diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 2b3e7c3f87fa..0d9459b69bdb 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -48,6 +48,7 @@ #include "distributed-arp-table.h" #include "gateway_client.h" #include "hard-interface.h" +#include "multicast.h" #include "originator.h" #include "soft-interface.h" #include "tp_meter.h" @@ -101,6 +102,8 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_DAT_CACHE_IP4ADDRESS] = { .type = NLA_U32 }, [BATADV_ATTR_DAT_CACHE_HWADDRESS] = { .len = ETH_ALEN }, [BATADV_ATTR_DAT_CACHE_VID] = { .type = NLA_U16 }, + [BATADV_ATTR_MCAST_FLAGS] = { .type = NLA_U32 }, + [BATADV_ATTR_MCAST_FLAGS_PRIV] = { .type = NLA_U32 }, }; /** @@ -151,6 +154,9 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface) goto out; #endif + if (batadv_mcast_mesh_info_put(msg, bat_priv)) + goto out; + primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) { hard_iface = primary_if->net_dev; @@ -614,6 +620,12 @@ static const struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .dumpit = batadv_dat_cache_dump, }, + { + .cmd = BATADV_CMD_GET_MCAST_FLAGS, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .dumpit = batadv_mcast_flags_dump, + }, }; -- cgit v1.2.3-70-g09d2 From 29d1df72ce2ac187d457990fe445a16212dcfa19 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 14 Mar 2018 03:07:27 -0500 Subject: pktgen: Fix memory leak in pktgen_if_write _buf_ is an array and the one that must be freed is _tp_ instead. Fixes: a870a02cc963 ("pktgen: use dynamic allocation for debug print buffer") Reported-by: Wang Jian Signed-off-by: Gustavo A. R. Silva Acked-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index fd65761e1fed..545cf08cd558 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -913,7 +913,7 @@ static ssize_t pktgen_if_write(struct file *file, return PTR_ERR(tp); pr_debug("%s,%zu buffer -:%s:-\n", name, count, tp); - kfree(buf); + kfree(tp); } if (!strcmp(name, "min_pkt_size")) { -- cgit v1.2.3-70-g09d2 From ced68234b6a244355aeab07c2681bc49f695eaed Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 14 Mar 2018 12:49:19 -0400 Subject: sock: remove zerocopy sockopt restriction on closed tcp state Socket option SO_ZEROCOPY determines whether the kernel ignores or processes flag MSG_ZEROCOPY on subsequent send calls. This to avoid changing behavior for legacy processes. Limiting the state change to closed sockets is annoying with passive sockets and not necessary for correctness. Once created, zerocopy skbs are processed based on their private state, not this socket flag. Remove the constraint. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/networking/msg_zerocopy.rst | 5 ----- net/core/sock.c | 2 -- 2 files changed, 7 deletions(-) (limited to 'net') diff --git a/Documentation/networking/msg_zerocopy.rst b/Documentation/networking/msg_zerocopy.rst index 291a01264967..fe46d4867e2d 100644 --- a/Documentation/networking/msg_zerocopy.rst +++ b/Documentation/networking/msg_zerocopy.rst @@ -72,11 +72,6 @@ this flag, a process must first signal intent by setting a socket option: if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one))) error(1, errno, "setsockopt zerocopy"); -Setting the socket option only works when the socket is in its initial -(TCP_CLOSED) state. Trying to set the option for a socket returned by accept(), -for example, will lead to an EBUSY error. In this case, the option should be set -to the listening socket and it will be inherited by the accepted sockets. - Transmission ------------ diff --git a/net/core/sock.c b/net/core/sock.c index 27f218bba43f..a8962d912895 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1052,8 +1052,6 @@ set_rcvbuf: if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { if (sk->sk_protocol != IPPROTO_TCP) ret = -ENOTSUPP; - else if (sk->sk_state != TCP_CLOSE) - ret = -EBUSY; } else if (sk->sk_family != PF_RDS) { ret = -ENOTSUPP; } -- cgit v1.2.3-70-g09d2 From c9f4c6cf53bfafb639386a4c094929f13f573e04 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 14 Mar 2018 11:01:00 +0100 Subject: net/smc: pay attention to MAX_ORDER for CQ entries smc allocates a certain number of CQ entries for used RoCE devices. For mlx5 devices the chosen constant number results in a large allocation causing this warning: [13355.124656] WARNING: CPU: 3 PID: 16535 at mm/page_alloc.c:3883 __alloc_pages_nodemask+0x2be/0x10c0 [13355.124657] Modules linked in: smc_diag(O) smc(O) xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 xt_tcpudp bridge stp llc ip6table_filter ip6_tables iptable_filter mlx5_ib ib_core sunrpc mlx5_core s390_trng rng_core ghash_s390 prng aes_s390 des_s390 des_generic sha512_s390 sha256_s390 sha1_s390 sha_common ptp pps_core eadm_sch dm_multipath dm_mod vhost_net tun vhost tap sch_fq_codel kvm ip_tables x_tables autofs4 [last unloaded: smc] [13355.124672] CPU: 3 PID: 16535 Comm: kworker/3:0 Tainted: G O 4.14.0uschi #1 [13355.124673] Hardware name: IBM 3906 M04 704 (LPAR) [13355.124675] Workqueue: events smc_listen_work [smc] [13355.124677] task: 00000000e2f22100 task.stack: 0000000084720000 [13355.124678] Krnl PSW : 0704c00180000000 000000000029da76 (__alloc_pages_nodemask+0x2be/0x10c0) [13355.124681] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3 [13355.124682] Krnl GPRS: 0000000000000000 00550e00014080c0 0000000000000000 0000000000000001 [13355.124684] 000000000029d8b6 00000000f3bfd710 0000000000000000 00000000014080c0 [13355.124685] 0000000000000009 00000000ec277a00 0000000000200000 0000000000000000 [13355.124686] 0000000000000000 00000000000001ff 000000000029d8b6 0000000084723720 [13355.124708] Krnl Code: 000000000029da6a: a7110200 tmll %r1,512 000000000029da6e: a774ff29 brc 7,29d8c0 #000000000029da72: a7f40001 brc 15,29da74 >000000000029da76: a7f4ff25 brc 15,29d8c0 000000000029da7a: a7380000 lhi %r3,0 000000000029da7e: a7f4fef1 brc 15,29d860 000000000029da82: 5820f0c4 l %r2,196(%r15) 000000000029da86: a53e0048 llilh %r3,72 [13355.124720] Call Trace: [13355.124722] ([<000000000029d8b6>] __alloc_pages_nodemask+0xfe/0x10c0) [13355.124724] [<000000000013bd1e>] s390_dma_alloc+0x6e/0x148 [13355.124733] [<000003ff802eeba6>] mlx5_dma_zalloc_coherent_node+0x8e/0xe0 [mlx5_core] [13355.124740] [<000003ff802eee18>] mlx5_buf_alloc_node+0x70/0x108 [mlx5_core] [13355.124744] [<000003ff804eb410>] mlx5_ib_create_cq+0x558/0x898 [mlx5_ib] [13355.124749] [<000003ff80407d40>] ib_create_cq+0x48/0x88 [ib_core] [13355.124751] [<000003ff80109fba>] smc_ib_setup_per_ibdev+0x52/0x118 [smc] [13355.124753] [<000003ff8010bcb6>] smc_conn_create+0x65e/0x728 [smc] [13355.124755] [<000003ff801081a2>] smc_listen_work+0x2d2/0x540 [smc] [13355.124756] [<0000000000162c66>] process_one_work+0x1be/0x440 [13355.124758] [<0000000000162f40>] worker_thread+0x58/0x458 [13355.124759] [<0000000000169e7e>] kthread+0x14e/0x168 [13355.124760] [<00000000009ce8be>] kernel_thread_starter+0x6/0xc [13355.124762] [<00000000009ce8b8>] kernel_thread_starter+0x0/0xc [13355.124762] Last Breaking-Event-Address: [13355.124764] [<000000000029da72>] __alloc_pages_nodemask+0x2ba/0x10c0 [13355.124764] ---[ end trace 34be38b581c0b585 ]--- This patch reduces the smc constant for the maximum number of allocated completion queue entries SMC_MAX_CQE by 2 to avoid high round up values in the mlx5 code, and reduces the number of allocated completion queue entries even more, if the final allocation for an mlx5 device hits the MAX_ORDER limit. Reported-by: Ihnken Menssen Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_ib.c | 10 +++++++++- net/smc/smc_wr.h | 1 - 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 2a8957bd6d38..26df554f7588 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -23,6 +23,8 @@ #include "smc_wr.h" #include "smc.h" +#define SMC_MAX_CQE 32766 /* max. # of completion queue elements */ + #define SMC_QP_MIN_RNR_TIMER 5 #define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */ #define SMC_QP_RETRY_CNT 7 /* 7: infinite */ @@ -438,9 +440,15 @@ out: long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) { struct ib_cq_init_attr cqattr = { - .cqe = SMC_WR_MAX_CQE, .comp_vector = 0 }; + .cqe = SMC_MAX_CQE, .comp_vector = 0 }; + int cqe_size_order, smc_order; long rc; + /* the calculated number of cq entries fits to mlx5 cq allocation */ + cqe_size_order = cache_line_size() == 128 ? 7 : 6; + smc_order = MAX_ORDER - cqe_size_order - 1; + if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) + cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, smc_wr_tx_cq_handler, NULL, smcibdev, &cqattr); diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index ef0c3494c9cb..210bec3c3ebe 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -19,7 +19,6 @@ #include "smc.h" #include "smc_core.h" -#define SMC_WR_MAX_CQE 32768 /* max. # of completion queue elements */ #define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) -- cgit v1.2.3-70-g09d2 From 268ffcc4ebfc8b10c1502357dfb82ce6af0770ac Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 14 Mar 2018 11:01:01 +0100 Subject: net/smc: free link group without pending free_work only Make sure there is no pending or running free_work worker for the link group when freeing the link group. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 1 + net/smc/smc_core.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2c6f4e0a9f3d..649489f825a5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1477,6 +1477,7 @@ static void __exit smc_exit(void) spin_unlock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { list_del_init(&lgr->list); + cancel_delayed_work_sync(&lgr->free_work); smc_lgr_free(lgr); /* free link group */ } static_branch_disable(&tcp_have_smc); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index f76f60e463cb..ec6189fe2a48 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -140,7 +140,8 @@ static void smc_lgr_free_work(struct work_struct *work) list_del_init(&lgr->list); /* remove from smc_lgr_list */ free: spin_unlock_bh(&smc_lgr_list.lock); - smc_lgr_free(lgr); + if (!delayed_work_pending(&lgr->free_work)) + smc_lgr_free(lgr); } /* create a new SMC link group */ -- cgit v1.2.3-70-g09d2 From 97cdbc4213df101228564b8d27090cc1f9a26332 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 14 Mar 2018 11:01:02 +0100 Subject: net/smc: schedule free_work when link group is terminated The free_work worker must be scheduled when the link group is abnormally terminated. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_core.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index ec6189fe2a48..f44f6803f7ff 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -32,6 +32,17 @@ static u32 smc_lgr_num; /* unique link group number */ +static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) +{ + /* client link group creation always follows the server link group + * creation. For client use a somewhat higher removal delay time, + * otherwise there is a risk of out-of-sync link groups. + */ + mod_delayed_work(system_wq, &lgr->free_work, + lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT : + SMC_LGR_FREE_DELAY_SERV); +} + /* Register connection's alert token in our lookup structure. * To use rbtrees we have to implement our own insert core. * Requires @conns_lock @@ -111,13 +122,7 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) write_unlock_bh(&lgr->conns_lock); if (!reduced || lgr->conns_num) return; - /* client link group creation always follows the server link group - * creation. For client use a somewhat higher removal delay time, - * otherwise there is a risk of out-of-sync link groups. - */ - mod_delayed_work(system_wq, &lgr->free_work, - lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT : - SMC_LGR_FREE_DELAY_SERV); + smc_lgr_schedule_free_work(lgr); } static void smc_lgr_free_work(struct work_struct *work) @@ -344,6 +349,7 @@ void smc_lgr_terminate(struct smc_link_group *lgr) } write_unlock_bh(&lgr->conns_lock); wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); + smc_lgr_schedule_free_work(lgr); } /* Determine vlan of internal TCP socket. -- cgit v1.2.3-70-g09d2 From 1b1e0bc9947427ae58bbe7de0ce9cfd591b589b9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Mar 2018 19:05:30 +0800 Subject: sctp: add refcnt support for sh_key With refcnt support for sh_key, chunks auth sh_keys can be decided before enqueuing it. Changing the active key later will not affect the chunks already enqueued. Furthermore, this is necessary when adding the support for authinfo for sendmsg in next patch. Note that struct sctp_chunk can't be grown due to that performance drop issue on slow cpu, so it just reuses head_skb memory for shkey in sctp_chunk. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/auth.h | 9 +++-- include/net/sctp/sm.h | 3 +- include/net/sctp/structs.h | 9 +++-- net/sctp/auth.c | 86 +++++++++++++++++++++++----------------------- net/sctp/chunk.c | 5 +++ net/sctp/output.c | 18 ++++++++-- net/sctp/sm_make_chunk.c | 15 ++++++-- net/sctp/sm_statefuns.c | 11 +++--- net/sctp/socket.c | 6 ++++ 9 files changed, 104 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h index e5c57d0a082d..017c1aa3a2c9 100644 --- a/include/net/sctp/auth.h +++ b/include/net/sctp/auth.h @@ -62,8 +62,9 @@ struct sctp_auth_bytes { /* Definition for a shared key, weather endpoint or association */ struct sctp_shared_key { struct list_head key_list; - __u16 key_id; struct sctp_auth_bytes *key; + refcount_t refcnt; + __u16 key_id; }; #define key_for_each(__key, __list_head) \ @@ -103,8 +104,10 @@ int sctp_auth_send_cid(enum sctp_cid chunk, int sctp_auth_recv_cid(enum sctp_cid chunk, const struct sctp_association *asoc); void sctp_auth_calculate_hmac(const struct sctp_association *asoc, - struct sk_buff *skb, - struct sctp_auth_chunk *auth, gfp_t gfp); + struct sk_buff *skb, struct sctp_auth_chunk *auth, + struct sctp_shared_key *ep_key, gfp_t gfp); +void sctp_auth_shkey_release(struct sctp_shared_key *sh_key); +void sctp_auth_shkey_hold(struct sctp_shared_key *sh_key); /* API Helpers */ int sctp_auth_ep_add_chunkid(struct sctp_endpoint *ep, __u8 chunk_id); diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 2883c43c5258..2d0e782c9055 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -263,7 +263,8 @@ int sctp_process_asconf_ack(struct sctp_association *asoc, struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_fwdtsn_skip *skiplist); -struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc); +struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc, + __u16 key_id); struct sctp_chunk *sctp_make_strreset_req(const struct sctp_association *asoc, __u16 stream_num, __be16 *stream_list, bool out, bool in); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index ec6e46b7e119..49ad67bbdbb5 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -577,8 +577,12 @@ struct sctp_chunk { /* This points to the sk_buff containing the actual data. */ struct sk_buff *skb; - /* In case of GSO packets, this will store the head one */ - struct sk_buff *head_skb; + union { + /* In case of GSO packets, this will store the head one */ + struct sk_buff *head_skb; + /* In case of auth enabled, this will point to the shkey */ + struct sctp_shared_key *shkey; + }; /* These are the SCTP headers by reverse order in a packet. * Note that some of these may happen more than once. In that @@ -1995,6 +1999,7 @@ struct sctp_association { * The current generated assocaition shared key (secret) */ struct sctp_auth_bytes *asoc_shared_key; + struct sctp_shared_key *shkey; /* SCTP AUTH: hmac id of the first peer requested algorithm * that we support. diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 00667c50efa7..e5214fd7f650 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -101,13 +101,14 @@ struct sctp_shared_key *sctp_auth_shkey_create(__u16 key_id, gfp_t gfp) return NULL; INIT_LIST_HEAD(&new->key_list); + refcount_set(&new->refcnt, 1); new->key_id = key_id; return new; } /* Free the shared key structure */ -static void sctp_auth_shkey_free(struct sctp_shared_key *sh_key) +static void sctp_auth_shkey_destroy(struct sctp_shared_key *sh_key) { BUG_ON(!list_empty(&sh_key->key_list)); sctp_auth_key_put(sh_key->key); @@ -115,6 +116,17 @@ static void sctp_auth_shkey_free(struct sctp_shared_key *sh_key) kfree(sh_key); } +void sctp_auth_shkey_release(struct sctp_shared_key *sh_key) +{ + if (refcount_dec_and_test(&sh_key->refcnt)) + sctp_auth_shkey_destroy(sh_key); +} + +void sctp_auth_shkey_hold(struct sctp_shared_key *sh_key) +{ + refcount_inc(&sh_key->refcnt); +} + /* Destroy the entire key list. This is done during the * associon and endpoint free process. */ @@ -128,7 +140,7 @@ void sctp_auth_destroy_keys(struct list_head *keys) key_for_each_safe(ep_key, tmp, keys) { list_del_init(&ep_key->key_list); - sctp_auth_shkey_free(ep_key); + sctp_auth_shkey_release(ep_key); } } @@ -409,13 +421,19 @@ int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp) sctp_auth_key_put(asoc->asoc_shared_key); asoc->asoc_shared_key = secret; + asoc->shkey = ep_key; /* Update send queue in case any chunk already in there now * needs authenticating */ list_for_each_entry(chunk, &asoc->outqueue.out_chunk_list, list) { - if (sctp_auth_send_cid(chunk->chunk_hdr->type, asoc)) + if (sctp_auth_send_cid(chunk->chunk_hdr->type, asoc)) { chunk->auth = 1; + if (!chunk->shkey) { + chunk->shkey = asoc->shkey; + sctp_auth_shkey_hold(chunk->shkey); + } + } } return 0; @@ -703,16 +721,15 @@ int sctp_auth_recv_cid(enum sctp_cid chunk, const struct sctp_association *asoc) * after the AUTH chunk in the SCTP packet. */ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, - struct sk_buff *skb, - struct sctp_auth_chunk *auth, - gfp_t gfp) + struct sk_buff *skb, struct sctp_auth_chunk *auth, + struct sctp_shared_key *ep_key, gfp_t gfp) { - struct crypto_shash *tfm; struct sctp_auth_bytes *asoc_key; + struct crypto_shash *tfm; __u16 key_id, hmac_id; - __u8 *digest; unsigned char *end; int free_key = 0; + __u8 *digest; /* Extract the info we need: * - hmac id @@ -724,12 +741,7 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, if (key_id == asoc->active_key_id) asoc_key = asoc->asoc_shared_key; else { - struct sctp_shared_key *ep_key; - - ep_key = sctp_auth_get_shkey(asoc, key_id); - if (!ep_key) - return; - + /* ep_key can't be NULL here */ asoc_key = sctp_auth_asoc_create_secret(asoc, ep_key, gfp); if (!asoc_key) return; @@ -829,7 +841,7 @@ int sctp_auth_set_key(struct sctp_endpoint *ep, struct sctp_association *asoc, struct sctp_authkey *auth_key) { - struct sctp_shared_key *cur_key = NULL; + struct sctp_shared_key *cur_key, *shkey; struct sctp_auth_bytes *key; struct list_head *sh_keys; int replace = 0; @@ -842,46 +854,34 @@ int sctp_auth_set_key(struct sctp_endpoint *ep, else sh_keys = &ep->endpoint_shared_keys; - key_for_each(cur_key, sh_keys) { - if (cur_key->key_id == auth_key->sca_keynumber) { + key_for_each(shkey, sh_keys) { + if (shkey->key_id == auth_key->sca_keynumber) { replace = 1; break; } } - /* If we are not replacing a key id, we need to allocate - * a shared key. - */ - if (!replace) { - cur_key = sctp_auth_shkey_create(auth_key->sca_keynumber, - GFP_KERNEL); - if (!cur_key) - return -ENOMEM; - } + cur_key = sctp_auth_shkey_create(auth_key->sca_keynumber, GFP_KERNEL); + if (!cur_key) + return -ENOMEM; /* Create a new key data based on the info passed in */ key = sctp_auth_create_key(auth_key->sca_keylength, GFP_KERNEL); - if (!key) - goto nomem; + if (!key) { + kfree(cur_key); + return -ENOMEM; + } memcpy(key->data, &auth_key->sca_key[0], auth_key->sca_keylength); + cur_key->key = key; - /* If we are replacing, remove the old keys data from the - * key id. If we are adding new key id, add it to the - * list. - */ - if (replace) - sctp_auth_key_put(cur_key->key); - else - list_add(&cur_key->key_list, sh_keys); + if (replace) { + list_del_init(&shkey->key_list); + sctp_auth_shkey_release(shkey); + } + list_add(&cur_key->key_list, sh_keys); - cur_key->key = key; return 0; -nomem: - if (!replace) - sctp_auth_shkey_free(cur_key); - - return -ENOMEM; } int sctp_auth_set_active_key(struct sctp_endpoint *ep, @@ -952,7 +952,7 @@ int sctp_auth_del_key_id(struct sctp_endpoint *ep, /* Delete the shared key */ list_del_init(&key->key_list); - sctp_auth_shkey_free(key); + sctp_auth_shkey_release(key); return 0; } diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 991a530c6b31..9f28a9aae976 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -168,6 +168,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, { size_t len, first_len, max_data, remaining; size_t msg_len = iov_iter_count(from); + struct sctp_shared_key *shkey = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_datamsg *msg; @@ -204,6 +205,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, if (hmac_desc) max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + hmac_desc->hmac_len); + + shkey = asoc->shkey; } /* Check what's our max considering the above */ @@ -275,6 +278,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, if (err < 0) goto errout_chunk_free; + chunk->shkey = shkey; + /* Put the chunk->skb back into the form expected by send. */ __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - chunk->skb->data); diff --git a/net/sctp/output.c b/net/sctp/output.c index 01a26ee051e3..d6e1c90cc09a 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -241,10 +241,13 @@ static enum sctp_xmit sctp_packet_bundle_auth(struct sctp_packet *pkt, if (!chunk->auth) return retval; - auth = sctp_make_auth(asoc); + auth = sctp_make_auth(asoc, chunk->shkey->key_id); if (!auth) return retval; + auth->shkey = chunk->shkey; + sctp_auth_shkey_hold(auth->shkey); + retval = __sctp_packet_append_chunk(pkt, auth); if (retval != SCTP_XMIT_OK) @@ -490,7 +493,8 @@ merge: } if (auth) { - sctp_auth_calculate_hmac(tp->asoc, nskb, auth, gfp); + sctp_auth_calculate_hmac(tp->asoc, nskb, auth, + packet->auth->shkey, gfp); /* free auth if no more chunks, or add it back */ if (list_empty(&packet->chunk_list)) sctp_chunk_free(packet->auth); @@ -770,6 +774,16 @@ static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, enum sctp_xmit retval = SCTP_XMIT_OK; size_t psize, pmtu, maxsize; + /* Don't bundle in this packet if this chunk's auth key doesn't + * match other chunks already enqueued on this packet. Also, + * don't bundle the chunk with auth key if other chunks in this + * packet don't have auth key. + */ + if ((packet->auth && chunk->shkey != packet->auth->shkey) || + (!packet->auth && chunk->shkey && + chunk->chunk_hdr->type != SCTP_CID_AUTH)) + return SCTP_XMIT_PMTU_FULL; + psize = packet->size; if (packet->transport->asoc) pmtu = packet->transport->asoc->pathmtu; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index d01475f5f710..10f071cdf188 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -87,7 +87,10 @@ static void *sctp_addto_chunk_fixed(struct sctp_chunk *, int len, /* Control chunk destructor */ static void sctp_control_release_owner(struct sk_buff *skb) { - /*TODO: do memory release */ + struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg; + + if (chunk->shkey) + sctp_auth_shkey_release(chunk->shkey); } static void sctp_control_set_owner_w(struct sctp_chunk *chunk) @@ -102,7 +105,12 @@ static void sctp_control_set_owner_w(struct sctp_chunk *chunk) * * For now don't do anything for now. */ + if (chunk->auth) { + chunk->shkey = asoc->shkey; + sctp_auth_shkey_hold(chunk->shkey); + } skb->sk = asoc ? asoc->base.sk : NULL; + skb_shinfo(skb)->destructor_arg = chunk; skb->destructor = sctp_control_release_owner; } @@ -1271,7 +1279,8 @@ nodata: return retval; } -struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc) +struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc, + __u16 key_id) { struct sctp_authhdr auth_hdr; struct sctp_hmac *hmac_desc; @@ -1289,7 +1298,7 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc) return NULL; auth_hdr.hmac_id = htons(hmac_desc->hmac_id); - auth_hdr.shkey_id = htons(asoc->active_key_id); + auth_hdr.shkey_id = htons(key_id); retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(auth_hdr), &auth_hdr); diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index eb7905ffe5f2..792e0e2be320 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4114,6 +4114,7 @@ static enum sctp_ierror sctp_sf_authenticate( const union sctp_subtype type, struct sctp_chunk *chunk) { + struct sctp_shared_key *sh_key = NULL; struct sctp_authhdr *auth_hdr; __u8 *save_digest, *digest; struct sctp_hmac *hmac; @@ -4135,9 +4136,11 @@ static enum sctp_ierror sctp_sf_authenticate( * configured */ key_id = ntohs(auth_hdr->shkey_id); - if (key_id != asoc->active_key_id && !sctp_auth_get_shkey(asoc, key_id)) - return SCTP_IERROR_AUTH_BAD_KEYID; - + if (key_id != asoc->active_key_id) { + sh_key = sctp_auth_get_shkey(asoc, key_id); + if (!sh_key) + return SCTP_IERROR_AUTH_BAD_KEYID; + } /* Make sure that the length of the signature matches what * we expect. @@ -4166,7 +4169,7 @@ static enum sctp_ierror sctp_sf_authenticate( sctp_auth_calculate_hmac(asoc, chunk->skb, (struct sctp_auth_chunk *)chunk->chunk_hdr, - GFP_ATOMIC); + sh_key, GFP_ATOMIC); /* Discard the packet if the digests do not match */ if (memcmp(save_digest, digest, sig_len)) { diff --git a/net/sctp/socket.c b/net/sctp/socket.c index af5cf29b0c65..003a4ad89c01 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -156,6 +156,9 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk) /* The sndbuf space is tracked per association. */ sctp_association_hold(asoc); + if (chunk->shkey) + sctp_auth_shkey_hold(chunk->shkey); + skb_set_owner_w(chunk->skb, sk); chunk->skb->destructor = sctp_wfree; @@ -8109,6 +8112,9 @@ static void sctp_wfree(struct sk_buff *skb) sk->sk_wmem_queued -= skb->truesize; sk_mem_uncharge(sk, skb->truesize); + if (chunk->shkey) + sctp_auth_shkey_release(chunk->shkey); + sock_wfree(skb); sctp_wake_up_waiters(sk, asoc); -- cgit v1.2.3-70-g09d2 From 3ff547c06a7d75d72d37dae2c064fcf0672e56c0 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Mar 2018 19:05:31 +0800 Subject: sctp: add support for SCTP AUTH Information for sendmsg This patch is to add support for SCTP AUTH Information for sendmsg, as described in section 5.3.8 of RFC6458. With this option, you can provide shared key identifier used for sending the user message. It's also a necessary send info for sctp_sendv. Note that it reuses sinfo->sinfo_tsn to indicate if this option is set and sinfo->sinfo_ssn to save the shkey ID which can be 0. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + include/uapi/linux/sctp.h | 14 +++++++++++++- net/sctp/chunk.c | 11 ++++++++++- net/sctp/socket.c | 23 +++++++++++++++++++++++ 4 files changed, 47 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 49ad67bbdbb5..012fb3e2f4cf 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2118,6 +2118,7 @@ struct sctp_cmsgs { struct sctp_sndrcvinfo *srinfo; struct sctp_sndinfo *sinfo; struct sctp_prinfo *prinfo; + struct sctp_authinfo *authinfo; struct msghdr *addrs_msg; }; diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index e94b6d297ad9..47e781ebde05 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -273,6 +273,18 @@ struct sctp_prinfo { __u32 pr_value; }; +/* 5.3.8 SCTP AUTH Information Structure (SCTP_AUTHINFO) + * + * This cmsghdr structure specifies SCTP options for sendmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ ------------------- + * IPPROTO_SCTP SCTP_AUTHINFO struct sctp_authinfo + */ +struct sctp_authinfo { + __u16 auth_keynumber; +}; + /* * sinfo_flags: 16 bits (unsigned integer) * @@ -310,7 +322,7 @@ typedef enum sctp_cmsg_type { #define SCTP_NXTINFO SCTP_NXTINFO SCTP_PRINFO, /* 5.3.7 SCTP PR-SCTP Information Structure */ #define SCTP_PRINFO SCTP_PRINFO - SCTP_AUTHINFO, /* 5.3.8 SCTP AUTH Information Structure (RESERVED) */ + SCTP_AUTHINFO, /* 5.3.8 SCTP AUTH Information Structure */ #define SCTP_AUTHINFO SCTP_AUTHINFO SCTP_DSTADDRV4, /* 5.3.9 SCTP Destination IPv4 Address Structure */ #define SCTP_DSTADDRV4 SCTP_DSTADDRV4 diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 9f28a9aae976..f889a84f264d 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -206,7 +206,16 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + hmac_desc->hmac_len); - shkey = asoc->shkey; + if (sinfo->sinfo_tsn && + sinfo->sinfo_ssn != asoc->active_key_id) { + shkey = sctp_auth_get_shkey(asoc, sinfo->sinfo_ssn); + if (!shkey) { + err = -EINVAL; + goto errout; + } + } else { + shkey = asoc->shkey; + } } /* Check what's our max considering the above */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 003a4ad89c01..9ffdecbc3531 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1987,6 +1987,14 @@ static void sctp_sendmsg_update_sinfo(struct sctp_association *asoc, if (!cmsgs->srinfo && !cmsgs->prinfo) sinfo->sinfo_timetolive = asoc->default_timetolive; + + if (cmsgs->authinfo) { + /* Reuse sinfo_tsn to indicate that authinfo was set and + * sinfo_ssn to save the keyid on tx path. + */ + sinfo->sinfo_tsn = 1; + sinfo->sinfo_ssn = cmsgs->authinfo->auth_keynumber; + } } static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) @@ -7874,6 +7882,21 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs) if (cmsgs->prinfo->pr_policy == SCTP_PR_SCTP_NONE) cmsgs->prinfo->pr_value = 0; break; + case SCTP_AUTHINFO: + /* SCTP Socket API Extension + * 5.3.8 SCTP AUTH Information Structure (SCTP_AUTHINFO) + * + * This cmsghdr structure specifies SCTP options for sendmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ --------------------- + * IPPROTO_SCTP SCTP_AUTHINFO struct sctp_authinfo + */ + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_authinfo))) + return -EINVAL; + + cmsgs->authinfo = CMSG_DATA(cmsg); + break; case SCTP_DSTADDRV4: case SCTP_DSTADDRV6: /* SCTP Socket API Extension -- cgit v1.2.3-70-g09d2 From 601590ec155aadf5daa17a6f63a06d1bba5b5ce9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Mar 2018 19:05:32 +0800 Subject: sctp: add sockopt SCTP_AUTH_DEACTIVATE_KEY This patch is to add sockopt SCTP_AUTH_DEACTIVATE_KEY, as described in section 8.3.4 of RFC6458. This set option indicates that the application will no longer send user messages using the indicated key identifier. Note that RFC requires that only deactivated keys that are no longer used by an association can be deleted, but for the backward compatibility, it is not to check deactivated when deleting or replacing one sh_key. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/auth.h | 12 ++++++------ include/uapi/linux/sctp.h | 1 + net/sctp/auth.c | 46 +++++++++++++++++++++++++++++++++++++++++++--- net/sctp/socket.c | 31 +++++++++++++++++++++++++++++++ 4 files changed, 81 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h index 017c1aa3a2c9..687e7f80037d 100644 --- a/include/net/sctp/auth.h +++ b/include/net/sctp/auth.h @@ -65,6 +65,7 @@ struct sctp_shared_key { struct sctp_auth_bytes *key; refcount_t refcnt; __u16 key_id; + __u8 deactivated; }; #define key_for_each(__key, __list_head) \ @@ -113,14 +114,13 @@ void sctp_auth_shkey_hold(struct sctp_shared_key *sh_key); int sctp_auth_ep_add_chunkid(struct sctp_endpoint *ep, __u8 chunk_id); int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep, struct sctp_hmacalgo *hmacs); -int sctp_auth_set_key(struct sctp_endpoint *ep, - struct sctp_association *asoc, +int sctp_auth_set_key(struct sctp_endpoint *ep, struct sctp_association *asoc, struct sctp_authkey *auth_key); int sctp_auth_set_active_key(struct sctp_endpoint *ep, - struct sctp_association *asoc, - __u16 key_id); + struct sctp_association *asoc, __u16 key_id); int sctp_auth_del_key_id(struct sctp_endpoint *ep, - struct sctp_association *asoc, - __u16 key_id); + struct sctp_association *asoc, __u16 key_id); +int sctp_auth_deact_key_id(struct sctp_endpoint *ep, + struct sctp_association *asoc, __u16 key_id); #endif diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 47e781ebde05..08fc313829f4 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -99,6 +99,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_RECVRCVINFO 32 #define SCTP_RECVNXTINFO 33 #define SCTP_DEFAULT_SNDINFO 34 +#define SCTP_AUTH_DEACTIVATE_KEY 35 /* Internal Socket Options. Some of the sctp library functions are * implemented using these socket options. diff --git a/net/sctp/auth.c b/net/sctp/auth.c index e5214fd7f650..a073123fc485 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -449,8 +449,11 @@ struct sctp_shared_key *sctp_auth_get_shkey( /* First search associations set of endpoint pair shared keys */ key_for_each(key, &asoc->endpoint_shared_keys) { - if (key->key_id == key_id) - return key; + if (key->key_id == key_id) { + if (!key->deactivated) + return key; + break; + } } return NULL; @@ -905,7 +908,7 @@ int sctp_auth_set_active_key(struct sctp_endpoint *ep, } } - if (!found) + if (!found || key->deactivated) return -EINVAL; if (asoc) { @@ -956,3 +959,40 @@ int sctp_auth_del_key_id(struct sctp_endpoint *ep, return 0; } + +int sctp_auth_deact_key_id(struct sctp_endpoint *ep, + struct sctp_association *asoc, __u16 key_id) +{ + struct sctp_shared_key *key; + struct list_head *sh_keys; + int found = 0; + + /* The key identifier MUST NOT be the current active key + * The key identifier MUST correst to an existing key + */ + if (asoc) { + if (asoc->active_key_id == key_id) + return -EINVAL; + + sh_keys = &asoc->endpoint_shared_keys; + } else { + if (ep->active_key_id == key_id) + return -EINVAL; + + sh_keys = &ep->endpoint_shared_keys; + } + + key_for_each(key, sh_keys) { + if (key->key_id == key_id) { + found = 1; + break; + } + } + + if (!found) + return -EINVAL; + + key->deactivated = 1; + + return 0; +} diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9ffdecbc3531..65cc354c520f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3646,6 +3646,33 @@ static int sctp_setsockopt_del_key(struct sock *sk, } +/* + * 8.3.4 Deactivate a Shared Key (SCTP_AUTH_DEACTIVATE_KEY) + * + * This set option will deactivate a shared secret key. + */ +static int sctp_setsockopt_deactivate_key(struct sock *sk, char __user *optval, + unsigned int optlen) +{ + struct sctp_endpoint *ep = sctp_sk(sk)->ep; + struct sctp_authkeyid val; + struct sctp_association *asoc; + + if (!ep->auth_enable) + return -EACCES; + + if (optlen != sizeof(struct sctp_authkeyid)) + return -EINVAL; + if (copy_from_user(&val, optval, optlen)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, val.scact_assoc_id); + if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + return sctp_auth_deact_key_id(ep, asoc, val.scact_keynumber); +} + /* * 8.1.23 SCTP_AUTO_ASCONF * @@ -4238,6 +4265,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_AUTH_DELETE_KEY: retval = sctp_setsockopt_del_key(sk, optval, optlen); break; + case SCTP_AUTH_DEACTIVATE_KEY: + retval = sctp_setsockopt_deactivate_key(sk, optval, optlen); + break; case SCTP_AUTO_ASCONF: retval = sctp_setsockopt_auto_asconf(sk, optval, optlen); break; @@ -7212,6 +7242,7 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, case SCTP_AUTH_KEY: case SCTP_AUTH_CHUNK: case SCTP_AUTH_DELETE_KEY: + case SCTP_AUTH_DEACTIVATE_KEY: retval = -EOPNOTSUPP; break; case SCTP_HMAC_IDENT: -- cgit v1.2.3-70-g09d2 From ec2e506c680deaa8e1a087986db6d73ba63a04be Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Mar 2018 19:05:33 +0800 Subject: sctp: add SCTP_AUTH_FREE_KEY type for AUTHENTICATION_EVENT This patch is to add SCTP_AUTH_FREE_KEY type for AUTHENTICATION_EVENT, as described in section 6.1.8 of RFC6458. SCTP_AUTH_FREE_KEY: This report indicates that the SCTP implementation will no longer use the key identifier specified in auth_keynumber. After deactivating a key, it would never be used again, which means it's refcnt can't be held/increased by new chunks. But there may be some chunks in out queue still using it. So only when refcnt is 1, which means no chunk in outqueue is using/holding this key either, this EVENT would be sent. When users receive this notification, they could do DEL_KEY sockopt to remove this shkey, and also tell the peer that this key won't be used in any chunk thoroughly from now on, then the peer can remove it as well safely. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 6 +++++- net/sctp/auth.c | 14 ++++++++++++++ net/sctp/sm_make_chunk.c | 20 +++++++++++++++++++- net/sctp/sm_statefuns.c | 2 +- net/sctp/socket.c | 19 ++++++++++++++++++- 5 files changed, 57 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 08fc313829f4..18ebbfeee4af 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -518,7 +518,11 @@ struct sctp_authkey_event { sctp_assoc_t auth_assoc_id; }; -enum { SCTP_AUTH_NEWKEY = 0, }; +enum { + SCTP_AUTH_NEW_KEY, +#define SCTP_AUTH_NEWKEY SCTP_AUTH_NEW_KEY /* compatible with before */ + SCTP_AUTH_FREE_KEY, +}; /* * 6.1.9. SCTP_SENDER_DRY_EVENT diff --git a/net/sctp/auth.c b/net/sctp/auth.c index a073123fc485..e64630cd3331 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -992,6 +992,20 @@ int sctp_auth_deact_key_id(struct sctp_endpoint *ep, if (!found) return -EINVAL; + /* refcnt == 1 and !list_empty mean it's not being used anywhere + * and deactivated will be set, so it's time to notify userland + * that this shkey can be freed. + */ + if (asoc && !list_empty(&key->key_list) && + refcount_read(&key->refcnt) == 1) { + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_authkey(asoc, key->key_id, + SCTP_AUTH_FREE_KEY, GFP_KERNEL); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); + } + key->deactivated = 1; return 0; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 10f071cdf188..cc20bc39ee7c 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -89,8 +89,26 @@ static void sctp_control_release_owner(struct sk_buff *skb) { struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg; - if (chunk->shkey) + if (chunk->shkey) { + struct sctp_shared_key *shkey = chunk->shkey; + struct sctp_association *asoc = chunk->asoc; + + /* refcnt == 2 and !list_empty mean after this release, it's + * not being used anywhere, and it's time to notify userland + * that this shkey can be freed if it's been deactivated. + */ + if (shkey->deactivated && !list_empty(&shkey->key_list) && + refcount_read(&shkey->refcnt) == 2) { + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id, + SCTP_AUTH_FREE_KEY, + GFP_KERNEL); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); + } sctp_auth_shkey_release(chunk->shkey); + } } static void sctp_control_set_owner_w(struct sctp_chunk *chunk) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 792e0e2be320..1e41dee70b51 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4246,7 +4246,7 @@ enum sctp_disposition sctp_sf_eat_auth(struct net *net, struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, ntohs(auth_hdr->shkey_id), - SCTP_AUTH_NEWKEY, GFP_ATOMIC); + SCTP_AUTH_NEW_KEY, GFP_ATOMIC); if (!ev) return -ENOMEM; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 65cc354c520f..aeecdd620c45 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -8166,8 +8166,25 @@ static void sctp_wfree(struct sk_buff *skb) sk->sk_wmem_queued -= skb->truesize; sk_mem_uncharge(sk, skb->truesize); - if (chunk->shkey) + if (chunk->shkey) { + struct sctp_shared_key *shkey = chunk->shkey; + + /* refcnt == 2 and !list_empty mean after this release, it's + * not being used anywhere, and it's time to notify userland + * that this shkey can be freed if it's been deactivated. + */ + if (shkey->deactivated && !list_empty(&shkey->key_list) && + refcount_read(&shkey->refcnt) == 2) { + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id, + SCTP_AUTH_FREE_KEY, + GFP_KERNEL); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); + } sctp_auth_shkey_release(chunk->shkey); + } sock_wfree(skb); sctp_wake_up_waiters(sk, asoc); -- cgit v1.2.3-70-g09d2 From 30f6ebf65bc46161c5aaff1db2e6e7c76aa4a06b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 14 Mar 2018 19:05:34 +0800 Subject: sctp: add SCTP_AUTH_NO_AUTH type for AUTHENTICATION_EVENT This patch is to add SCTP_AUTH_NO_AUTH type for AUTHENTICATION_EVENT, as described in section 6.1.8 of RFC6458. SCTP_AUTH_NO_AUTH: This report indicates that the peer does not support SCTP authentication as defined in [RFC4895]. Note that the implementation is quite similar as that of SCTP_ADAPTATION_INDICATION. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/command.h | 1 + include/uapi/linux/sctp.h | 1 + net/sctp/sm_sideeffect.c | 13 +++++++++++++ net/sctp/sm_statefuns.c | 43 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 56 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index b55c6a48a206..6640f84fe536 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -100,6 +100,7 @@ enum sctp_verb { SCTP_CMD_SET_SK_ERR, /* Set sk_err */ SCTP_CMD_ASSOC_CHANGE, /* generate and send assoc_change event */ SCTP_CMD_ADAPTATION_IND, /* generate and send adaptation event */ + SCTP_CMD_PEER_NO_AUTH, /* generate and send authentication event */ SCTP_CMD_ASSOC_SHKEY, /* generate the association shared keys */ SCTP_CMD_T1_RETRAN, /* Mark for retransmission after T1 timeout */ SCTP_CMD_UPDATE_INITTAG, /* Update peer inittag */ diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 18ebbfeee4af..afd4346386e0 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -522,6 +522,7 @@ enum { SCTP_AUTH_NEW_KEY, #define SCTP_AUTH_NEWKEY SCTP_AUTH_NEW_KEY /* compatible with before */ SCTP_AUTH_FREE_KEY, + SCTP_AUTH_NO_AUTH, }; /* diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index b71e7fb0a20a..298112ca8c06 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1049,6 +1049,16 @@ static void sctp_cmd_assoc_change(struct sctp_cmd_seq *commands, asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } +static void sctp_cmd_peer_no_auth(struct sctp_cmd_seq *commands, + struct sctp_association *asoc) +{ + struct sctp_ulpevent *ev; + + ev = sctp_ulpevent_make_authkey(asoc, 0, SCTP_AUTH_NO_AUTH, GFP_ATOMIC); + if (ev) + asoc->stream.si->enqueue_event(&asoc->ulpq, ev); +} + /* Helper function to generate an adaptation indication event */ static void sctp_cmd_adaptation_ind(struct sctp_cmd_seq *commands, struct sctp_association *asoc) @@ -1755,6 +1765,9 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, case SCTP_CMD_ADAPTATION_IND: sctp_cmd_adaptation_ind(commands, asoc); break; + case SCTP_CMD_PEER_NO_AUTH: + sctp_cmd_peer_no_auth(commands, asoc); + break; case SCTP_CMD_ASSOC_SHKEY: error = sctp_auth_asoc_init_active_key(asoc, diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 1e41dee70b51..cc56a67dbb4d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -659,7 +659,7 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, void *arg, struct sctp_cmd_seq *commands) { - struct sctp_ulpevent *ev, *ai_ev = NULL; + struct sctp_ulpevent *ev, *ai_ev = NULL, *auth_ev = NULL; struct sctp_association *new_asoc; struct sctp_init_chunk *peer_init; struct sctp_chunk *chunk = arg; @@ -820,6 +820,14 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, goto nomem_aiev; } + if (!new_asoc->peer.auth_capable) { + auth_ev = sctp_ulpevent_make_authkey(new_asoc, 0, + SCTP_AUTH_NO_AUTH, + GFP_ATOMIC); + if (!auth_ev) + goto nomem_authev; + } + /* Add all the state machine commands now since we've created * everything. This way we don't introduce memory corruptions * during side-effect processing and correclty count established @@ -847,8 +855,14 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ai_ev)); + if (auth_ev) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(auth_ev)); + return SCTP_DISPOSITION_CONSUME; +nomem_authev: + sctp_ulpevent_free(ai_ev); nomem_aiev: sctp_ulpevent_free(ev); nomem_ev: @@ -953,6 +967,15 @@ enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net, SCTP_ULPEVENT(ev)); } + if (!asoc->peer.auth_capable) { + ev = sctp_ulpevent_make_authkey(asoc, 0, SCTP_AUTH_NO_AUTH, + GFP_ATOMIC); + if (!ev) + goto nomem; + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ev)); + } + return SCTP_DISPOSITION_CONSUME; nomem: return SCTP_DISPOSITION_NOMEM; @@ -1908,6 +1931,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_b( if (asoc->peer.adaptation_ind) sctp_add_cmd_sf(commands, SCTP_CMD_ADAPTATION_IND, SCTP_NULL()); + if (!asoc->peer.auth_capable) + sctp_add_cmd_sf(commands, SCTP_CMD_PEER_NO_AUTH, SCTP_NULL()); + return SCTP_DISPOSITION_CONSUME; nomem: @@ -1954,7 +1980,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_d( struct sctp_cmd_seq *commands, struct sctp_association *new_asoc) { - struct sctp_ulpevent *ev = NULL, *ai_ev = NULL; + struct sctp_ulpevent *ev = NULL, *ai_ev = NULL, *auth_ev = NULL; struct sctp_chunk *repl; /* Clarification from Implementor's Guide: @@ -2001,6 +2027,14 @@ static enum sctp_disposition sctp_sf_do_dupcook_d( goto nomem; } + + if (!asoc->peer.auth_capable) { + auth_ev = sctp_ulpevent_make_authkey(asoc, 0, + SCTP_AUTH_NO_AUTH, + GFP_ATOMIC); + if (!auth_ev) + goto nomem; + } } repl = sctp_make_cookie_ack(new_asoc, chunk); @@ -2015,10 +2049,15 @@ static enum sctp_disposition sctp_sf_do_dupcook_d( if (ai_ev) sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ai_ev)); + if (auth_ev) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(auth_ev)); return SCTP_DISPOSITION_CONSUME; nomem: + if (auth_ev) + sctp_ulpevent_free(auth_ev); if (ai_ev) sctp_ulpevent_free(ai_ev); if (ev) -- cgit v1.2.3-70-g09d2 From 0c3d5a96d5e5e6d5662d335a3bdfb76f35433f18 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 12 Mar 2018 08:07:12 -0700 Subject: net: drivers/net: Remove unnecessary skb_copy_expand OOM messages skb_copy_expand without __GFP_NOWARN already does a dump_stack on OOM so these messages are redundant. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/qca_spi.c | 1 - drivers/net/usb/lg-vl600.c | 6 +----- drivers/net/wimax/i2400m/usb-rx.c | 3 --- drivers/net/wireless/ti/wl1251/tx.c | 4 +--- drivers/usb/gadget/function/f_eem.c | 1 - net/mac80211/rx.c | 5 +---- net/netfilter/nfnetlink_queue.c | 5 +---- 7 files changed, 4 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c index 9c236298fe21..5803cd6db406 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.c +++ b/drivers/net/ethernet/qualcomm/qca_spi.c @@ -705,7 +705,6 @@ qcaspi_netdev_xmit(struct sk_buff *skb, struct net_device *dev) tskb = skb_copy_expand(skb, QCAFRM_HEADER_LEN, QCAFRM_FOOTER_LEN + pad_len, GFP_ATOMIC); if (!tskb) { - netdev_dbg(qca->net_dev, "could not allocate tx_buff\n"); qca->stats.out_of_mem++; return NETDEV_TX_BUSY; } diff --git a/drivers/net/usb/lg-vl600.c b/drivers/net/usb/lg-vl600.c index dbabd7ca5268..257916f172cd 100644 --- a/drivers/net/usb/lg-vl600.c +++ b/drivers/net/usb/lg-vl600.c @@ -157,12 +157,8 @@ static int vl600_rx_fixup(struct usbnet *dev, struct sk_buff *skb) s->current_rx_buf = skb_copy_expand(skb, 0, le32_to_cpup(&frame->len), GFP_ATOMIC); - if (!s->current_rx_buf) { - netif_err(dev, ifup, dev->net, "Reserving %i bytes " - "for packet assembly failed.\n", - le32_to_cpup(&frame->len)); + if (!s->current_rx_buf) dev->net->stats.rx_errors++; - } return 0; } diff --git a/drivers/net/wimax/i2400m/usb-rx.c b/drivers/net/wimax/i2400m/usb-rx.c index b78ee676e102..5b64bda7d9e7 100644 --- a/drivers/net/wimax/i2400m/usb-rx.c +++ b/drivers/net/wimax/i2400m/usb-rx.c @@ -263,9 +263,6 @@ retry: new_skb = skb_copy_expand(rx_skb, 0, rx_size - rx_skb->len, GFP_KERNEL); if (new_skb == NULL) { - if (printk_ratelimit()) - dev_err(dev, "RX: Can't reallocate skb to %d; " - "RX dropped\n", rx_size); kfree_skb(rx_skb); rx_skb = NULL; goto out; /* drop it...*/ diff --git a/drivers/net/wireless/ti/wl1251/tx.c b/drivers/net/wireless/ti/wl1251/tx.c index de2fa6705574..12ed14ebc307 100644 --- a/drivers/net/wireless/ti/wl1251/tx.c +++ b/drivers/net/wireless/ti/wl1251/tx.c @@ -221,10 +221,8 @@ static int wl1251_tx_send_packet(struct wl1251 *wl, struct sk_buff *skb, struct sk_buff *newskb = skb_copy_expand(skb, 0, 3, GFP_KERNEL); - if (unlikely(newskb == NULL)) { - wl1251_error("Can't allocate skb!"); + if (unlikely(newskb == NULL)) return -EINVAL; - } tx_hdr = (struct tx_double_buffer_desc *) newskb->data; diff --git a/drivers/usb/gadget/function/f_eem.c b/drivers/usb/gadget/function/f_eem.c index 37557651b600..c13befa31110 100644 --- a/drivers/usb/gadget/function/f_eem.c +++ b/drivers/usb/gadget/function/f_eem.c @@ -507,7 +507,6 @@ static int eem_unwrap(struct gether *port, 0, GFP_ATOMIC); if (unlikely(!skb3)) { - DBG(cdev, "unable to realign EEM packet\n"); dev_kfree_skb_any(skb2); continue; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index d01743234cf6..9c898a3688c6 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2549,11 +2549,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) fwd_skb = skb_copy_expand(skb, local->tx_headroom + sdata->encrypt_headroom, 0, GFP_ATOMIC); - if (!fwd_skb) { - net_info_ratelimited("%s: failed to clone mesh frame\n", - sdata->name); + if (!fwd_skb) goto out; - } fwd_hdr = (struct ieee80211_hdr *) fwd_skb->data; fwd_hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_RETRY); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 59e2833c17f1..9f572ed56208 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -833,11 +833,8 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) if (diff > skb_tailroom(e->skb)) { nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), diff, GFP_ATOMIC); - if (!nskb) { - printk(KERN_WARNING "nf_queue: OOM " - "in mangle, dropping packet\n"); + if (!nskb) return -ENOMEM; - } kfree_skb(e->skb); e->skb = nskb; } -- cgit v1.2.3-70-g09d2 From 0aee4c259849099cb07ead6cd7fff74e561d5225 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 12 Mar 2018 14:15:25 -0400 Subject: sctp: Fix double free in sctp_sendmsg_to_asoc syzbot/kasan detected a double free in sctp_sendmsg_to_asoc: BUG: KASAN: use-after-free in sctp_association_free+0x7b7/0x930 net/sctp/associola.c:332 Read of size 8 at addr ffff8801d8006ae0 by task syzkaller914861/4202 CPU: 1 PID: 4202 Comm: syzkaller914861 Not tainted 4.16.0-rc4+ #258 Hardware name: Google Google Compute Engine/Google Compute Engine 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x24d lib/dump_stack.c:53 print_address_description+0x73/0x250 mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report+0x23c/0x360 mm/kasan/report.c:412 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433 sctp_association_free+0x7b7/0x930 net/sctp/associola.c:332 sctp_sendmsg+0xc67/0x1a80 net/sctp/socket.c:2075 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 sock_sendmsg_nosec net/socket.c:629 [inline] sock_sendmsg+0xca/0x110 net/socket.c:639 SYSC_sendto+0x361/0x5c0 net/socket.c:1748 SyS_sendto+0x40/0x50 net/socket.c:1716 do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 This was introduced by commit: f84af33 sctp: factor out sctp_sendmsg_to_asoc from sctp_sendmsg As the newly refactored function moved the wait_for_sndbuf call to a point after the association was connected, allowing for peeloff events to occur, which in turn caused wait_for_sndbuf to return -EPIPE which was not caught by the logic that determines if an association should be freed or not. Fix it the easy way by returning the ordering of sctp_primitive_ASSOCIATE and sctp_wait_for_sndbuf to the old order, to ensure that EPIPE will not happen. Tested by myself using the syzbot reproducers with positive results Signed-off-by: Neil Horman CC: davem@davemloft.net CC: Xin Long Reported-by: syzbot+a4e4112c3aff00c8cfd8@syzkaller.appspotmail.com Reviewed-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index aeecdd620c45..7a10ae3c3d82 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1883,6 +1883,19 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, goto err; } + if (asoc->pmtu_pending) + sctp_assoc_pending_pmtu(asoc); + + if (sctp_wspace(asoc) < msg_len) + sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); + + if (!sctp_wspace(asoc)) { + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); + if (err) + goto err; + } + if (sctp_state(asoc, CLOSED)) { err = sctp_primitive_ASSOCIATE(net, asoc, NULL); if (err) @@ -1900,19 +1913,6 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, pr_debug("%s: we associated primitively\n", __func__); } - if (asoc->pmtu_pending) - sctp_assoc_pending_pmtu(asoc); - - if (sctp_wspace(asoc) < msg_len) - sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc)); - - if (!sctp_wspace(asoc)) { - timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); - if (err) - goto err; - } - datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter); if (IS_ERR(datamsg)) { err = PTR_ERR(datamsg); -- cgit v1.2.3-70-g09d2 From 650b4eca47399420a1644056c41b66735e250d35 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 12 Mar 2018 17:25:38 +0000 Subject: rxrpc: remove redundant initialization of variable 'len' The variable 'len' is being initialized with a value that is never read and it is re-assigned later, hence the initialization is redundant and can be removed. Cleans up clang warning: net/rxrpc/recvmsg.c:275:15: warning: Value stored to 'len' during its initialization is never read Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/rxrpc/recvmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 9d45d8b56744..7bff716e911e 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -272,7 +272,7 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, unsigned int *_offset, unsigned int *_len) { unsigned int offset = sizeof(struct rxrpc_wire_header); - unsigned int len = *_len; + unsigned int len; int ret; u8 annotation = *_annotation; -- cgit v1.2.3-70-g09d2 From 9fbb704c3385adf5f9adfce7fd3c0bf31aa4da6d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 13 Mar 2018 08:29:36 -0700 Subject: net/ipv6: Refactor gateway validation on route add Move gateway validation code from ip6_route_info_create into ip6_validate_gw. Code move plus adjustments to handle the potential reset of dev and idev and to make checkpatch happy. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv6/route.c | 120 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 66 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 81711e3e2604..23ced851fdb1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2550,7 +2550,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, static int ip6_route_check_nh_onlink(struct net *net, struct fib6_config *cfg, - struct net_device *dev, + const struct net_device *dev, struct netlink_ext_ack *extack) { u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; @@ -2626,6 +2626,68 @@ out: return err; } +static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, + struct net_device **_dev, struct inet6_dev **idev, + struct netlink_ext_ack *extack) +{ + const struct in6_addr *gw_addr = &cfg->fc_gateway; + int gwa_type = ipv6_addr_type(gw_addr); + const struct net_device *dev = *_dev; + int err = -EINVAL; + + /* if gw_addr is local we will fail to detect this in case + * address is still TENTATIVE (DAD in progress). rt6_lookup() + * will return already-added prefix route via interface that + * prefix route was assigned to, which might be non-loopback. + */ + if (ipv6_chk_addr_and_flags(net, gw_addr, + gwa_type & IPV6_ADDR_LINKLOCAL ? + dev : NULL, 0, 0)) { + NL_SET_ERR_MSG(extack, "Invalid gateway address"); + goto out; + } + + if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { + /* IPv6 strictly inhibits using not link-local + * addresses as nexthop address. + * Otherwise, router will not able to send redirects. + * It is very good, but in some (rare!) circumstances + * (SIT, PtP, NBMA NOARP links) it is handy to allow + * some exceptions. --ANK + * We allow IPv4-mapped nexthops to support RFC4798-type + * addressing + */ + if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { + NL_SET_ERR_MSG(extack, "Invalid gateway address"); + goto out; + } + + if (cfg->fc_flags & RTNH_F_ONLINK) + err = ip6_route_check_nh_onlink(net, cfg, dev, extack); + else + err = ip6_route_check_nh(net, cfg, _dev, idev); + + if (err) + goto out; + } + + /* reload in case device was changed */ + dev = *_dev; + + err = -EINVAL; + if (!dev) { + NL_SET_ERR_MSG(extack, "Egress device not specified"); + goto out; + } else if (dev->flags & IFF_LOOPBACK) { + NL_SET_ERR_MSG(extack, + "Egress device can not be loopback device for this route"); + goto out; + } + err = 0; +out: + return err; +} + static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, struct netlink_ext_ack *extack) { @@ -2808,61 +2870,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, } if (cfg->fc_flags & RTF_GATEWAY) { - const struct in6_addr *gw_addr; - int gwa_type; - - gw_addr = &cfg->fc_gateway; - gwa_type = ipv6_addr_type(gw_addr); - - /* if gw_addr is local we will fail to detect this in case - * address is still TENTATIVE (DAD in progress). rt6_lookup() - * will return already-added prefix route via interface that - * prefix route was assigned to, which might be non-loopback. - */ - err = -EINVAL; - if (ipv6_chk_addr_and_flags(net, gw_addr, - gwa_type & IPV6_ADDR_LINKLOCAL ? - dev : NULL, 0, 0)) { - NL_SET_ERR_MSG(extack, "Invalid gateway address"); + err = ip6_validate_gw(net, cfg, &dev, &idev, extack); + if (err) goto out; - } - rt->rt6i_gateway = *gw_addr; - - if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { - /* IPv6 strictly inhibits using not link-local - addresses as nexthop address. - Otherwise, router will not able to send redirects. - It is very good, but in some (rare!) circumstances - (SIT, PtP, NBMA NOARP links) it is handy to allow - some exceptions. --ANK - We allow IPv4-mapped nexthops to support RFC4798-type - addressing - */ - if (!(gwa_type & (IPV6_ADDR_UNICAST | - IPV6_ADDR_MAPPED))) { - NL_SET_ERR_MSG(extack, - "Invalid gateway address"); - goto out; - } - if (cfg->fc_flags & RTNH_F_ONLINK) { - err = ip6_route_check_nh_onlink(net, cfg, dev, - extack); - } else { - err = ip6_route_check_nh(net, cfg, &dev, &idev); - } - if (err) - goto out; - } - err = -EINVAL; - if (!dev) { - NL_SET_ERR_MSG(extack, "Egress device not specified"); - goto out; - } else if (dev->flags & IFF_LOOPBACK) { - NL_SET_ERR_MSG(extack, - "Egress device can not be loopback device for this route"); - goto out; - } + rt->rt6i_gateway = cfg->fc_gateway; } err = -ENODEV; -- cgit v1.2.3-70-g09d2 From 232378e8db4780bc7145d7a0ee47f5f80a41ad6b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 13 Mar 2018 08:29:37 -0700 Subject: net/ipv6: Change address check to always take a device argument ipv6_chk_addr_and_flags determines if an address is a local address and optionally if it is an address on a specific device. For example, it is called by ip6_route_info_create to determine if a given gateway address is a local address. The address check currently does not consider L3 domains and as a result does not allow a route to be added in one VRF if the nexthop points to an address in a second VRF. e.g., $ ip route add 2001:db8:1::/64 vrf r2 via 2001:db8:102::23 Error: Invalid gateway address. where 2001:db8:102::23 is an address on an interface in vrf r1. ipv6_chk_addr_and_flags needs to allow callers to always pass in a device with a separate argument to not limit the address to the specific device. The device is used used to determine the L3 domain of interest. To that end add an argument to skip the device check and update callers to always pass a device where possible and use the new argument to mean any address in the domain. Update a handful of users of ipv6_chk_addr with a NULL dev argument. This patch handles the change to these callers without adding the domain check. ip6_validate_gw needs to handle 2 cases - one where the device is given as part of the nexthop spec and the other where the device is resolved. There is at least 1 VRF case where deferring the check to only after the route lookup has resolved the device fails with an unintuitive error "RTNETLINK answers: No route to host" as opposed to the preferred "Error: Gateway can not be a local address." The 'no route to host' error is because of the fallback to a full lookup. The check is done twice to avoid this error. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/addrconf.h | 4 ++-- net/ipv6/addrconf.c | 11 ++++++++--- net/ipv6/anycast.c | 9 ++++++--- net/ipv6/datagram.c | 5 +++-- net/ipv6/ip6_tunnel.c | 12 ++++++++---- net/ipv6/ndisc.c | 2 +- net/ipv6/route.c | 19 +++++++++++++++---- 7 files changed, 43 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index c4185a7b0e90..132e5b95167a 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -69,8 +69,8 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg); int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict); int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, - const struct net_device *dev, int strict, - u32 banned_flags); + const struct net_device *dev, bool skip_dev_check, + int strict, u32 banned_flags); #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b5fd116c046a..0677b9732d56 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1851,19 +1851,24 @@ static int ipv6_count_addresses(const struct inet6_dev *idev) int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict) { - return ipv6_chk_addr_and_flags(net, addr, dev, strict, IFA_F_TENTATIVE); + return ipv6_chk_addr_and_flags(net, addr, dev, !dev, + strict, IFA_F_TENTATIVE); } EXPORT_SYMBOL(ipv6_chk_addr); int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, - const struct net_device *dev, int strict, - u32 banned_flags) + const struct net_device *dev, bool skip_dev_check, + int strict, u32 banned_flags) { unsigned int hash = inet6_addr_hash(net, addr); struct inet6_ifaddr *ifp; u32 ifp_flags; rcu_read_lock(); + + if (skip_dev_check) + dev = NULL; + hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index c61718dba2e6..d580d4d456a5 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -66,7 +66,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) return -EPERM; if (ipv6_addr_is_multicast(addr)) return -EINVAL; - if (ipv6_chk_addr(net, addr, NULL, 0)) + + if (ifindex) + dev = __dev_get_by_index(net, ifindex); + + if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE)) return -EINVAL; pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); @@ -90,8 +94,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) dev = __dev_get_by_flags(net, IFF_UP, IFF_UP | IFF_LOOPBACK); } - } else - dev = __dev_get_by_index(net, ifindex); + } if (!dev) { err = -ENODEV; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index fbf08ce3f5ab..b27333d7b099 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -801,8 +801,9 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, if (addr_type != IPV6_ADDR_ANY) { int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) && - !ipv6_chk_addr(net, &src_info->ipi6_addr, - strict ? dev : NULL, 0) && + !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr, + dev, !strict, 0, + IFA_F_TENTATIVE) && !ipv6_chk_acast_addr_src(net, dev, &src_info->ipi6_addr)) err = -EINVAL; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 5c045fa407da..456fcf942f95 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -758,9 +758,11 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, ldev = dev_get_by_index_rcu(net, p->link); if ((ipv6_addr_is_multicast(laddr) || - likely(ipv6_chk_addr(net, laddr, ldev, 0))) && + likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false, + 0, IFA_F_TENTATIVE))) && ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || - likely(!ipv6_chk_addr(net, raddr, NULL, 0)))) + likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true, + 0, IFA_F_TENTATIVE)))) ret = 1; } return ret; @@ -990,12 +992,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, if (p->link) ldev = dev_get_by_index_rcu(net, p->link); - if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0))) + if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, + 0, IFA_F_TENTATIVE))) pr_warn("%s xmit: Local address not yet configured!\n", p->name); else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && !ipv6_addr_is_multicast(raddr) && - unlikely(ipv6_chk_addr(net, raddr, NULL, 0))) + unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, + true, 0, IFA_F_TENTATIVE))) pr_warn("%s xmit: Routing loop! Remote address found on this node!\n", p->name); else diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 8af5eef464c1..10024eb0c521 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -707,7 +707,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) int probes = atomic_read(&neigh->probes); if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr, - dev, 1, + dev, false, 1, IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) saddr = &ipv6_hdr(skb)->saddr; probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 23ced851fdb1..939d122e71b4 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2632,7 +2632,9 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, { const struct in6_addr *gw_addr = &cfg->fc_gateway; int gwa_type = ipv6_addr_type(gw_addr); + bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; const struct net_device *dev = *_dev; + bool need_addr_check = !dev; int err = -EINVAL; /* if gw_addr is local we will fail to detect this in case @@ -2640,10 +2642,9 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, * will return already-added prefix route via interface that * prefix route was assigned to, which might be non-loopback. */ - if (ipv6_chk_addr_and_flags(net, gw_addr, - gwa_type & IPV6_ADDR_LINKLOCAL ? - dev : NULL, 0, 0)) { - NL_SET_ERR_MSG(extack, "Invalid gateway address"); + if (dev && + ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { + NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); goto out; } @@ -2683,6 +2684,16 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, "Egress device can not be loopback device for this route"); goto out; } + + /* if we did not check gw_addr above, do so now that the + * egress device has been resolved. + */ + if (need_addr_check && + ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { + NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); + goto out; + } + err = 0; out: return err; -- cgit v1.2.3-70-g09d2 From 1893ff20275b9b9be4892b5b5785788ce45abdea Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 13 Mar 2018 08:29:38 -0700 Subject: net/ipv6: Add l3mdev check to ipv6_chk_addr_and_flags Lookup the L3 master device for the passed in device. Only consider addresses on netdev's with the same master device. If the device is not enslaved or is NULL, then the l3mdev is NULL which means only devices not enslaved (ie, in the default domain) are considered. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0677b9732d56..6fd4bbdc444f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1856,22 +1856,37 @@ int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, } EXPORT_SYMBOL(ipv6_chk_addr); +/* device argument is used to find the L3 domain of interest. If + * skip_dev_check is set, then the ifp device is not checked against + * the passed in dev argument. So the 2 cases for addresses checks are: + * 1. does the address exist in the L3 domain that dev is part of + * (skip_dev_check = true), or + * + * 2. does the address exist on the specific device + * (skip_dev_check = false) + */ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, const struct net_device *dev, bool skip_dev_check, int strict, u32 banned_flags) { unsigned int hash = inet6_addr_hash(net, addr); + const struct net_device *l3mdev; struct inet6_ifaddr *ifp; u32 ifp_flags; rcu_read_lock(); + l3mdev = l3mdev_master_dev_rcu(dev); if (skip_dev_check) dev = NULL; hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { if (!net_eq(dev_net(ifp->idev->dev), net)) continue; + + if (l3mdev_master_dev_rcu(ifp->idev->dev) != l3mdev) + continue; + /* Decouple optimistic from tentative for evaluation here. * Ban optimistic addresses explicitly, when required. */ -- cgit v1.2.3-70-g09d2 From 1e8029515816f771b9b3751f24f19fe6df4c72ae Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Tue, 13 Mar 2018 21:57:16 -0700 Subject: udp: Move the udp sysctl to namespace. This patch moves the udp_rmem_min, udp_wmem_min to namespace and init the udp_l3mdev_accept explicitly. The udp_rmem_min/udp_wmem_min affect udp rx/tx queue, with this patch namespaces can set them differently. Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 3 ++ net/ipv4/sysctl_net_ipv4.c | 32 ++++++++--------- net/ipv4/udp.c | 86 +++++++++++++++++++++++++++------------------- net/ipv6/udp.c | 52 ++++++++++++++-------------- 4 files changed, 96 insertions(+), 77 deletions(-) (limited to 'net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 3a970e429ab6..382bfd7583cf 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -168,6 +168,9 @@ struct netns_ipv4 { atomic_t tfo_active_disable_times; unsigned long tfo_active_disable_stamp; + int sysctl_udp_wmem_min; + int sysctl_udp_rmem_min; + #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; #endif diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 011de9a20ec6..5b72d97693f8 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -520,22 +520,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, - { - .procname = "udp_rmem_min", - .data = &sysctl_udp_rmem_min, - .maxlen = sizeof(sysctl_udp_rmem_min), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one - }, - { - .procname = "udp_wmem_min", - .data = &sysctl_udp_wmem_min, - .maxlen = sizeof(sysctl_udp_wmem_min), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one - }, { } }; @@ -1167,6 +1151,22 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one, }, + { + .procname = "udp_rmem_min", + .data = &init_net.ipv4.sysctl_udp_rmem_min, + .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, + { + .procname = "udp_wmem_min", + .data = &init_net.ipv4.sysctl_udp_wmem_min, + .maxlen = sizeof(init_net.ipv4.sysctl_udp_wmem_min), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, { } }; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3013404d0935..908fc02fb4f8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -122,12 +122,6 @@ EXPORT_SYMBOL(udp_table); long sysctl_udp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_udp_mem); -int sysctl_udp_rmem_min __read_mostly; -EXPORT_SYMBOL(sysctl_udp_rmem_min); - -int sysctl_udp_wmem_min __read_mostly; -EXPORT_SYMBOL(sysctl_udp_wmem_min); - atomic_long_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); @@ -2533,35 +2527,35 @@ int udp_abort(struct sock *sk, int err) EXPORT_SYMBOL_GPL(udp_abort); struct proto udp_prot = { - .name = "UDP", - .owner = THIS_MODULE, - .close = udp_lib_close, - .connect = ip4_datagram_connect, - .disconnect = udp_disconnect, - .ioctl = udp_ioctl, - .init = udp_init_sock, - .destroy = udp_destroy_sock, - .setsockopt = udp_setsockopt, - .getsockopt = udp_getsockopt, - .sendmsg = udp_sendmsg, - .recvmsg = udp_recvmsg, - .sendpage = udp_sendpage, - .release_cb = ip4_datagram_release_cb, - .hash = udp_lib_hash, - .unhash = udp_lib_unhash, - .rehash = udp_v4_rehash, - .get_port = udp_v4_get_port, - .memory_allocated = &udp_memory_allocated, - .sysctl_mem = sysctl_udp_mem, - .sysctl_wmem = &sysctl_udp_wmem_min, - .sysctl_rmem = &sysctl_udp_rmem_min, - .obj_size = sizeof(struct udp_sock), - .h.udp_table = &udp_table, + .name = "UDP", + .owner = THIS_MODULE, + .close = udp_lib_close, + .connect = ip4_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .init = udp_init_sock, + .destroy = udp_destroy_sock, + .setsockopt = udp_setsockopt, + .getsockopt = udp_getsockopt, + .sendmsg = udp_sendmsg, + .recvmsg = udp_recvmsg, + .sendpage = udp_sendpage, + .release_cb = ip4_datagram_release_cb, + .hash = udp_lib_hash, + .unhash = udp_lib_unhash, + .rehash = udp_v4_rehash, + .get_port = udp_v4_get_port, + .memory_allocated = &udp_memory_allocated, + .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), + .obj_size = sizeof(struct udp_sock), + .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udp_setsockopt, - .compat_getsockopt = compat_udp_getsockopt, + .compat_setsockopt = compat_udp_setsockopt, + .compat_getsockopt = compat_udp_getsockopt, #endif - .diag_destroy = udp_abort, + .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); @@ -2831,6 +2825,26 @@ u32 udp_flow_hashrnd(void) } EXPORT_SYMBOL(udp_flow_hashrnd); +static void __udp_sysctl_init(struct net *net) +{ + net->ipv4.sysctl_udp_rmem_min = SK_MEM_QUANTUM; + net->ipv4.sysctl_udp_wmem_min = SK_MEM_QUANTUM; + +#ifdef CONFIG_NET_L3_MASTER_DEV + net->ipv4.sysctl_udp_l3mdev_accept = 0; +#endif +} + +static int __net_init udp_sysctl_init(struct net *net) +{ + __udp_sysctl_init(net); + return 0; +} + +static struct pernet_operations __net_initdata udp_sysctl_ops = { + .init = udp_sysctl_init, +}; + void __init udp_init(void) { unsigned long limit; @@ -2843,8 +2857,7 @@ void __init udp_init(void) sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; - sysctl_udp_rmem_min = SK_MEM_QUANTUM; - sysctl_udp_wmem_min = SK_MEM_QUANTUM; + __udp_sysctl_init(&init_net); /* 16 spinlocks per cpu */ udp_busylocks_log = ilog2(nr_cpu_ids) + 4; @@ -2854,4 +2867,7 @@ void __init udp_init(void) panic("UDP: failed to alloc udp_busylocks\n"); for (i = 0; i < (1U << udp_busylocks_log); i++) spin_lock_init(udp_busylocks + i); + + if (register_pernet_subsys(&udp_sysctl_ops)) + panic("UDP: failed to init sysctl parameters.\n"); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 52e3ea0e6f50..ad30f5e31969 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1509,34 +1509,34 @@ void udp6_proc_exit(struct net *net) /* ------------------------------------------------------------------------ */ struct proto udpv6_prot = { - .name = "UDPv6", - .owner = THIS_MODULE, - .close = udp_lib_close, - .connect = ip6_datagram_connect, - .disconnect = udp_disconnect, - .ioctl = udp_ioctl, - .init = udp_init_sock, - .destroy = udpv6_destroy_sock, - .setsockopt = udpv6_setsockopt, - .getsockopt = udpv6_getsockopt, - .sendmsg = udpv6_sendmsg, - .recvmsg = udpv6_recvmsg, - .release_cb = ip6_datagram_release_cb, - .hash = udp_lib_hash, - .unhash = udp_lib_unhash, - .rehash = udp_v6_rehash, - .get_port = udp_v6_get_port, - .memory_allocated = &udp_memory_allocated, - .sysctl_mem = sysctl_udp_mem, - .sysctl_wmem = &sysctl_udp_wmem_min, - .sysctl_rmem = &sysctl_udp_rmem_min, - .obj_size = sizeof(struct udp6_sock), - .h.udp_table = &udp_table, + .name = "UDPv6", + .owner = THIS_MODULE, + .close = udp_lib_close, + .connect = ip6_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .init = udp_init_sock, + .destroy = udpv6_destroy_sock, + .setsockopt = udpv6_setsockopt, + .getsockopt = udpv6_getsockopt, + .sendmsg = udpv6_sendmsg, + .recvmsg = udpv6_recvmsg, + .release_cb = ip6_datagram_release_cb, + .hash = udp_lib_hash, + .unhash = udp_lib_unhash, + .rehash = udp_v6_rehash, + .get_port = udp_v6_get_port, + .memory_allocated = &udp_memory_allocated, + .sysctl_mem = sysctl_udp_mem, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), + .obj_size = sizeof(struct udp6_sock), + .h.udp_table = &udp_table, #ifdef CONFIG_COMPAT - .compat_setsockopt = compat_udpv6_setsockopt, - .compat_getsockopt = compat_udpv6_getsockopt, + .compat_setsockopt = compat_udpv6_setsockopt, + .compat_getsockopt = compat_udpv6_getsockopt, #endif - .diag_destroy = udp_abort, + .diag_destroy = udp_abort, }; static struct inet_protosw udpv6_protosw = { -- cgit v1.2.3-70-g09d2 From 79ffdfc6522ae33d8a33e971070c08ee5f27439b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 14 Mar 2018 22:17:20 +0300 Subject: net: Add rtnl_lock_killable() rtnl_lock() is widely used mutex in kernel. Some of kernel code does memory allocations under it. In case of memory deficit this may invoke OOM killer, but the problem is a killed task can't exit if it's waiting for the mutex. This may be a reason of deadlock and panic. This patch adds a new primitive, which responds on SIGKILL, and it allows to use it in the places, where we don't want to sleep forever. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 + net/core/rtnetlink.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 3573b4bf2fdf..562a175c35a9 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -33,6 +33,7 @@ extern void rtnl_lock(void); extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); +extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore net_sem; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 67f375cfb982..87079eaa871b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -75,6 +75,12 @@ void rtnl_lock(void) } EXPORT_SYMBOL(rtnl_lock); +int rtnl_lock_killable(void) +{ + return mutex_lock_killable(&rtnl_mutex); +} +EXPORT_SYMBOL(rtnl_lock_killable); + static struct sk_buff *defer_kfree_skb_list; void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) { -- cgit v1.2.3-70-g09d2 From b0f3debc9a1284d6b861e3f7cce0d119e6cd601d Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 14 Mar 2018 22:17:28 +0300 Subject: net: Use rtnl_lock_killable() in register_netdev() This patch adds rtnl_lock_killable() to one of hot path using rtnl_lock(). Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 12a9aad0b057..d8887cc38e7b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8018,7 +8018,8 @@ int register_netdev(struct net_device *dev) { int err; - rtnl_lock(); + if (rtnl_lock_killable()) + return -EINTR; err = register_netdevice(dev); rtnl_unlock(); return err; -- cgit v1.2.3-70-g09d2 From c246d942eabc3288f5d93930663411070093ac52 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 16 Mar 2018 15:06:39 +0100 Subject: net/smc: restructure netinfo for CLC proposal msgs Introduce functions smc_clc_prfx_set to retrieve IP information for the CLC proposal msg and smc_clc_prfx_match to match the contents of a proposal message against the IP addresses of the net device. The new functions replace the functionality provided by smc_clc_netinfo_by_tcpsk, which is removed by this patch. The match functionality is extended to scan all ipv4 addresses of the net device for a match against the ipv4 subnet from the proposal msg. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 14 ++------ net/smc/smc_clc.c | 100 +++++++++++++++++++++++++++++++++++++++++------------- net/smc/smc_clc.h | 4 +-- 3 files changed, 82 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 649489f825a5..949a2714a453 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -767,8 +767,6 @@ static void smc_listen_work(struct work_struct *work) struct smc_link *link; int reason_code = 0; int rc = 0; - __be32 subnet; - u8 prefix_len; u8 ibport; /* check if peer is smc capable */ @@ -803,17 +801,11 @@ static void smc_listen_work(struct work_struct *work) goto decline_rdma; } - /* determine subnet and mask from internal TCP socket */ - rc = smc_clc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len); - if (rc) { - reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - goto decline_rdma; - } - pclc = (struct smc_clc_msg_proposal *)&buf; pclc_prfx = smc_clc_proposal_get_prefix(pclc); - if (pclc_prfx->outgoing_subnet != subnet || - pclc_prfx->prefix_len != prefix_len) { + + rc = smc_clc_prfx_match(newclcsock, pclc_prfx); + if (rc) { reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ goto decline_rdma; } diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 874c5a75d6dd..dc3a2235978d 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -74,15 +74,35 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm) return true; } -/* determine subnet and mask of internal TCP socket */ -int smc_clc_netinfo_by_tcpsk(struct socket *clcsock, - __be32 *subnet, u8 *prefix_len) +/* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */ +static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, + struct smc_clc_msg_proposal_prefix *prop) +{ + struct in_device *in_dev = __in_dev_get_rcu(dst->dev); + + if (!in_dev) + return -ENODEV; + for_ifa(in_dev) { + if (!inet_ifa_match(ipv4, ifa)) + continue; + prop->prefix_len = inet_mask_len(ifa->ifa_mask); + prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask; + /* prop->ipv6_prefixes_cnt = 0; already done by memset before */ + return 0; + } endfor_ifa(in_dev); + return -ENOENT; +} + +/* retrieve and set prefixes in CLC proposal msg */ +static int smc_clc_prfx_set(struct socket *clcsock, + struct smc_clc_msg_proposal_prefix *prop) { struct dst_entry *dst = sk_dst_get(clcsock->sk); - struct in_device *in_dev; - struct sockaddr_in addr; + struct sockaddr_storage addrs; + struct sockaddr_in *addr; int rc = -ENOENT; + memset(prop, 0, sizeof(*prop)); if (!dst) { rc = -ENOTCONN; goto out; @@ -91,22 +111,58 @@ int smc_clc_netinfo_by_tcpsk(struct socket *clcsock, rc = -ENODEV; goto out_rel; } - /* get address to which the internal TCP socket is bound */ - kernel_getsockname(clcsock, (struct sockaddr *)&addr); - /* analyze IPv4 specific data of net_device belonging to TCP socket */ + kernel_getsockname(clcsock, (struct sockaddr *)&addrs); + /* analyze IP specific data of net_device belonging to TCP socket */ rcu_read_lock(); - in_dev = __in_dev_get_rcu(dst->dev); + if (addrs.ss_family == PF_INET) { + /* IPv4 */ + addr = (struct sockaddr_in *)&addrs; + rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop); + } + rcu_read_unlock(); +out_rel: + dst_release(dst); +out: + return rc; +} + +/* match ipv4 addrs of dev against addr in CLC proposal */ +static int smc_clc_prfx_match4_rcu(struct net_device *dev, + struct smc_clc_msg_proposal_prefix *prop) +{ + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (!in_dev) + return -ENODEV; for_ifa(in_dev) { - if (!inet_ifa_match(addr.sin_addr.s_addr, ifa)) - continue; - *prefix_len = inet_mask_len(ifa->ifa_mask); - *subnet = ifa->ifa_address & ifa->ifa_mask; - rc = 0; - break; + if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) && + inet_ifa_match(prop->outgoing_subnet, ifa)) + return 0; } endfor_ifa(in_dev); - rcu_read_unlock(); + return -ENOENT; +} + +/* check if proposed prefixes match one of our device prefixes */ +int smc_clc_prfx_match(struct socket *clcsock, + struct smc_clc_msg_proposal_prefix *prop) +{ + struct dst_entry *dst = sk_dst_get(clcsock->sk); + int rc = -ENOENT; + + if (!dst) { + rc = -ENOTCONN; + goto out; + } + if (!dst->dev) { + rc = -ENODEV; + goto out_rel; + } + rcu_read_lock(); + if (!prop->ipv6_prefixes_cnt) + rc = smc_clc_prfx_match4_rcu(dst->dev, prop); + rcu_read_unlock(); out_rel: dst_release(dst); out: @@ -240,6 +296,11 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct msghdr msg; int len, plen, rc; + /* retrieve ip prefixes for CLC proposal msg */ + rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx); + if (rc) + return SMC_CLC_DECL_CNFERR; /* configuration error */ + /* send SMC Proposal CLC message */ plen = sizeof(pclc) + sizeof(pclc_prfx) + sizeof(trl); memset(&pclc, 0, sizeof(pclc)); @@ -252,13 +313,6 @@ int smc_clc_send_proposal(struct smc_sock *smc, memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN); pclc.iparea_offset = htons(0); - memset(&pclc_prfx, 0, sizeof(pclc_prfx)); - /* determine subnet and mask from internal TCP socket */ - rc = smc_clc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet, - &pclc_prfx.prefix_len); - if (rc) - return SMC_CLC_DECL_CNFERR; /* configuration error */ - pclc_prfx.ipv6_prefixes_cnt = 0; memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); memset(&msg, 0, sizeof(msg)); vec[0].iov_base = &pclc; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 20e048beac30..5ddb0d22eb8a 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -122,8 +122,8 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); } -int smc_clc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet, - u8 *prefix_len); +int smc_clc_prfx_match(struct socket *clcsock, + struct smc_clc_msg_proposal_prefix *prop); int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type); int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info); -- cgit v1.2.3-70-g09d2 From 1a26d0201d7670fd6c9086e15504911ce240e6ff Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 16 Mar 2018 15:06:40 +0100 Subject: net/smc: add ipv6 support to CLC layer The CLC layer is updated to support ipv6 proposal messages from peers and to match incoming proposal messages against the ipv6 addresses of the net device. struct smc_clc_ipv6_prefix is updated to provide the space for an ipv6 address (struct was not used before). SMC_CLC_MAX_LEN is updated to include the size of the proposal prefix. Existing code in net is not affected, the previous SMC_CLC_MAX_LEN value is large enough to hold ipv4 proposal messages. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_clc.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++------- net/smc/smc_clc.h | 13 +++++-- 2 files changed, 105 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index dc3a2235978d..64fbc3230e6c 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -5,7 +5,7 @@ * CLC (connection layer control) handshake over initial TCP socket to * prepare for RDMA traffic * - * Copyright IBM Corp. 2016 + * Copyright IBM Corp. 2016, 2018 * * Author(s): Ursula Braun */ @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -93,12 +94,44 @@ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, return -ENOENT; } +/* fill CLC proposal msg with ipv6 prefixes from device */ +static int smc_clc_prfx_set6_rcu(struct dst_entry *dst, + struct smc_clc_msg_proposal_prefix *prop, + struct smc_clc_ipv6_prefix *ipv6_prfx) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev = __in6_dev_get(dst->dev); + struct inet6_ifaddr *ifa; + int cnt = 0; + + if (!in6_dev) + return -ENODEV; + /* use a maximum of 8 IPv6 prefixes from device */ + list_for_each_entry(ifa, &in6_dev->addr_list, if_list) { + if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL) + continue; + ipv6_addr_prefix(&ipv6_prfx[cnt].prefix, + &ifa->addr, ifa->prefix_len); + ipv6_prfx[cnt].prefix_len = ifa->prefix_len; + cnt++; + if (cnt == SMC_CLC_MAX_V6_PREFIX) + break; + } + prop->ipv6_prefixes_cnt = cnt; + if (cnt) + return 0; +#endif + return -ENOENT; +} + /* retrieve and set prefixes in CLC proposal msg */ static int smc_clc_prfx_set(struct socket *clcsock, - struct smc_clc_msg_proposal_prefix *prop) + struct smc_clc_msg_proposal_prefix *prop, + struct smc_clc_ipv6_prefix *ipv6_prfx) { struct dst_entry *dst = sk_dst_get(clcsock->sk); struct sockaddr_storage addrs; + struct sockaddr_in6 *addr6; struct sockaddr_in *addr; int rc = -ENOENT; @@ -114,11 +147,19 @@ static int smc_clc_prfx_set(struct socket *clcsock, /* get address to which the internal TCP socket is bound */ kernel_getsockname(clcsock, (struct sockaddr *)&addrs); /* analyze IP specific data of net_device belonging to TCP socket */ + addr6 = (struct sockaddr_in6 *)&addrs; rcu_read_lock(); if (addrs.ss_family == PF_INET) { /* IPv4 */ addr = (struct sockaddr_in *)&addrs; rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop); + } else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) { + /* mapped IPv4 address - peer is IPv4 only */ + rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3], + prop); + } else { + /* IPv6 */ + rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx); } rcu_read_unlock(); out_rel: @@ -144,12 +185,41 @@ static int smc_clc_prfx_match4_rcu(struct net_device *dev, return -ENOENT; } +/* match ipv6 addrs of dev against addrs in CLC proposal */ +static int smc_clc_prfx_match6_rcu(struct net_device *dev, + struct smc_clc_msg_proposal_prefix *prop) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev = __in6_dev_get(dev); + struct smc_clc_ipv6_prefix *ipv6_prfx; + struct inet6_ifaddr *ifa; + int i, max; + + if (!in6_dev) + return -ENODEV; + /* ipv6 prefix list starts behind smc_clc_msg_proposal_prefix */ + ipv6_prfx = (struct smc_clc_ipv6_prefix *)((u8 *)prop + sizeof(*prop)); + max = min_t(u8, prop->ipv6_prefixes_cnt, SMC_CLC_MAX_V6_PREFIX); + list_for_each_entry(ifa, &in6_dev->addr_list, if_list) { + if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL) + continue; + for (i = 0; i < max; i++) { + if (ifa->prefix_len == ipv6_prfx[i].prefix_len && + ipv6_prefix_equal(&ifa->addr, &ipv6_prfx[i].prefix, + ifa->prefix_len)) + return 0; + } + } +#endif + return -ENOENT; +} + /* check if proposed prefixes match one of our device prefixes */ int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop) { struct dst_entry *dst = sk_dst_get(clcsock->sk); - int rc = -ENOENT; + int rc; if (!dst) { rc = -ENOTCONN; @@ -162,6 +232,8 @@ int smc_clc_prfx_match(struct socket *clcsock, rcu_read_lock(); if (!prop->ipv6_prefixes_cnt) rc = smc_clc_prfx_match4_rcu(dst->dev, prop); + else + rc = smc_clc_prfx_match6_rcu(dst->dev, prop); rcu_read_unlock(); out_rel: dst_release(dst); @@ -288,21 +360,24 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport) { + struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX]; struct smc_clc_msg_proposal_prefix pclc_prfx; struct smc_clc_msg_proposal pclc; struct smc_clc_msg_trail trl; + int len, i, plen, rc; int reason_code = 0; - struct kvec vec[3]; + struct kvec vec[4]; struct msghdr msg; - int len, plen, rc; /* retrieve ip prefixes for CLC proposal msg */ - rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx); + rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx, ipv6_prfx); if (rc) return SMC_CLC_DECL_CNFERR; /* configuration error */ /* send SMC Proposal CLC message */ - plen = sizeof(pclc) + sizeof(pclc_prfx) + sizeof(trl); + plen = sizeof(pclc) + sizeof(pclc_prfx) + + (pclc_prfx.ipv6_prefixes_cnt * sizeof(ipv6_prfx[0])) + + sizeof(trl); memset(&pclc, 0, sizeof(pclc)); memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); pclc.hdr.type = SMC_CLC_PROPOSAL; @@ -315,14 +390,20 @@ int smc_clc_send_proposal(struct smc_sock *smc, memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); memset(&msg, 0, sizeof(msg)); - vec[0].iov_base = &pclc; - vec[0].iov_len = sizeof(pclc); - vec[1].iov_base = &pclc_prfx; - vec[1].iov_len = sizeof(pclc_prfx); - vec[2].iov_base = &trl; - vec[2].iov_len = sizeof(trl); + i = 0; + vec[i].iov_base = &pclc; + vec[i++].iov_len = sizeof(pclc); + vec[i].iov_base = &pclc_prfx; + vec[i++].iov_len = sizeof(pclc_prfx); + if (pclc_prfx.ipv6_prefixes_cnt > 0) { + vec[i].iov_base = &ipv6_prfx[0]; + vec[i++].iov_len = pclc_prfx.ipv6_prefixes_cnt * + sizeof(ipv6_prfx[0]); + } + vec[i].iov_base = &trl; + vec[i++].iov_len = sizeof(trl); /* due to the few bytes needed for clc-handshake this cannot block */ - len = kernel_sendmsg(smc->clcsock, &msg, vec, 3, plen); + len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen); if (len < sizeof(pclc)) { if (len >= 0) { reason_code = -ENETUNREACH; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 5ddb0d22eb8a..63bf1dc2c1f9 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -60,10 +60,15 @@ struct smc_clc_msg_local { /* header2 of clc messages */ u8 mac[6]; /* mac of ib_device port */ }; +#define SMC_CLC_MAX_V6_PREFIX 8 + +/* Struct would be 4 byte aligned, but it is used in an array that is sent + * to peers and must conform to RFC7609, hence we need to use packed here. + */ struct smc_clc_ipv6_prefix { - u8 prefix[4]; + struct in6_addr prefix; u8 prefix_len; -} __packed; +} __packed; /* format defined in RFC7609 */ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ __be32 outgoing_subnet; /* subnet mask */ @@ -79,9 +84,11 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ } __aligned(4); #define SMC_CLC_PROPOSAL_MAX_OFFSET 0x28 -#define SMC_CLC_PROPOSAL_MAX_PREFIX (8 * sizeof(struct smc_clc_ipv6_prefix)) +#define SMC_CLC_PROPOSAL_MAX_PREFIX (SMC_CLC_MAX_V6_PREFIX * \ + sizeof(struct smc_clc_ipv6_prefix)) #define SMC_CLC_MAX_LEN (sizeof(struct smc_clc_msg_proposal) + \ SMC_CLC_PROPOSAL_MAX_OFFSET + \ + sizeof(struct smc_clc_msg_proposal_prefix) + \ SMC_CLC_PROPOSAL_MAX_PREFIX + \ sizeof(struct smc_clc_msg_trail)) -- cgit v1.2.3-70-g09d2 From aaa4d33f6da1e1a415f4f7c299a97defd845ce7d Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Fri, 16 Mar 2018 15:06:41 +0100 Subject: net/smc: enable ipv6 support for smc Add ipv6 support to the smc socket layer functions. Make use of the updated clc layer functions to retrieve and match ipv6 information. The indicator for ipv4 or ipv6 is the protocol constant that is provided in the socket() call with address family AF_SMC. Based-on-patch-by: Takanori Ueda Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 64 ++++++++++++++++++++++++++++++++++++++++++-------------- net/smc/smc.h | 4 +++- 2 files changed, 51 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 949a2714a453..86913eb5cfa0 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -7,12 +7,11 @@ * applicable with RoCE-cards only * * Initial restrictions: - * - IPv6 support postponed * - support for alternate links postponed * - partial support for non-blocking sockets only * - support for urgent data postponed * - * Copyright IBM Corp. 2016 + * Copyright IBM Corp. 2016, 2018 * * Author(s): Ursula Braun * based on prototype from Frank Blaschka @@ -64,6 +63,10 @@ static struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; +static struct smc_hashinfo smc_v6_hashinfo = { + .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), +}; + int smc_hash_sk(struct sock *sk) { struct smc_hashinfo *h = sk->sk_prot->h.smc_hash; @@ -103,6 +106,18 @@ struct proto smc_proto = { }; EXPORT_SYMBOL_GPL(smc_proto); +struct proto smc_proto6 = { + .name = "SMC6", + .owner = THIS_MODULE, + .keepalive = smc_set_keepalive, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, + .obj_size = sizeof(struct smc_sock), + .h.smc_hash = &smc_v6_hashinfo, + .slab_flags = SLAB_TYPESAFE_BY_RCU, +}; +EXPORT_SYMBOL_GPL(smc_proto6); + static int smc_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -159,19 +174,22 @@ static void smc_destruct(struct sock *sk) sk_refcnt_debug_dec(sk); } -static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) +static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, + int protocol) { struct smc_sock *smc; + struct proto *prot; struct sock *sk; - sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0); + prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto; + sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); if (!sk) return NULL; sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; - sk->sk_protocol = SMCPROTO_SMC; + sk->sk_protocol = protocol; smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_LIST_HEAD(&smc->accept_q); @@ -198,10 +216,13 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr, goto out; rc = -EAFNOSUPPORT; + if (addr->sin_family != AF_INET && + addr->sin_family != AF_INET6 && + addr->sin_family != AF_UNSPEC) + goto out; /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */ - if ((addr->sin_family != AF_INET) && - ((addr->sin_family != AF_UNSPEC) || - (addr->sin_addr.s_addr != htonl(INADDR_ANY)))) + if (addr->sin_family == AF_UNSPEC && + addr->sin_addr.s_addr != htonl(INADDR_ANY)) goto out; lock_sock(sk); @@ -529,7 +550,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, /* separate smc parameter checking to be safe */ if (alen < sizeof(addr->sa_family)) goto out_err; - if (addr->sa_family != AF_INET) + if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6) goto out_err; lock_sock(sk); @@ -571,7 +592,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) int rc; release_sock(lsk); - new_sk = smc_sock_alloc(sock_net(lsk), NULL); + new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol); if (!new_sk) { rc = -ENOMEM; lsk->sk_err = ENOMEM; @@ -1367,6 +1388,7 @@ static const struct proto_ops smc_sock_ops = { static int smc_create(struct net *net, struct socket *sock, int protocol, int kern) { + int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET; struct smc_sock *smc; struct sock *sk; int rc; @@ -1376,20 +1398,20 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, goto out; rc = -EPROTONOSUPPORT; - if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP)) + if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6) goto out; rc = -ENOBUFS; sock->ops = &smc_sock_ops; - sk = smc_sock_alloc(net, sock); + sk = smc_sock_alloc(net, sock, protocol); if (!sk) goto out; /* create internal TCP socket for CLC handshake and fallback */ smc = smc_sk(sk); smc->use_fallback = false; /* assume rdma capability first */ - rc = sock_create_kern(net, PF_INET, SOCK_STREAM, - IPPROTO_TCP, &smc->clcsock); + rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, + &smc->clcsock); if (rc) { sk_common_release(sk); goto out; @@ -1429,16 +1451,23 @@ static int __init smc_init(void) rc = proto_register(&smc_proto, 1); if (rc) { - pr_err("%s: proto_register fails with %d\n", __func__, rc); + pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc); goto out_pnet; } + rc = proto_register(&smc_proto6, 1); + if (rc) { + pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc); + goto out_proto; + } + rc = sock_register(&smc_sock_family_ops); if (rc) { pr_err("%s: sock_register fails with %d\n", __func__, rc); - goto out_proto; + goto out_proto6; } INIT_HLIST_HEAD(&smc_v4_hashinfo.ht); + INIT_HLIST_HEAD(&smc_v6_hashinfo.ht); rc = smc_ib_register_client(); if (rc) { @@ -1451,6 +1480,8 @@ static int __init smc_init(void) out_sock: sock_unregister(PF_SMC); +out_proto6: + proto_unregister(&smc_proto6); out_proto: proto_unregister(&smc_proto); out_pnet: @@ -1475,6 +1506,7 @@ static void __exit smc_exit(void) static_branch_disable(&tcp_have_smc); smc_ib_unregister_client(); sock_unregister(PF_SMC); + proto_unregister(&smc_proto6); proto_unregister(&smc_proto); smc_pnet_exit(); } diff --git a/net/smc/smc.h b/net/smc/smc.h index 268cdf11533c..e4829a2f46ba 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -18,11 +18,13 @@ #include "smc_ib.h" -#define SMCPROTO_SMC 0 /* SMC protocol */ +#define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ +#define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ #define SMC_MAX_PORTS 2 /* Max # of ports */ extern struct proto smc_proto; +extern struct proto smc_proto6; #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 -- cgit v1.2.3-70-g09d2 From 7156d194a0772f733865267e7207e0b08f81b02b Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Fri, 16 Mar 2018 10:51:07 -0700 Subject: tcp: add snd_ssthresh stat in SCM_TIMESTAMPING_OPT_STATS This patch adds TCP_NLA_SND_SSTHRESH stat into SCM_TIMESTAMPING_OPT_STATS that reports tcp_sock.snd_ssthresh. Signed-off-by: Yousuk Seung Signed-off-by: Neal Cardwell Signed-off-by: Priyaranjan Jha Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Yuchung Cheng Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 4c0ae0faf7ca..560374c978f9 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -243,6 +243,7 @@ enum { TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */ TCP_NLA_SNDQ_SIZE, /* Data (bytes) pending in send queue */ TCP_NLA_CA_STATE, /* ca_state of socket */ + TCP_NLA_SND_SSTHRESH, /* Slow start size threshold */ }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fb350f740f69..e553f84bde83 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3031,7 +3031,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) u32 rate; stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + - 4 * nla_total_size(sizeof(u32)) + + 5 * nla_total_size(sizeof(u32)) + 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -3061,6 +3061,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); + nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); -- cgit v1.2.3-70-g09d2 From 53794570049d314742f156c99022914840a3d5ab Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Fri, 16 Mar 2018 10:51:49 -0700 Subject: net-tcp_bbr: set tp->snd_ssthresh to BDP upon STARTUP exit Set tp->snd_ssthresh to BDP upon STARTUP exit. This allows us to check if a BBR flow exited STARTUP and the BDP at the time of STARTUP exit with SCM_TIMESTAMPING_OPT_STATS. Since BBR does not use snd_ssthresh this fix has no impact on BBR's behavior. Signed-off-by: Yousuk Seung Signed-off-by: Neal Cardwell Signed-off-by: Priyaranjan Jha Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index c92014cb1e16..158d105e76da 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -731,6 +731,8 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) bbr->mode = BBR_DRAIN; /* drain queue we created */ bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ + tcp_sk(sk)->snd_ssthresh = + bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT); } /* fall through to check if in-flight is already small: */ if (bbr->mode == BBR_DRAIN && tcp_packets_in_flight(tcp_sk(sk)) <= @@ -834,6 +836,7 @@ static void bbr_init(struct sock *sk) struct bbr *bbr = inet_csk_ca(sk); bbr->prior_cwnd = 0; + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; bbr->rtt_cnt = 0; bbr->next_rtt_delivered = 0; bbr->prev_ca_state = TCP_CA_Open; @@ -886,7 +889,7 @@ static u32 bbr_undo_cwnd(struct sock *sk) static u32 bbr_ssthresh(struct sock *sk) { bbr_save_cwnd(sk); - return TCP_INFINITE_SSTHRESH; /* BBR does not use ssthresh */ + return tcp_sk(sk)->snd_ssthresh; } static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, -- cgit v1.2.3-70-g09d2 From 489b30b53f0540b9f8e391cbb2839cea48b5d1c1 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 15 Mar 2018 12:10:57 +0300 Subject: net: Convert l2tp_net_ops Init method is rather simple. Exit method queues del_work for every tunnel from per-net list. This seems to be safe to be marked async. Signed-off-by: Kirill Tkhai Acked-by: Guillaume Nault Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 83421c6f0bef..189a12a5e4ac 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1787,6 +1787,7 @@ static struct pernet_operations l2tp_net_ops = { .exit = l2tp_exit_net, .id = &l2tp_net_id, .size = sizeof(struct l2tp_net), + .async = true, }; static int __init l2tp_init(void) -- cgit v1.2.3-70-g09d2 From 8cec2f49dc413d6328067d22862b0bdd0f4305ec Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 15 Mar 2018 12:11:06 +0300 Subject: net: Convert mpls_net_ops These pernet_operations register and unregister sysctl table. Exit methods frees platform_labels from net::mpls::platform_label. Everything is per-net, and they looks safe to be marked async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 7a4de6d618b1..d4a89a8be013 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2488,6 +2488,7 @@ static void mpls_net_exit(struct net *net) static struct pernet_operations mpls_net_ops = { .init = mpls_net_init, .exit = mpls_net_exit, + .async = true, }; static struct rtnl_af_ops mpls_af_ops __read_mostly = { -- cgit v1.2.3-70-g09d2 From ec716650a750334cd763024597159eea3569e207 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 15 Mar 2018 12:11:16 +0300 Subject: net: Convert ovs_net_ops These pernet_operations initialize and destroy net_generic() data pointed by ovs_net_id. Exit method destroys vports from alive net to exiting net. Since they are only pernet_operations interested in this data, and exit method is executed under exclusive global lock (ovs_mutex), they are safe to be executed in parallel. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ef38e5aecd28..100191df0371 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2384,6 +2384,7 @@ static struct pernet_operations ovs_net_ops = { .exit = ovs_exit_net, .id = &ovs_net_id, .size = sizeof(struct ovs_net), + .async = true, }; static int __init dp_init(void) -- cgit v1.2.3-70-g09d2 From 554855ccdf37262def7fed557a5efec39f2c0f47 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 15 Mar 2018 12:11:25 +0300 Subject: net: Convert ipvs_core_ops These pernet_operations register and unregister nf hooks, /proc entries, sysctl, percpu statistics. There are several global lists, and the only list modified without exclusive locks is ip_vs_conn_tab in ip_vs_conn_flush(). We iterate the list and force the timers expire at the moment. Since there were possible several timer expirations before this patch, and since they are safe, the patch does not invent new parallelism of their destruction. These pernet_operations look safe to be converted. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5f6f73cf2174..c5d16e2bc8e2 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2289,6 +2289,7 @@ static struct pernet_operations ipvs_core_ops = { .exit = __ip_vs_cleanup, .id = &ip_vs_net_id, .size = sizeof(struct netns_ipvs), + .async = true, }; static struct pernet_operations ipvs_core_dev_ops = { -- cgit v1.2.3-70-g09d2 From d0edfbb4ba4a88399c169cf2c26b252d39f503bc Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 15 Mar 2018 12:11:35 +0300 Subject: net: Convert ipvs_core_dev_ops Exit method stops two per-net threads and cancels delayed work. Everything looks nicely per-net divided. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index c5d16e2bc8e2..6a6cb9db030b 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2294,6 +2294,7 @@ static struct pernet_operations ipvs_core_ops = { static struct pernet_operations ipvs_core_dev_ops = { .exit = __ip_vs_dev_cleanup, + .async = true, }; /* -- cgit v1.2.3-70-g09d2 From 6c77e79557acd9b3b896a8075a19ef11ed887a99 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 15 Mar 2018 12:11:44 +0300 Subject: net: Convert ip_vs_ftp_ops These pernet_operations register and unregister ipvs app. register_ip_vs_app(), unregister_ip_vs_app() and register_ip_vs_app_inc() modify per-net structures, and there are no global structures touched. So, this looks safe to be marked as async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_ftp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 58d5d05aec24..8b25aab41928 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -479,6 +479,7 @@ static void __ip_vs_ftp_exit(struct net *net) static struct pernet_operations ip_vs_ftp_ops = { .init = __ip_vs_ftp_init, .exit = __ip_vs_ftp_exit, + .async = true, }; static int __init ip_vs_ftp_init(void) -- cgit v1.2.3-70-g09d2 From 928df1880e24bcd47d6359ff86df24db3dfba3c3 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Mar 2018 16:48:51 +0100 Subject: tipc: obsolete TIPC_ZONE_SCOPE Publications for TIPC_CLUSTER_SCOPE and TIPC_ZONE_SCOPE are in all aspects handled the same way, both on the publishing node and on the receiving nodes. Despite previous ambitions to the contrary, this is never going to change, so we take the conseqeunce of this and obsolete TIPC_ZONE_SCOPE and related macros/functions. Whenever a user is doing a bind() or a sendmsg() attempt using ZONE_SCOPE we translate this internally to CLUSTER_SCOPE, while we remain compatible with users and remote nodes still using ZONE_SCOPE. Furthermore, the non-formalized scope value 0 has always been permitted for use during lookup, with the same meaning as ZONE_SCOPE/CLUSTER_SCOPE. We now permit it even as binding scope, but for compatibility reasons we choose to not change the value of TIPC_CLUSTER_SCOPE. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 102 ++++++++++++++++++++++++---------------------- net/tipc/addr.c | 31 -------------- net/tipc/addr.h | 10 +++++ net/tipc/msg.c | 2 +- net/tipc/name_table.c | 3 +- net/tipc/net.c | 2 +- net/tipc/socket.c | 15 ++++--- 7 files changed, 77 insertions(+), 88 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 14bacc7e6cef..4ac9f1f02b06 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -61,50 +61,6 @@ struct tipc_name_seq { __u32 upper; }; -/* TIPC Address Size, Offset, Mask specification for Z.C.N - */ -#define TIPC_NODE_BITS 12 -#define TIPC_CLUSTER_BITS 12 -#define TIPC_ZONE_BITS 8 - -#define TIPC_NODE_OFFSET 0 -#define TIPC_CLUSTER_OFFSET TIPC_NODE_BITS -#define TIPC_ZONE_OFFSET (TIPC_CLUSTER_OFFSET + TIPC_CLUSTER_BITS) - -#define TIPC_NODE_SIZE ((1UL << TIPC_NODE_BITS) - 1) -#define TIPC_CLUSTER_SIZE ((1UL << TIPC_CLUSTER_BITS) - 1) -#define TIPC_ZONE_SIZE ((1UL << TIPC_ZONE_BITS) - 1) - -#define TIPC_NODE_MASK (TIPC_NODE_SIZE << TIPC_NODE_OFFSET) -#define TIPC_CLUSTER_MASK (TIPC_CLUSTER_SIZE << TIPC_CLUSTER_OFFSET) -#define TIPC_ZONE_MASK (TIPC_ZONE_SIZE << TIPC_ZONE_OFFSET) - -#define TIPC_ZONE_CLUSTER_MASK (TIPC_ZONE_MASK | TIPC_CLUSTER_MASK) - -static inline __u32 tipc_addr(unsigned int zone, - unsigned int cluster, - unsigned int node) -{ - return (zone << TIPC_ZONE_OFFSET) | - (cluster << TIPC_CLUSTER_OFFSET) | - node; -} - -static inline unsigned int tipc_zone(__u32 addr) -{ - return addr >> TIPC_ZONE_OFFSET; -} - -static inline unsigned int tipc_cluster(__u32 addr) -{ - return (addr & TIPC_CLUSTER_MASK) >> TIPC_CLUSTER_OFFSET; -} - -static inline unsigned int tipc_node(__u32 addr) -{ - return addr & TIPC_NODE_MASK; -} - /* * Application-accessible port name types */ @@ -117,9 +73,10 @@ static inline unsigned int tipc_node(__u32 addr) /* * Publication scopes when binding port names and port name sequences */ -#define TIPC_ZONE_SCOPE 1 -#define TIPC_CLUSTER_SCOPE 2 -#define TIPC_NODE_SCOPE 3 +enum tipc_scope { + TIPC_CLUSTER_SCOPE = 2, /* 0 can also be used */ + TIPC_NODE_SCOPE = 3 +}; /* * Limiting values for messages @@ -243,7 +200,7 @@ struct sockaddr_tipc { struct tipc_group_req { __u32 type; /* group id */ __u32 instance; /* member id */ - __u32 scope; /* zone/cluster/node */ + __u32 scope; /* cluster/node */ __u32 flags; }; @@ -268,4 +225,53 @@ struct tipc_sioc_ln_req { __u32 bearer_id; char linkname[TIPC_MAX_LINK_NAME]; }; + + +/* The macros and functions below are deprecated: + */ + +#define TIPC_ZONE_SCOPE 1 + +#define TIPC_NODE_BITS 12 +#define TIPC_CLUSTER_BITS 12 +#define TIPC_ZONE_BITS 8 + +#define TIPC_NODE_OFFSET 0 +#define TIPC_CLUSTER_OFFSET TIPC_NODE_BITS +#define TIPC_ZONE_OFFSET (TIPC_CLUSTER_OFFSET + TIPC_CLUSTER_BITS) + +#define TIPC_NODE_SIZE ((1UL << TIPC_NODE_BITS) - 1) +#define TIPC_CLUSTER_SIZE ((1UL << TIPC_CLUSTER_BITS) - 1) +#define TIPC_ZONE_SIZE ((1UL << TIPC_ZONE_BITS) - 1) + +#define TIPC_NODE_MASK (TIPC_NODE_SIZE << TIPC_NODE_OFFSET) +#define TIPC_CLUSTER_MASK (TIPC_CLUSTER_SIZE << TIPC_CLUSTER_OFFSET) +#define TIPC_ZONE_MASK (TIPC_ZONE_SIZE << TIPC_ZONE_OFFSET) + +#define TIPC_ZONE_CLUSTER_MASK (TIPC_ZONE_MASK | TIPC_CLUSTER_MASK) + +static inline __u32 tipc_addr(unsigned int zone, + unsigned int cluster, + unsigned int node) +{ + return (zone << TIPC_ZONE_OFFSET) | + (cluster << TIPC_CLUSTER_OFFSET) | + node; +} + +static inline unsigned int tipc_zone(__u32 addr) +{ + return addr >> TIPC_ZONE_OFFSET; +} + +static inline unsigned int tipc_cluster(__u32 addr) +{ + return (addr & TIPC_CLUSTER_MASK) >> TIPC_CLUSTER_OFFSET; +} + +static inline unsigned int tipc_node(__u32 addr) +{ + return addr & TIPC_NODE_MASK; +} + #endif diff --git a/net/tipc/addr.c b/net/tipc/addr.c index 48fd3b5a73fb..97cd857d7f43 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -63,23 +63,6 @@ int in_own_node(struct net *net, u32 addr) return (addr == tn->own_addr) || !addr; } -/** - * addr_domain - convert 2-bit scope value to equivalent message lookup domain - * - * Needed when address of a named message must be looked up a second time - * after a network hop. - */ -u32 addr_domain(struct net *net, u32 sc) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - - if (likely(sc == TIPC_NODE_SCOPE)) - return tn->own_addr; - if (sc == TIPC_CLUSTER_SCOPE) - return tipc_cluster_mask(tn->own_addr); - return tipc_zone_mask(tn->own_addr); -} - /** * tipc_addr_domain_valid - validates a network domain address * @@ -124,20 +107,6 @@ int tipc_in_scope(u32 domain, u32 addr) return 0; } -/** - * tipc_addr_scope - convert message lookup domain to a 2-bit scope value - */ -int tipc_addr_scope(u32 domain) -{ - if (likely(!domain)) - return TIPC_ZONE_SCOPE; - if (tipc_node(domain)) - return TIPC_NODE_SCOPE; - if (tipc_cluster(domain)) - return TIPC_CLUSTER_SCOPE; - return TIPC_ZONE_SCOPE; -} - char *tipc_addr_string_fill(char *string, u32 addr) { snprintf(string, 16, "<%u.%u.%u>", diff --git a/net/tipc/addr.h b/net/tipc/addr.h index bebb347803ce..2ecf5a5d40dd 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -60,6 +60,16 @@ static inline u32 tipc_cluster_mask(u32 addr) return addr & TIPC_ZONE_CLUSTER_MASK; } +static inline int tipc_node2scope(u32 node) +{ + return node ? TIPC_NODE_SCOPE : TIPC_CLUSTER_SCOPE; +} + +static inline int tipc_scope2node(struct net *net, int sc) +{ + return sc != TIPC_NODE_SCOPE ? 0 : tipc_own_addr(net); +} + u32 tipc_own_addr(struct net *net); int in_own_cluster(struct net *net, u32 addr); int in_own_cluster_exact(struct net *net, u32 addr); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 4e1c6f6450bb..b6c45dccba3d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -580,7 +580,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) msg = buf_msg(skb); if (msg_reroute_cnt(msg)) return false; - dnode = addr_domain(net, msg_lookup_scope(msg)); + dnode = tipc_scope2node(net, msg_lookup_scope(msg)); dport = tipc_nametbl_translate(net, msg_nametype(msg), msg_nameinst(msg), &dnode); if (!dport) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index e01c9c691ba2..6772390fcb00 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -473,8 +473,7 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type, struct name_seq *seq = nametbl_find_seq(net, type); int index = hash(type); - if ((scope < TIPC_ZONE_SCOPE) || (scope > TIPC_NODE_SCOPE) || - (lower > upper)) { + if (scope > TIPC_NODE_SCOPE || lower > upper) { pr_debug("Failed to publish illegal {%u,%u,%u} with scope %u\n", type, lower, upper, scope); return NULL; diff --git a/net/tipc/net.c b/net/tipc/net.c index 1a2fde0d6f61..5c4c4405b78e 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -118,7 +118,7 @@ int tipc_net_start(struct net *net, u32 addr) tipc_sk_reinit(net); tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr, - TIPC_ZONE_SCOPE, 0, tn->own_addr); + TIPC_CLUSTER_SCOPE, 0, tn->own_addr); pr_info("Started in network mode\n"); pr_info("Own node address %s, network identity %u\n", diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 8b04e601311c..910d3827f499 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -644,7 +644,7 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, goto exit; } - res = (addr->scope > 0) ? + res = (addr->scope >= 0) ? tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq) : tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq); exit: @@ -1280,8 +1280,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq *seq; struct sk_buff_head pkts; - u32 type, inst, domain; u32 dnode, dport; + u32 type, inst; int mtu, rc; if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) @@ -1332,13 +1332,12 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (dest->addrtype == TIPC_ADDR_NAME) { type = dest->addr.name.name.type; inst = dest->addr.name.name.instance; - domain = dest->addr.name.domain; - dnode = domain; + dnode = dest->addr.name.domain; msg_set_type(hdr, TIPC_NAMED_MSG); msg_set_hdr_sz(hdr, NAMED_H_SIZE); msg_set_nametype(hdr, type); msg_set_nameinst(hdr, inst); - msg_set_lookup_scope(hdr, tipc_addr_scope(domain)); + msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); dport = tipc_nametbl_translate(net, type, inst, &dnode); msg_set_destnode(hdr, dnode); msg_set_destport(hdr, dport); @@ -2592,6 +2591,9 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct publication *publ; u32 key; + if (scope != TIPC_NODE_SCOPE) + scope = TIPC_CLUSTER_SCOPE; + if (tipc_sk_connected(sk)) return -EINVAL; key = tsk->portid + tsk->pub_count + 1; @@ -2617,6 +2619,9 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, struct publication *safe; int rc = -EINVAL; + if (scope != TIPC_NODE_SCOPE) + scope = TIPC_CLUSTER_SCOPE; + list_for_each_entry_safe(publ, safe, &tsk->publications, pport_list) { if (seq) { if (publ->scope != scope) -- cgit v1.2.3-70-g09d2 From 64a52b26d5633d6efc35cdf1e0c627cc4189e55a Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Mar 2018 16:48:52 +0100 Subject: tipc: remove zone publication list in name table As a consequence of the previous commit we nan now eliminate zone scope related lists in the name table. We start with name_table::publ_list[3], which can now be replaced with two lists, one for node scope publications and one for cluster scope publications. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/core.h | 5 +++++ net/tipc/name_distr.c | 39 ++++++++++++++++++--------------------- net/tipc/name_table.c | 5 ++--- net/tipc/name_table.h | 6 ++++-- 4 files changed, 29 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/tipc/core.h b/net/tipc/core.h index ff8b071654f5..347f850dc872 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -131,6 +131,11 @@ static inline struct list_head *tipc_nodes(struct net *net) return &tipc_net(net)->node_list; } +static inline struct name_table *tipc_name_table(struct net *net) +{ + return tipc_net(net)->nametbl; +} + static inline struct tipc_topsrv *tipc_topsrv(struct net *net) { return tipc_net(net)->topsrv; diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 23f8899e0f8c..11ce20505550 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -86,25 +86,25 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, */ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct sk_buff *buf; + struct name_table *nt = tipc_name_table(net); struct distr_item *item; + struct sk_buff *skb; - list_add_tail_rcu(&publ->local_list, - &tn->nametbl->publ_list[publ->scope]); - - if (publ->scope == TIPC_NODE_SCOPE) + if (publ->scope == TIPC_NODE_SCOPE) { + list_add_tail_rcu(&publ->local_list, &nt->node_scope); return NULL; + } + list_add_tail_rcu(&publ->local_list, &nt->cluster_scope); - buf = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0); - if (!buf) { + skb = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0); + if (!skb) { pr_warn("Publication distribution failure\n"); return NULL; } - item = (struct distr_item *)msg_data(buf_msg(buf)); + item = (struct distr_item *)msg_data(buf_msg(skb)); publ_to_item(item, publ); - return buf; + return skb; } /** @@ -184,16 +184,13 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, */ void tipc_named_node_up(struct net *net, u32 dnode) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct name_table *nt = tipc_name_table(net); struct sk_buff_head head; __skb_queue_head_init(&head); rcu_read_lock(); - named_distribute(net, &head, dnode, - &tn->nametbl->publ_list[TIPC_CLUSTER_SCOPE]); - named_distribute(net, &head, dnode, - &tn->nametbl->publ_list[TIPC_ZONE_SCOPE]); + named_distribute(net, &head, dnode, &nt->cluster_scope); rcu_read_unlock(); tipc_node_xmit(net, &head, dnode, 0); @@ -382,16 +379,16 @@ void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq) */ void tipc_named_reinit(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct name_table *nt = tipc_name_table(net); + struct tipc_net *tn = tipc_net(net); struct publication *publ; - int scope; spin_lock_bh(&tn->nametbl_lock); - for (scope = TIPC_ZONE_SCOPE; scope <= TIPC_NODE_SCOPE; scope++) - list_for_each_entry_rcu(publ, &tn->nametbl->publ_list[scope], - local_list) - publ->node = tn->own_addr; + list_for_each_entry_rcu(publ, &nt->node_scope, local_list) + publ->node = tn->own_addr; + list_for_each_entry_rcu(publ, &nt->cluster_scope, local_list) + publ->node = tn->own_addr; spin_unlock_bh(&tn->nametbl_lock); } diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 6772390fcb00..1a3a327018c5 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -878,9 +878,8 @@ int tipc_nametbl_init(struct net *net) for (i = 0; i < TIPC_NAMETBL_SIZE; i++) INIT_HLIST_HEAD(&tipc_nametbl->seq_hlist[i]); - INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_ZONE_SCOPE]); - INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_CLUSTER_SCOPE]); - INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_NODE_SCOPE]); + INIT_LIST_HEAD(&tipc_nametbl->node_scope); + INIT_LIST_HEAD(&tipc_nametbl->cluster_scope); tn->nametbl = tipc_nametbl; spin_lock_init(&tn->nametbl_lock); return 0; diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 17652602d5e2..47f72cddfb39 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -88,12 +88,14 @@ struct publication { /** * struct name_table - table containing all existing port name publications * @seq_hlist: name sequence hash lists - * @publ_list: pulication lists + * @node_scope: all local publications with node scope + * @cluster_scope: all local publications with cluster scope * @local_publ_count: number of publications issued by this node */ struct name_table { struct hlist_head seq_hlist[TIPC_NAMETBL_SIZE]; - struct list_head publ_list[TIPC_PUBL_SCOPE_NUM]; + struct list_head node_scope; + struct list_head cluster_scope; u32 local_publ_count; }; -- cgit v1.2.3-70-g09d2 From ba765ec63786583e210b55073a908a9d7ea284fa Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Mar 2018 16:48:53 +0100 Subject: tipc: remove zone_list member in struct publication As a further consequence of the previous commits, we can also remove the member 'zone_list 'in struct name_info and struct publication. Instead, we now let the member cluster_list take over the role a container of all publications of a given . We also remove the counters for the size of those lists, since they don't serve any purpose. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/name_table.c | 101 ++++++++++++++------------------------------------ net/tipc/name_table.h | 5 +-- 2 files changed, 30 insertions(+), 76 deletions(-) (limited to 'net') diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 1a3a327018c5..6d7b4c7ff4b7 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -50,24 +50,12 @@ /** * struct name_info - name sequence publication info - * @node_list: circular list of publications made by own node - * @cluster_list: circular list of publications made by own cluster - * @zone_list: circular list of publications made by own zone - * @node_list_size: number of entries in "node_list" - * @cluster_list_size: number of entries in "cluster_list" - * @zone_list_size: number of entries in "zone_list" - * - * Note: The zone list always contains at least one entry, since all - * publications of the associated name sequence belong to it. - * (The cluster and node lists may be empty.) + * @node_list: list of publications on own node of this + * @cluster_list: list of all publications of this */ struct name_info { struct list_head node_list; struct list_head cluster_list; - struct list_head zone_list; - u32 node_list_size; - u32 cluster_list_size; - u32 zone_list_size; }; /** @@ -249,7 +237,7 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net, info = sseq->info; /* Check if an identical publication already exists */ - list_for_each_entry(publ, &info->zone_list, zone_list) { + list_for_each_entry(publ, &info->cluster_list, cluster_list) { if ((publ->ref == port) && (publ->key == key) && (!publ->node || (publ->node == node))) return NULL; @@ -292,7 +280,6 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net, INIT_LIST_HEAD(&info->node_list); INIT_LIST_HEAD(&info->cluster_list); - INIT_LIST_HEAD(&info->zone_list); /* Insert new sub-sequence */ sseq = &nseq->sseqs[inspos]; @@ -311,18 +298,10 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net, if (!publ) return NULL; - list_add(&publ->zone_list, &info->zone_list); - info->zone_list_size++; - - if (in_own_cluster(net, node)) { - list_add(&publ->cluster_list, &info->cluster_list); - info->cluster_list_size++; - } + list_add(&publ->cluster_list, &info->cluster_list); - if (in_own_node(net, node)) { + if (in_own_node(net, node)) list_add(&publ->node_list, &info->node_list); - info->node_list_size++; - } /* Any subscriptions waiting for notification? */ list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { @@ -363,7 +342,7 @@ static struct publication *tipc_nameseq_remove_publ(struct net *net, info = sseq->info; /* Locate publication, if it exists */ - list_for_each_entry(publ, &info->zone_list, zone_list) { + list_for_each_entry(publ, &info->cluster_list, cluster_list) { if ((publ->key == key) && (publ->ref == ref) && (!publ->node || (publ->node == node))) goto found; @@ -371,24 +350,12 @@ static struct publication *tipc_nameseq_remove_publ(struct net *net, return NULL; found: - /* Remove publication from zone scope list */ - list_del(&publ->zone_list); - info->zone_list_size--; - - /* Remove publication from cluster scope list, if present */ - if (in_own_cluster(net, node)) { - list_del(&publ->cluster_list); - info->cluster_list_size--; - } - - /* Remove publication from node scope list, if present */ - if (in_own_node(net, node)) { + list_del(&publ->cluster_list); + if (in_own_node(net, node)) list_del(&publ->node_list); - info->node_list_size--; - } /* Contract subseq list if no more publications for that subseq */ - if (list_empty(&info->zone_list)) { + if (list_empty(&info->cluster_list)) { kfree(info); free = &nseq->sseqs[nseq->first_free--]; memmove(sseq, sseq + 1, (free - (sseq + 1)) * sizeof(*sseq)); @@ -435,7 +402,8 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, struct name_info *info = sseq->info; int must_report = 1; - list_for_each_entry(crs, &info->zone_list, zone_list) { + list_for_each_entry(crs, &info->cluster_list, + cluster_list) { tipc_sub_report_overlap(sub, sseq->lower, sseq->upper, TIPC_PUBLISHED, @@ -559,18 +527,12 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, node_list); list_move_tail(&publ->node_list, &info->node_list); - } else if (!list_empty(&info->cluster_list)) { + } else { publ = list_first_entry(&info->cluster_list, struct publication, cluster_list); list_move_tail(&publ->cluster_list, &info->cluster_list); - } else { - publ = list_first_entry(&info->zone_list, - struct publication, - zone_list); - list_move_tail(&publ->zone_list, - &info->zone_list); } } @@ -581,16 +543,10 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, publ = list_first_entry(&info->node_list, struct publication, node_list); list_move_tail(&publ->node_list, &info->node_list); - } else if (in_own_cluster_exact(net, *destnode)) { - if (list_empty(&info->cluster_list)) - goto no_match; + } else { publ = list_first_entry(&info->cluster_list, struct publication, cluster_list); list_move_tail(&publ->cluster_list, &info->cluster_list); - } else { - publ = list_first_entry(&info->zone_list, struct publication, - zone_list); - list_move_tail(&publ->zone_list, &info->zone_list); } ref = publ->ref; @@ -622,7 +578,7 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, sseq = nameseq_find_subseq(seq, instance); if (likely(sseq)) { info = sseq->info; - list_for_each_entry(publ, &info->zone_list, zone_list) { + list_for_each_entry(publ, &info->cluster_list, cluster_list) { if (publ->scope != scope) continue; if (publ->ref == exclude && publ->node == self) @@ -631,7 +587,8 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, (*dstcnt)++; if (all) continue; - list_move_tail(&publ->zone_list, &info->zone_list); + list_move_tail(&publ->cluster_list, + &info->cluster_list); break; } } @@ -641,15 +598,14 @@ exit: return !list_empty(dsts); } -int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, - u32 scope, bool exact, struct list_head *dports) +void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, + u32 scope, bool exact, struct list_head *dports) { struct sub_seq *sseq_stop; struct name_info *info; struct publication *p; struct name_seq *seq; struct sub_seq *sseq; - int res = 0; rcu_read_lock(); seq = nametbl_find_seq(net, type); @@ -667,14 +623,10 @@ int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, if (p->scope == scope || (!exact && p->scope < scope)) tipc_dest_push(dports, 0, p->ref); } - - if (info->cluster_list_size != info->node_list_size) - res = 1; } spin_unlock_bh(&seq->lock); exit: rcu_read_unlock(); - return res; } /* tipc_nametbl_lookup_dst_nodes - find broadcast destination nodes @@ -699,7 +651,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, stop = seq->sseqs + seq->first_free; for (; sseq != stop && sseq->lower <= upper; sseq++) { info = sseq->info; - list_for_each_entry(publ, &info->zone_list, zone_list) { + list_for_each_entry(publ, &info->cluster_list, cluster_list) { tipc_nlist_add(nodes, publ->node); } } @@ -728,7 +680,7 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, stop = seq->sseqs + seq->first_free; for (; sseq != stop; sseq++) { info = sseq->info; - list_for_each_entry(p, &info->zone_list, zone_list) { + list_for_each_entry(p, &info->cluster_list, cluster_list) { if (p->scope != scope) continue; tipc_group_add_member(grp, p->node, p->ref, p->lower); @@ -899,7 +851,8 @@ static void tipc_purge_publications(struct net *net, struct name_seq *seq) spin_lock_bh(&seq->lock); sseq = seq->sseqs; info = sseq->info; - list_for_each_entry_safe(publ, safe, &info->zone_list, zone_list) { + list_for_each_entry_safe(publ, safe, &info->cluster_list, + cluster_list) { tipc_nameseq_remove_publ(net, seq, publ->lower, publ->node, publ->ref, publ->key); kfree_rcu(publ, rcu); @@ -948,17 +901,19 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, struct publication *p; if (*last_publ) { - list_for_each_entry(p, &sseq->info->zone_list, zone_list) + list_for_each_entry(p, &sseq->info->cluster_list, + cluster_list) if (p->key == *last_publ) break; if (p->key != *last_publ) return -EPIPE; } else { - p = list_first_entry(&sseq->info->zone_list, struct publication, - zone_list); + p = list_first_entry(&sseq->info->cluster_list, + struct publication, + cluster_list); } - list_for_each_entry_from(p, &sseq->info->zone_list, zone_list) { + list_for_each_entry_from(p, &sseq->info->cluster_list, cluster_list) { *last_publ = p->key; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 47f72cddfb39..a9063e25ee74 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -81,7 +81,6 @@ struct publication { struct list_head pport_list; struct list_head node_list; struct list_head cluster_list; - struct list_head zone_list; struct rcu_head rcu; }; @@ -102,8 +101,8 @@ struct name_table { int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); -int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, - u32 scope, bool exact, struct list_head *dports); +void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, + u32 scope, bool exact, struct list_head *dports); void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, u32 type, u32 domain); void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, -- cgit v1.2.3-70-g09d2 From 935439cc48ef24f0e50396be3684a0f27e609363 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Mar 2018 16:48:54 +0100 Subject: tipc: merge two lists in struct publication The size of struct publication can be reduced further. Membership in lists 'nodesub_list' and 'local_list' is mutually exlusive, in that remote publications use the former and local publications the latter. We replace the two lists with one single, named 'binding_node' which reflects what it really is. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/name_distr.c | 20 ++++++++++---------- net/tipc/name_table.h | 5 ++--- 2 files changed, 12 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 11ce20505550..4c54fb37875a 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -91,10 +91,10 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) struct sk_buff *skb; if (publ->scope == TIPC_NODE_SCOPE) { - list_add_tail_rcu(&publ->local_list, &nt->node_scope); + list_add_tail_rcu(&publ->binding_node, &nt->node_scope); return NULL; } - list_add_tail_rcu(&publ->local_list, &nt->cluster_scope); + list_add_tail_rcu(&publ->binding_node, &nt->cluster_scope); skb = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0); if (!skb) { @@ -115,7 +115,7 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) struct sk_buff *buf; struct distr_item *item; - list_del(&publ->local_list); + list_del(&publ->binding_node); if (publ->scope == TIPC_NODE_SCOPE) return NULL; @@ -147,7 +147,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; - list_for_each_entry(publ, pls, local_list) { + list_for_each_entry(publ, pls, binding_node) { /* Prepare next buffer: */ if (!skb) { skb = named_prepare_buf(net, PUBLICATION, msg_rem, @@ -211,7 +211,7 @@ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr) p = tipc_nametbl_remove_publ(net, publ->type, publ->lower, publ->node, publ->ref, publ->key); if (p) - tipc_node_unsubscribe(net, &p->nodesub_list, addr); + tipc_node_unsubscribe(net, &p->binding_node, addr); spin_unlock_bh(&tn->nametbl_lock); if (p != publ) { @@ -246,7 +246,7 @@ void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr) { struct publication *publ, *tmp; - list_for_each_entry_safe(publ, tmp, nsub_list, nodesub_list) + list_for_each_entry_safe(publ, tmp, nsub_list, binding_node) tipc_publ_purge(net, publ, addr); tipc_dist_queue_purge(net, addr); } @@ -270,7 +270,7 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, TIPC_CLUSTER_SCOPE, node, ntohl(i->ref), ntohl(i->key)); if (publ) { - tipc_node_subscribe(net, &publ->nodesub_list, node); + tipc_node_subscribe(net, &publ->binding_node, node); return true; } } else if (dtype == WITHDRAWAL) { @@ -279,7 +279,7 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, node, ntohl(i->ref), ntohl(i->key)); if (publ) { - tipc_node_unsubscribe(net, &publ->nodesub_list, node); + tipc_node_unsubscribe(net, &publ->binding_node, node); kfree_rcu(publ, rcu); return true; } @@ -385,9 +385,9 @@ void tipc_named_reinit(struct net *net) spin_lock_bh(&tn->nametbl_lock); - list_for_each_entry_rcu(publ, &nt->node_scope, local_list) + list_for_each_entry_rcu(publ, &nt->node_scope, binding_node) publ->node = tn->own_addr; - list_for_each_entry_rcu(publ, &nt->cluster_scope, local_list) + list_for_each_entry_rcu(publ, &nt->cluster_scope, binding_node) publ->node = tn->own_addr; spin_unlock_bh(&tn->nametbl_lock); diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index a9063e25ee74..cb16bd883565 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -1,7 +1,7 @@ /* * net/tipc/name_table.h: Include file for TIPC name table code * - * Copyright (c) 2000-2006, 2014-2015, Ericsson AB + * Copyright (c) 2000-2006, 2014-2018, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -76,8 +76,7 @@ struct publication { u32 node; u32 ref; u32 key; - struct list_head nodesub_list; - struct list_head local_list; + struct list_head binding_node; struct list_head pport_list; struct list_head node_list; struct list_head cluster_list; -- cgit v1.2.3-70-g09d2 From e50e73e10757ac86fcb1aaa986055049e060727a Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 15 Mar 2018 16:48:55 +0100 Subject: tipc: some name changes We rename some lists and fields in struct publication both to make the naming more consistent and to better reflect their roles. We also update the descriptions of those lists. node_list -> local_publ cluster_list -> all_publ pport_list -> binding_sock ref -> port There are no functional changes in this commit. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/name_distr.c | 12 ++--- net/tipc/name_distr.h | 2 +- net/tipc/name_table.c | 143 ++++++++++++++++++++++++-------------------------- net/tipc/name_table.h | 38 ++++++++------ net/tipc/socket.c | 14 ++--- 5 files changed, 106 insertions(+), 103 deletions(-) (limited to 'net') diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 4c54fb37875a..28d095a7d8bb 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -56,7 +56,7 @@ static void publ_to_item(struct distr_item *i, struct publication *p) i->type = htonl(p->type); i->lower = htonl(p->lower); i->upper = htonl(p->upper); - i->ref = htonl(p->ref); + i->port = htonl(p->port); i->key = htonl(p->key); } @@ -209,15 +209,15 @@ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr) spin_lock_bh(&tn->nametbl_lock); p = tipc_nametbl_remove_publ(net, publ->type, publ->lower, - publ->node, publ->ref, publ->key); + publ->node, publ->port, publ->key); if (p) tipc_node_unsubscribe(net, &p->binding_node, addr); spin_unlock_bh(&tn->nametbl_lock); if (p != publ) { pr_err("Unable to remove publication from failed node\n" - " (type=%u, lower=%u, node=0x%x, ref=%u, key=%u)\n", - publ->type, publ->lower, publ->node, publ->ref, + " (type=%u, lower=%u, node=0x%x, port=%u, key=%u)\n", + publ->type, publ->lower, publ->node, publ->port, publ->key); } @@ -268,7 +268,7 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, ntohl(i->lower), ntohl(i->upper), TIPC_CLUSTER_SCOPE, node, - ntohl(i->ref), ntohl(i->key)); + ntohl(i->port), ntohl(i->key)); if (publ) { tipc_node_subscribe(net, &publ->binding_node, node); return true; @@ -276,7 +276,7 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, } else if (dtype == WITHDRAWAL) { publ = tipc_nametbl_remove_publ(net, ntohl(i->type), ntohl(i->lower), - node, ntohl(i->ref), + node, ntohl(i->port), ntohl(i->key)); if (publ) { tipc_node_unsubscribe(net, &publ->binding_node, node); diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index 1264ba0af937..4753e628d7c4 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -63,7 +63,7 @@ struct distr_item { __be32 type; __be32 lower; __be32 upper; - __be32 ref; + __be32 port; __be32 key; }; diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 6d7b4c7ff4b7..bbbfc0702634 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -1,7 +1,7 @@ /* * net/tipc/name_table.c: TIPC name table code * - * Copyright (c) 2000-2006, 2014-2015, Ericsson AB + * Copyright (c) 2000-2006, 2014-2018, Ericsson AB * Copyright (c) 2004-2008, 2010-2014, Wind River Systems * All rights reserved. * @@ -51,11 +51,11 @@ /** * struct name_info - name sequence publication info * @node_list: list of publications on own node of this - * @cluster_list: list of all publications of this + * @all_publ: list of all publications of this */ struct name_info { - struct list_head node_list; - struct list_head cluster_list; + struct list_head local_publ; + struct list_head all_publ; }; /** @@ -102,7 +102,7 @@ static int hash(int x) * publ_create - create a publication structure */ static struct publication *publ_create(u32 type, u32 lower, u32 upper, - u32 scope, u32 node, u32 port_ref, + u32 scope, u32 node, u32 port, u32 key) { struct publication *publ = kzalloc(sizeof(*publ), GFP_ATOMIC); @@ -116,9 +116,9 @@ static struct publication *publ_create(u32 type, u32 lower, u32 upper, publ->upper = upper; publ->scope = scope; publ->node = node; - publ->ref = port_ref; + publ->port = port; publ->key = key; - INIT_LIST_HEAD(&publ->pport_list); + INIT_LIST_HEAD(&publ->binding_sock); return publ; } @@ -237,9 +237,9 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net, info = sseq->info; /* Check if an identical publication already exists */ - list_for_each_entry(publ, &info->cluster_list, cluster_list) { - if ((publ->ref == port) && (publ->key == key) && - (!publ->node || (publ->node == node))) + list_for_each_entry(publ, &info->all_publ, all_publ) { + if (publ->port == port && publ->key == key && + (!publ->node || publ->node == node)) return NULL; } } else { @@ -278,8 +278,8 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net, return NULL; } - INIT_LIST_HEAD(&info->node_list); - INIT_LIST_HEAD(&info->cluster_list); + INIT_LIST_HEAD(&info->local_publ); + INIT_LIST_HEAD(&info->all_publ); /* Insert new sub-sequence */ sseq = &nseq->sseqs[inspos]; @@ -298,15 +298,15 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net, if (!publ) return NULL; - list_add(&publ->cluster_list, &info->cluster_list); + list_add(&publ->all_publ, &info->all_publ); if (in_own_node(net, node)) - list_add(&publ->node_list, &info->node_list); + list_add(&publ->local_publ, &info->local_publ); /* Any subscriptions waiting for notification? */ list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { tipc_sub_report_overlap(s, publ->lower, publ->upper, - TIPC_PUBLISHED, publ->ref, + TIPC_PUBLISHED, publ->port, publ->node, publ->scope, created_subseq); } @@ -327,7 +327,7 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net, static struct publication *tipc_nameseq_remove_publ(struct net *net, struct name_seq *nseq, u32 inst, u32 node, - u32 ref, u32 key) + u32 port, u32 key) { struct publication *publ; struct sub_seq *sseq = nameseq_find_subseq(nseq, inst); @@ -342,20 +342,20 @@ static struct publication *tipc_nameseq_remove_publ(struct net *net, info = sseq->info; /* Locate publication, if it exists */ - list_for_each_entry(publ, &info->cluster_list, cluster_list) { - if ((publ->key == key) && (publ->ref == ref) && - (!publ->node || (publ->node == node))) + list_for_each_entry(publ, &info->all_publ, all_publ) { + if (publ->key == key && publ->port == port && + (!publ->node || publ->node == node)) goto found; } return NULL; found: - list_del(&publ->cluster_list); + list_del(&publ->all_publ); if (in_own_node(net, node)) - list_del(&publ->node_list); + list_del(&publ->local_publ); /* Contract subseq list if no more publications for that subseq */ - if (list_empty(&info->cluster_list)) { + if (list_empty(&info->all_publ)) { kfree(info); free = &nseq->sseqs[nseq->first_free--]; memmove(sseq, sseq + 1, (free - (sseq + 1)) * sizeof(*sseq)); @@ -365,8 +365,9 @@ found: /* Notify any waiting subscriptions */ list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { tipc_sub_report_overlap(s, publ->lower, publ->upper, - TIPC_WITHDRAWN, publ->ref, publ->node, - publ->scope, removed_subseq); + TIPC_WITHDRAWN, publ->port, + publ->node, publ->scope, + removed_subseq); } return publ; @@ -402,12 +403,12 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, struct name_info *info = sseq->info; int must_report = 1; - list_for_each_entry(crs, &info->cluster_list, - cluster_list) { + list_for_each_entry(crs, &info->all_publ, all_publ) { tipc_sub_report_overlap(sub, sseq->lower, sseq->upper, TIPC_PUBLISHED, - crs->ref, crs->node, + crs->port, + crs->node, crs->scope, must_report); must_report = 0; @@ -460,7 +461,7 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type, } struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, - u32 lower, u32 node, u32 ref, + u32 lower, u32 node, u32 port, u32 key) { struct publication *publ; @@ -470,7 +471,7 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, return NULL; spin_lock_bh(&seq->lock); - publ = tipc_nameseq_remove_publ(net, seq, lower, node, ref, key); + publ = tipc_nameseq_remove_publ(net, seq, lower, node, port, key); if (!seq->first_free && list_empty(&seq->subscriptions)) { hlist_del_init_rcu(&seq->ns_list); kfree(seq->sseqs); @@ -503,7 +504,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, struct name_info *info; struct publication *publ; struct name_seq *seq; - u32 ref = 0; + u32 port = 0; u32 node = 0; if (!tipc_in_scope(*destnode, tn->own_addr)) @@ -521,42 +522,42 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, /* Closest-First Algorithm */ if (likely(!*destnode)) { - if (!list_empty(&info->node_list)) { - publ = list_first_entry(&info->node_list, + if (!list_empty(&info->local_publ)) { + publ = list_first_entry(&info->local_publ, struct publication, - node_list); - list_move_tail(&publ->node_list, - &info->node_list); + local_publ); + list_move_tail(&publ->local_publ, + &info->local_publ); } else { - publ = list_first_entry(&info->cluster_list, + publ = list_first_entry(&info->all_publ, struct publication, - cluster_list); - list_move_tail(&publ->cluster_list, - &info->cluster_list); + all_publ); + list_move_tail(&publ->all_publ, + &info->all_publ); } } /* Round-Robin Algorithm */ else if (*destnode == tn->own_addr) { - if (list_empty(&info->node_list)) + if (list_empty(&info->local_publ)) goto no_match; - publ = list_first_entry(&info->node_list, struct publication, - node_list); - list_move_tail(&publ->node_list, &info->node_list); + publ = list_first_entry(&info->local_publ, struct publication, + local_publ); + list_move_tail(&publ->local_publ, &info->local_publ); } else { - publ = list_first_entry(&info->cluster_list, struct publication, - cluster_list); - list_move_tail(&publ->cluster_list, &info->cluster_list); + publ = list_first_entry(&info->all_publ, struct publication, + all_publ); + list_move_tail(&publ->all_publ, &info->all_publ); } - ref = publ->ref; + port = publ->port; node = publ->node; no_match: spin_unlock_bh(&seq->lock); not_found: rcu_read_unlock(); *destnode = node; - return ref; + return port; } bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, @@ -578,17 +579,16 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, sseq = nameseq_find_subseq(seq, instance); if (likely(sseq)) { info = sseq->info; - list_for_each_entry(publ, &info->cluster_list, cluster_list) { + list_for_each_entry(publ, &info->all_publ, all_publ) { if (publ->scope != scope) continue; - if (publ->ref == exclude && publ->node == self) + if (publ->port == exclude && publ->node == self) continue; - tipc_dest_push(dsts, publ->node, publ->ref); + tipc_dest_push(dsts, publ->node, publ->port); (*dstcnt)++; if (all) continue; - list_move_tail(&publ->cluster_list, - &info->cluster_list); + list_move_tail(&publ->all_publ, &info->all_publ); break; } } @@ -619,9 +619,9 @@ void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, if (sseq->lower > upper) break; info = sseq->info; - list_for_each_entry(p, &info->node_list, node_list) { + list_for_each_entry(p, &info->local_publ, local_publ) { if (p->scope == scope || (!exact && p->scope < scope)) - tipc_dest_push(dports, 0, p->ref); + tipc_dest_push(dports, 0, p->port); } } spin_unlock_bh(&seq->lock); @@ -651,7 +651,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, stop = seq->sseqs + seq->first_free; for (; sseq != stop && sseq->lower <= upper; sseq++) { info = sseq->info; - list_for_each_entry(publ, &info->cluster_list, cluster_list) { + list_for_each_entry(publ, &info->all_publ, all_publ) { tipc_nlist_add(nodes, publ->node); } } @@ -680,10 +680,10 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, stop = seq->sseqs + seq->first_free; for (; sseq != stop; sseq++) { info = sseq->info; - list_for_each_entry(p, &info->cluster_list, cluster_list) { + list_for_each_entry(p, &info->all_publ, all_publ) { if (p->scope != scope) continue; - tipc_group_add_member(grp, p->node, p->ref, p->lower); + tipc_group_add_member(grp, p->node, p->port, p->lower); } } spin_unlock_bh(&seq->lock); @@ -728,7 +728,7 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, /** * tipc_nametbl_withdraw - withdraw name publication from network name tables */ -int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, +int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 port, u32 key) { struct publication *publ; @@ -737,18 +737,18 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, spin_lock_bh(&tn->nametbl_lock); publ = tipc_nametbl_remove_publ(net, type, lower, tn->own_addr, - ref, key); + port, key); if (likely(publ)) { tn->nametbl->local_publ_count--; skb = tipc_named_withdraw(net, publ); /* Any pending external events? */ tipc_named_process_backlog(net); - list_del_init(&publ->pport_list); + list_del_init(&publ->binding_sock); kfree_rcu(publ, rcu); } else { pr_err("Unable to remove local publication\n" - "(type=%u, lower=%u, ref=%u, key=%u)\n", - type, lower, ref, key); + "(type=%u, lower=%u, port=%u, key=%u)\n", + type, lower, port, key); } spin_unlock_bh(&tn->nametbl_lock); @@ -851,10 +851,9 @@ static void tipc_purge_publications(struct net *net, struct name_seq *seq) spin_lock_bh(&seq->lock); sseq = seq->sseqs; info = sseq->info; - list_for_each_entry_safe(publ, safe, &info->cluster_list, - cluster_list) { + list_for_each_entry_safe(publ, safe, &info->all_publ, all_publ) { tipc_nameseq_remove_publ(net, seq, publ->lower, publ->node, - publ->ref, publ->key); + publ->port, publ->key); kfree_rcu(publ, rcu); } hlist_del_init_rcu(&seq->ns_list); @@ -901,19 +900,17 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, struct publication *p; if (*last_publ) { - list_for_each_entry(p, &sseq->info->cluster_list, - cluster_list) + list_for_each_entry(p, &sseq->info->all_publ, all_publ) if (p->key == *last_publ) break; if (p->key != *last_publ) return -EPIPE; } else { - p = list_first_entry(&sseq->info->cluster_list, - struct publication, - cluster_list); + p = list_first_entry(&sseq->info->all_publ, struct publication, + all_publ); } - list_for_each_entry_from(p, &sseq->info->cluster_list, cluster_list) { + list_for_each_entry_from(p, &sseq->info->all_publ, all_publ) { *last_publ = p->key; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, @@ -940,7 +937,7 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->node)) goto publ_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->ref)) + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->port)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key)) goto publ_msg_full; diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index cb16bd883565..34a4ccb907aa 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -54,19 +54,22 @@ struct tipc_group; * @type: name sequence type * @lower: name sequence lower bound * @upper: name sequence upper bound - * @scope: scope of publication - * @node: network address of publishing port's node - * @ref: publishing port - * @key: publication key - * @nodesub_list: subscription to "node down" event (off-node publication only) - * @local_list: adjacent entries in list of publications made by this node - * @pport_list: adjacent entries in list of publications made by this port - * @node_list: adjacent matching name seq publications with >= node scope - * @cluster_list: adjacent matching name seq publications with >= cluster scope - * @zone_list: adjacent matching name seq publications with >= zone scope + * @scope: scope of publication, TIPC_NODE_SCOPE or TIPC_CLUSTER_SCOPE + * @node: network address of publishing socket's node + * @port: publishing port + * @key: publication key, unique across the cluster + * @binding_node: all publications from the same node which bound this one + * - Remote publications: in node->publ_list + * Used by node/name distr to withdraw publications when node is lost + * - Local/node scope publications: in name_table->node_scope list + * - Local/cluster scope publications: in name_table->cluster_scope list + * @binding_sock: all publications from the same socket which bound this one + * Used by socket to withdraw publications when socket is unbound/released + * @local_publ: list of identical publications made from this node + * Used by closest_first and multicast receive lookup algorithms + * @all_publ: all publications identical to this one, whatever node and scope + * Used by round-robin lookup algorithm * @rcu: RCU callback head used for deferred freeing - * - * Note that the node list, cluster list, and zone list are circular lists. */ struct publication { u32 type; @@ -74,12 +77,12 @@ struct publication { u32 upper; u32 scope; u32 node; - u32 ref; + u32 port; u32 key; struct list_head binding_node; - struct list_head pport_list; - struct list_head node_list; - struct list_head cluster_list; + struct list_head binding_sock; + struct list_head local_publ; + struct list_head all_publ; struct rcu_head rcu; }; @@ -87,7 +90,10 @@ struct publication { * struct name_table - table containing all existing port name publications * @seq_hlist: name sequence hash lists * @node_scope: all local publications with node scope + * - used by name_distr during re-init of name table * @cluster_scope: all local publications with cluster scope + * - used by name_distr to send bulk updates to new nodes + * - used by name_distr during re-init of name table * @local_publ_count: number of publications issued by this node */ struct name_table { diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 910d3827f499..a4a9148d4629 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2605,7 +2605,7 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, if (unlikely(!publ)) return -EINVAL; - list_add(&publ->pport_list, &tsk->publications); + list_add(&publ->binding_sock, &tsk->publications); tsk->pub_count++; tsk->published = 1; return 0; @@ -2622,7 +2622,7 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, if (scope != TIPC_NODE_SCOPE) scope = TIPC_CLUSTER_SCOPE; - list_for_each_entry_safe(publ, safe, &tsk->publications, pport_list) { + list_for_each_entry_safe(publ, safe, &tsk->publications, binding_sock) { if (seq) { if (publ->scope != scope) continue; @@ -2633,12 +2633,12 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, if (publ->upper != seq->upper) break; tipc_nametbl_withdraw(net, publ->type, publ->lower, - publ->ref, publ->key); + publ->port, publ->key); rc = 0; break; } tipc_nametbl_withdraw(net, publ->type, publ->lower, - publ->ref, publ->key); + publ->port, publ->key); rc = 0; } if (list_empty(&tsk->publications)) @@ -3292,7 +3292,7 @@ static int __tipc_nl_list_sk_publ(struct sk_buff *skb, struct publication *p; if (*last_publ) { - list_for_each_entry(p, &tsk->publications, pport_list) { + list_for_each_entry(p, &tsk->publications, binding_sock) { if (p->key == *last_publ) break; } @@ -3309,10 +3309,10 @@ static int __tipc_nl_list_sk_publ(struct sk_buff *skb, } } else { p = list_first_entry(&tsk->publications, struct publication, - pport_list); + binding_sock); } - list_for_each_entry_from(p, &tsk->publications, pport_list) { + list_for_each_entry_from(p, &tsk->publications, binding_sock) { err = __tipc_nl_add_sk_publ(skb, cb, p); if (err) { *last_publ = p->key; -- cgit v1.2.3-70-g09d2 From 53d0e83f9329aa51dcc205b514dbee05cb4df309 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Mar 2018 03:54:26 -0700 Subject: rds: tcp: must use spin_lock_irq* and not spin_lock_bh with rds_tcp_conn_lock rds_tcp_connection allocation/free management has the potential to be called from __rds_conn_create after IRQs have been disabled, so spin_[un]lock_bh cannot be used with rds_tcp_conn_lock. Bottom-halves that need to synchronize for critical sections protected by rds_tcp_conn_lock should instead use rds_destroy_pending() correctly. Reported-by: syzbot+c68e51bb5e699d3f8d91@syzkaller.appspotmail.com Fixes: ebeeb1ad9b8a ("rds: tcp: use rds_destroy_pending() to synchronize netns/module teardown and rds connection/workq management") Signed-off-by: Sowmini Varadhan Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/tcp.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index eb04e7fa2467..08ea9cd5c2f6 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -272,13 +272,14 @@ static int rds_tcp_laddr_check(struct net *net, __be32 addr) static void rds_tcp_conn_free(void *arg) { struct rds_tcp_connection *tc = arg; + unsigned long flags; rdsdebug("freeing tc %p\n", tc); - spin_lock_bh(&rds_tcp_conn_lock); + spin_lock_irqsave(&rds_tcp_conn_lock, flags); if (!tc->t_tcp_node_detached) list_del(&tc->t_tcp_node); - spin_unlock_bh(&rds_tcp_conn_lock); + spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); kmem_cache_free(rds_tcp_conn_slab, tc); } @@ -308,13 +309,13 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) rdsdebug("rds_conn_path [%d] tc %p\n", i, conn->c_path[i].cp_transport_data); } - spin_lock_bh(&rds_tcp_conn_lock); + spin_lock_irq(&rds_tcp_conn_lock); for (i = 0; i < RDS_MPATH_WORKERS; i++) { tc = conn->c_path[i].cp_transport_data; tc->t_tcp_node_detached = false; list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list); } - spin_unlock_bh(&rds_tcp_conn_lock); + spin_unlock_irq(&rds_tcp_conn_lock); fail: if (ret) { for (j = 0; j < i; j++) @@ -527,7 +528,7 @@ static void rds_tcp_kill_sock(struct net *net) rtn->rds_tcp_listen_sock = NULL; rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); - spin_lock_bh(&rds_tcp_conn_lock); + spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); @@ -540,7 +541,7 @@ static void rds_tcp_kill_sock(struct net *net) tc->t_tcp_node_detached = true; } } - spin_unlock_bh(&rds_tcp_conn_lock); + spin_unlock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) rds_conn_destroy(tc->t_cpath->cp_conn); } @@ -588,7 +589,7 @@ static void rds_tcp_sysctl_reset(struct net *net) { struct rds_tcp_connection *tc, *_tc; - spin_lock_bh(&rds_tcp_conn_lock); + spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); @@ -598,7 +599,7 @@ static void rds_tcp_sysctl_reset(struct net *net) /* reconnect with new parameters */ rds_conn_path_drop(tc->t_cpath, false); } - spin_unlock_bh(&rds_tcp_conn_lock); + spin_unlock_irq(&rds_tcp_conn_lock); } static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, -- cgit v1.2.3-70-g09d2 From d47d08c8ca052df3d9fde7cfff518660335b16e7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 16 Mar 2018 23:32:51 +0000 Subject: sctp: use proc_remove_subtree() use proc_remove_subtree() for subtree removal, both on setup failure halfway through and on teardown. No need to make simple things complex... Signed-off-by: Al Viro Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 11 +----- net/sctp/objcnt.c | 8 ----- net/sctp/proc.c | 90 ++++++++++++------------------------------------- net/sctp/protocol.c | 59 ++++---------------------------- 4 files changed, 28 insertions(+), 140 deletions(-) (limited to 'net') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index f7ae6b0a21d0..72c5b8fc3232 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -180,14 +180,7 @@ struct sctp_transport *sctp_epaddr_lookup_transport( /* * sctp/proc.c */ -int sctp_snmp_proc_init(struct net *net); -void sctp_snmp_proc_exit(struct net *net); -int sctp_eps_proc_init(struct net *net); -void sctp_eps_proc_exit(struct net *net); -int sctp_assocs_proc_init(struct net *net); -void sctp_assocs_proc_exit(struct net *net); -int sctp_remaddr_proc_init(struct net *net); -void sctp_remaddr_proc_exit(struct net *net); +int __net_init sctp_proc_init(struct net *net); /* * sctp/offload.c @@ -318,7 +311,6 @@ atomic_t sctp_dbg_objcnt_## name = ATOMIC_INIT(0) {.label= #name, .counter= &sctp_dbg_objcnt_## name} void sctp_dbg_objcnt_init(struct net *); -void sctp_dbg_objcnt_exit(struct net *); #else @@ -326,7 +318,6 @@ void sctp_dbg_objcnt_exit(struct net *); #define SCTP_DBG_OBJCNT_DEC(name) static inline void sctp_dbg_objcnt_init(struct net *net) { return; } -static inline void sctp_dbg_objcnt_exit(struct net *net) { return; } #endif /* CONFIG_SCTP_DBG_OBJCOUNT */ diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c index aeea6da81441..fd2684ad94c8 100644 --- a/net/sctp/objcnt.c +++ b/net/sctp/objcnt.c @@ -130,11 +130,3 @@ void sctp_dbg_objcnt_init(struct net *net) if (!ent) pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n"); } - -/* Cleanup the objcount entry in the proc filesystem. */ -void sctp_dbg_objcnt_exit(struct net *net) -{ - remove_proc_entry("sctp_dbg_objcnt", net->sctp.proc_net_sctp); -} - - diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 537545ebcb0e..17d0155d9de3 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -101,25 +101,6 @@ static const struct file_operations sctp_snmp_seq_fops = { .release = single_release_net, }; -/* Set up the proc fs entry for 'snmp' object. */ -int __net_init sctp_snmp_proc_init(struct net *net) -{ - struct proc_dir_entry *p; - - p = proc_create("snmp", S_IRUGO, net->sctp.proc_net_sctp, - &sctp_snmp_seq_fops); - if (!p) - return -ENOMEM; - - return 0; -} - -/* Cleanup the proc fs entry for 'snmp' object. */ -void sctp_snmp_proc_exit(struct net *net) -{ - remove_proc_entry("snmp", net->sctp.proc_net_sctp); -} - /* Dump local addresses of an association/endpoint. */ static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb) { @@ -259,25 +240,6 @@ static const struct file_operations sctp_eps_seq_fops = { .release = seq_release_net, }; -/* Set up the proc fs entry for 'eps' object. */ -int __net_init sctp_eps_proc_init(struct net *net) -{ - struct proc_dir_entry *p; - - p = proc_create("eps", S_IRUGO, net->sctp.proc_net_sctp, - &sctp_eps_seq_fops); - if (!p) - return -ENOMEM; - - return 0; -} - -/* Cleanup the proc fs entry for 'eps' object. */ -void sctp_eps_proc_exit(struct net *net) -{ - remove_proc_entry("eps", net->sctp.proc_net_sctp); -} - struct sctp_ht_iter { struct seq_net_private p; struct rhashtable_iter hti; @@ -390,25 +352,6 @@ static const struct file_operations sctp_assocs_seq_fops = { .release = seq_release_net, }; -/* Set up the proc fs entry for 'assocs' object. */ -int __net_init sctp_assocs_proc_init(struct net *net) -{ - struct proc_dir_entry *p; - - p = proc_create("assocs", S_IRUGO, net->sctp.proc_net_sctp, - &sctp_assocs_seq_fops); - if (!p) - return -ENOMEM; - - return 0; -} - -/* Cleanup the proc fs entry for 'assocs' object. */ -void sctp_assocs_proc_exit(struct net *net) -{ - remove_proc_entry("assocs", net->sctp.proc_net_sctp); -} - static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) { struct sctp_association *assoc; @@ -488,12 +431,6 @@ static const struct seq_operations sctp_remaddr_ops = { .show = sctp_remaddr_seq_show, }; -/* Cleanup the proc fs entry for 'remaddr' object. */ -void sctp_remaddr_proc_exit(struct net *net) -{ - remove_proc_entry("remaddr", net->sctp.proc_net_sctp); -} - static int sctp_remaddr_seq_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &sctp_remaddr_ops, @@ -507,13 +444,28 @@ static const struct file_operations sctp_remaddr_seq_fops = { .release = seq_release_net, }; -int __net_init sctp_remaddr_proc_init(struct net *net) +/* Set up the proc fs entry for the SCTP protocol. */ +int __net_init sctp_proc_init(struct net *net) { - struct proc_dir_entry *p; - - p = proc_create("remaddr", S_IRUGO, net->sctp.proc_net_sctp, - &sctp_remaddr_seq_fops); - if (!p) + net->sctp.proc_net_sctp = proc_net_mkdir(net, "sctp", net->proc_net); + if (!net->sctp.proc_net_sctp) return -ENOMEM; + if (!proc_create("snmp", S_IRUGO, net->sctp.proc_net_sctp, + &sctp_snmp_seq_fops)) + goto cleanup; + if (!proc_create("eps", S_IRUGO, net->sctp.proc_net_sctp, + &sctp_eps_seq_fops)) + goto cleanup; + if (!proc_create("assocs", S_IRUGO, net->sctp.proc_net_sctp, + &sctp_assocs_seq_fops)) + goto cleanup; + if (!proc_create("remaddr", S_IRUGO, net->sctp.proc_net_sctp, + &sctp_remaddr_seq_fops)) + goto cleanup; return 0; + +cleanup: + remove_proc_subtree("sctp", net->proc_net); + net->sctp.proc_net_sctp = NULL; + return -ENOMEM; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 606361ee9e4a..493b817f6a2a 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -80,56 +80,6 @@ long sysctl_sctp_mem[3]; int sysctl_sctp_rmem[3]; int sysctl_sctp_wmem[3]; -/* Set up the proc fs entry for the SCTP protocol. */ -static int __net_init sctp_proc_init(struct net *net) -{ -#ifdef CONFIG_PROC_FS - net->sctp.proc_net_sctp = proc_net_mkdir(net, "sctp", net->proc_net); - if (!net->sctp.proc_net_sctp) - goto out_proc_net_sctp; - if (sctp_snmp_proc_init(net)) - goto out_snmp_proc_init; - if (sctp_eps_proc_init(net)) - goto out_eps_proc_init; - if (sctp_assocs_proc_init(net)) - goto out_assocs_proc_init; - if (sctp_remaddr_proc_init(net)) - goto out_remaddr_proc_init; - - return 0; - -out_remaddr_proc_init: - sctp_assocs_proc_exit(net); -out_assocs_proc_init: - sctp_eps_proc_exit(net); -out_eps_proc_init: - sctp_snmp_proc_exit(net); -out_snmp_proc_init: - remove_proc_entry("sctp", net->proc_net); - net->sctp.proc_net_sctp = NULL; -out_proc_net_sctp: - return -ENOMEM; -#endif /* CONFIG_PROC_FS */ - return 0; -} - -/* Clean up the proc fs entry for the SCTP protocol. - * Note: Do not make this __exit as it is used in the init error - * path. - */ -static void sctp_proc_exit(struct net *net) -{ -#ifdef CONFIG_PROC_FS - sctp_snmp_proc_exit(net); - sctp_eps_proc_exit(net); - sctp_assocs_proc_exit(net); - sctp_remaddr_proc_exit(net); - - remove_proc_entry("sctp", net->proc_net); - net->sctp.proc_net_sctp = NULL; -#endif -} - /* Private helper to extract ipv4 address and stash them in * the protocol structure. */ @@ -1285,10 +1235,12 @@ static int __net_init sctp_defaults_init(struct net *net) if (status) goto err_init_mibs; +#ifdef CONFIG_PROC_FS /* Initialize proc fs directory. */ status = sctp_proc_init(net); if (status) goto err_init_proc; +#endif sctp_dbg_objcnt_init(net); @@ -1320,9 +1272,10 @@ static void __net_exit sctp_defaults_exit(struct net *net) sctp_free_addr_wq(net); sctp_free_local_addr_list(net); - sctp_dbg_objcnt_exit(net); - - sctp_proc_exit(net); +#ifdef CONFIG_PROC_FS + remove_proc_subtree("sctp", net->proc_net); + net->sctp.proc_net_sctp = NULL; +#endif cleanup_sctp_mibs(net); sctp_sysctl_net_unregister(net); } -- cgit v1.2.3-70-g09d2 From 2c3682f0be97a5f57c6c8b40fa154dfc77efb461 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:56:49 -0700 Subject: sock: make static tls function alloc_sg generic sock helper The TLS ULP module builds scatterlists from a sock using page_frag_refill(). This is going to be useful for other ULPs so move it into sock file for more general use. In the process remove useless goto at end of while loop. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/sock.h | 4 ++++ net/core/sock.c | 56 ++++++++++++++++++++++++++++++++++++++++++++ net/tls/tls_sw.c | 69 ++++++------------------------------------------------ 3 files changed, 67 insertions(+), 62 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index b9624581d639..447150c51feb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2141,6 +2141,10 @@ static inline struct page_frag *sk_page_frag(struct sock *sk) bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); +int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, + int *sg_num_elem, unsigned int *sg_size, + int first_coalesce); + /* * Default write policy as shown to user space via poll/select/SIGIO */ diff --git a/net/core/sock.c b/net/core/sock.c index 27f218bba43f..f68dff0d7bc4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2239,6 +2239,62 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) } EXPORT_SYMBOL(sk_page_frag_refill); +int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, + int *sg_num_elem, unsigned int *sg_size, + int first_coalesce) +{ + struct page_frag *pfrag; + unsigned int size = *sg_size; + int num_elem = *sg_num_elem, use = 0, rc = 0; + struct scatterlist *sge; + unsigned int orig_offset; + + len -= size; + pfrag = sk_page_frag(sk); + + while (len > 0) { + if (!sk_page_frag_refill(sk, pfrag)) { + rc = -ENOMEM; + goto out; + } + + use = min_t(int, len, pfrag->size - pfrag->offset); + + if (!sk_wmem_schedule(sk, use)) { + rc = -ENOMEM; + goto out; + } + + sk_mem_charge(sk, use); + size += use; + orig_offset = pfrag->offset; + pfrag->offset += use; + + sge = sg + num_elem - 1; + if (num_elem > first_coalesce && sg_page(sg) == pfrag->page && + sg->offset + sg->length == orig_offset) { + sg->length += use; + } else { + sge++; + sg_unmark_end(sge); + sg_set_page(sge, pfrag->page, use, orig_offset); + get_page(pfrag->page); + ++num_elem; + if (num_elem == MAX_SKB_FRAGS) { + rc = -ENOSPC; + break; + } + } + + len -= use; + } +out: + *sg_size = size; + *sg_num_elem = num_elem; + return rc; +} +EXPORT_SYMBOL(sk_alloc_sg); + static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index f26376e954ae..0fc8a24c6473 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -87,71 +87,16 @@ static void trim_both_sgl(struct sock *sk, int target_size) target_size); } -static int alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size, - int first_coalesce) -{ - struct page_frag *pfrag; - unsigned int size = *sg_size; - int num_elem = *sg_num_elem, use = 0, rc = 0; - struct scatterlist *sge; - unsigned int orig_offset; - - len -= size; - pfrag = sk_page_frag(sk); - - while (len > 0) { - if (!sk_page_frag_refill(sk, pfrag)) { - rc = -ENOMEM; - goto out; - } - - use = min_t(int, len, pfrag->size - pfrag->offset); - - if (!sk_wmem_schedule(sk, use)) { - rc = -ENOMEM; - goto out; - } - - sk_mem_charge(sk, use); - size += use; - orig_offset = pfrag->offset; - pfrag->offset += use; - - sge = sg + num_elem - 1; - if (num_elem > first_coalesce && sg_page(sg) == pfrag->page && - sg->offset + sg->length == orig_offset) { - sg->length += use; - } else { - sge++; - sg_unmark_end(sge); - sg_set_page(sge, pfrag->page, use, orig_offset); - get_page(pfrag->page); - ++num_elem; - if (num_elem == MAX_SKB_FRAGS) { - rc = -ENOSPC; - break; - } - } - - len -= use; - } - goto out; - -out: - *sg_size = size; - *sg_num_elem = num_elem; - return rc; -} - static int alloc_encrypted_sg(struct sock *sk, int len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); int rc = 0; - rc = alloc_sg(sk, len, ctx->sg_encrypted_data, - &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, 0); + rc = sk_alloc_sg(sk, len, + ctx->sg_encrypted_data, + &ctx->sg_encrypted_num_elem, + &ctx->sg_encrypted_size, 0); return rc; } @@ -162,9 +107,9 @@ static int alloc_plaintext_sg(struct sock *sk, int len) struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); int rc = 0; - rc = alloc_sg(sk, len, ctx->sg_plaintext_data, - &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, - tls_ctx->pending_open_record_frags); + rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, + &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, + tls_ctx->pending_open_record_frags); return rc; } -- cgit v1.2.3-70-g09d2 From 312fc2b4c82e96a48cb2d0da2bd4816eb253c499 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:00 -0700 Subject: net: do_tcp_sendpages flag to avoid SKBTX_SHARED_FRAG When calling do_tcp_sendpages() from in kernel and we know the data has no references from user side we can omit SKBTX_SHARED_FRAG flag. This patch adds an internal flag, NO_SKBTX_SHARED_FRAG that can be used to omit setting SKBTX_SHARED_FRAG. The flag is not exposed to userspace because the sendpage call from the splice logic masks out all bits except MSG_MORE. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/linux/socket.h | 1 + net/ipv4/tcp.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/socket.h b/include/linux/socket.h index 1ce1f768a58c..60e01482a9c4 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -287,6 +287,7 @@ struct ucred { #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN +#define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fb350f740f69..f90ec24c2cc8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -994,7 +994,9 @@ new_segment: get_page(page); skb_fill_page_desc(skb, i, page, offset, copy); } - skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; + + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; skb->len += copy; skb->data_len += copy; -- cgit v1.2.3-70-g09d2 From 8c05dbf04b2882c3c0bc43fe7668c720210877f3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:05 -0700 Subject: net: generalize sk_alloc_sg to work with scatterlist rings The current implementation of sk_alloc_sg expects scatterlist to always start at entry 0 and complete at entry MAX_SKB_FRAGS. Future patches will want to support starting at arbitrary offset into scatterlist so add an additional sg_start parameters and then default to the current values in TLS code paths. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/sock.h | 2 +- net/core/sock.c | 27 ++++++++++++++++----------- net/tls/tls_sw.c | 4 ++-- 3 files changed, 19 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 447150c51feb..b7c75e024e37 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2142,7 +2142,7 @@ static inline struct page_frag *sk_page_frag(struct sock *sk) bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size, + int sg_start, int *sg_curr, unsigned int *sg_size, int first_coalesce); /* diff --git a/net/core/sock.c b/net/core/sock.c index f68dff0d7bc4..4f92c2910200 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2240,19 +2240,20 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) EXPORT_SYMBOL(sk_page_frag_refill); int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, - int *sg_num_elem, unsigned int *sg_size, + int sg_start, int *sg_curr_index, unsigned int *sg_curr_size, int first_coalesce) { + int sg_curr = *sg_curr_index, use = 0, rc = 0; + unsigned int size = *sg_curr_size; struct page_frag *pfrag; - unsigned int size = *sg_size; - int num_elem = *sg_num_elem, use = 0, rc = 0; struct scatterlist *sge; - unsigned int orig_offset; len -= size; pfrag = sk_page_frag(sk); while (len > 0) { + unsigned int orig_offset; + if (!sk_page_frag_refill(sk, pfrag)) { rc = -ENOMEM; goto out; @@ -2270,17 +2271,21 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, orig_offset = pfrag->offset; pfrag->offset += use; - sge = sg + num_elem - 1; - if (num_elem > first_coalesce && sg_page(sg) == pfrag->page && + sge = sg + sg_curr - 1; + if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page && sg->offset + sg->length == orig_offset) { sg->length += use; } else { - sge++; + sge = sg + sg_curr; sg_unmark_end(sge); sg_set_page(sge, pfrag->page, use, orig_offset); get_page(pfrag->page); - ++num_elem; - if (num_elem == MAX_SKB_FRAGS) { + sg_curr++; + + if (sg_curr == MAX_SKB_FRAGS) + sg_curr = 0; + + if (sg_curr == sg_start) { rc = -ENOSPC; break; } @@ -2289,8 +2294,8 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg, len -= use; } out: - *sg_size = size; - *sg_num_elem = num_elem; + *sg_curr_size = size; + *sg_curr_index = sg_curr; return rc; } EXPORT_SYMBOL(sk_alloc_sg); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 0fc8a24c6473..057a558ed6d7 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -94,7 +94,7 @@ static int alloc_encrypted_sg(struct sock *sk, int len) int rc = 0; rc = sk_alloc_sg(sk, len, - ctx->sg_encrypted_data, + ctx->sg_encrypted_data, 0, &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, 0); @@ -107,7 +107,7 @@ static int alloc_plaintext_sg(struct sock *sk, int len) struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); int rc = 0; - rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, + rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0, &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size, tls_ctx->pending_open_record_frags); -- cgit v1.2.3-70-g09d2 From 4f738adba30a7cfc006f605707e7aee847ffefa0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:10 -0700 Subject: bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data This implements a BPF ULP layer to allow policy enforcement and monitoring at the socket layer. In order to support this a new program type BPF_PROG_TYPE_SK_MSG is used to run the policy at the sendmsg/sendpage hook. To attach the policy to sockets a sockmap is used with a new program attach type BPF_SK_MSG_VERDICT. Similar to previous sockmap usages when a sock is added to a sockmap, via a map update, if the map contains a BPF_SK_MSG_VERDICT program type attached then the BPF ULP layer is created on the socket and the attached BPF_PROG_TYPE_SK_MSG program is run for every msg in sendmsg case and page/offset in sendpage case. BPF_PROG_TYPE_SK_MSG Semantics/API: BPF_PROG_TYPE_SK_MSG supports only two return codes SK_PASS and SK_DROP. Returning SK_DROP free's the copied data in the sendmsg case and in the sendpage case leaves the data untouched. Both cases return -EACESS to the user. Returning SK_PASS will allow the msg to be sent. In the sendmsg case data is copied into kernel space buffers before running the BPF program. The kernel space buffers are stored in a scatterlist object where each element is a kernel memory buffer. Some effort is made to coalesce data from the sendmsg call here. For example a sendmsg call with many one byte iov entries will likely be pushed into a single entry. The BPF program is run with data pointers (start/end) pointing to the first sg element. In the sendpage case data is not copied. We opt not to copy the data by default here, because the BPF infrastructure does not know what bytes will be needed nor when they will be needed. So copying all bytes may be wasteful. Because of this the initial start/end data pointers are (0,0). Meaning no data can be read or written. This avoids reading data that may be modified by the user. A new helper is added later in this series if reading and writing the data is needed. The helper call will do a copy by default so that the page is exclusively owned by the BPF call. The verdict from the BPF_PROG_TYPE_SK_MSG applies to the entire msg in the sendmsg() case and the entire page/offset in the sendpage case. This avoids ambiguity on how to handle mixed return codes in the sendmsg case. Again a helper is added later in the series if a verdict needs to apply to multiple system calls and/or only a subpart of the currently being processed message. The helper msg_redirect_map() can be used to select the socket to send the data on. This is used similar to existing redirect use cases. This allows policy to redirect msgs. Pseudo code simple example: The basic logic to attach a program to a socket is as follows, // load the programs bpf_prog_load(SOCKMAP_TCP_MSG_PROG, BPF_PROG_TYPE_SK_MSG, &obj, &msg_prog); // lookup the sockmap bpf_map_msg = bpf_object__find_map_by_name(obj, "my_sock_map"); // get fd for sockmap map_fd_msg = bpf_map__fd(bpf_map_msg); // attach program to sockmap bpf_prog_attach(msg_prog, map_fd_msg, BPF_SK_MSG_VERDICT, 0); Adding sockets to the map is done in the normal way, // Add a socket 'fd' to sockmap at location 'i' bpf_map_update_elem(map_fd_msg, &i, fd, BPF_ANY); After the above any socket attached to "my_sock_map", in this case 'fd', will run the BPF msg verdict program (msg_prog) on every sendmsg and sendpage system call. For a complete example see BPF selftests or sockmap samples. Implementation notes: It seemed the simplest, to me at least, to use a refcnt to ensure psock is not lost across the sendmsg copy into the sg, the bpf program running on the data in sg_data, and the final pass to the TCP stack. Some performance testing may show a better method to do this and avoid the refcnt cost, but for now use the simpler method. Another item that will come after basic support is in place is supporting MSG_MORE flag. At the moment we call sendpages even if the MSG_MORE flag is set. An enhancement would be to collect the pages into a larger scatterlist and pass down the stack. Notice that bpf_tcp_sendmsg() could support this with some additional state saved across sendmsg calls. I built the code to support this without having to do refactoring work. Other features TBD include ZEROCOPY and the TCP_RECV_QUEUE/TCP_NO_QUEUE support. This will follow initial series shortly. Future work could improve size limits on the scatterlist rings used here. Currently, we use MAX_SKB_FRAGS simply because this was being used already in the TLS case. Future work could extend the kernel sk APIs to tune this depending on workload. This is a trade-off between memory usage and throughput performance. Signed-off-by: John Fastabend Acked-by: David S. Miller Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/linux/filter.h | 17 ++ include/uapi/linux/bpf.h | 22 +- kernel/bpf/sockmap.c | 712 +++++++++++++++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 14 +- kernel/bpf/verifier.c | 5 +- net/core/filter.c | 106 +++++++ 8 files changed, 857 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 66df387106de..819229c80eca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -21,6 +21,7 @@ struct bpf_verifier_env; struct perf_event; struct bpf_prog; struct bpf_map; +struct sock; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 19b8349a3809..5e2e8a49fb21 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -13,6 +13,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) #endif #ifdef CONFIG_BPF_EVENTS BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) diff --git a/include/linux/filter.h b/include/linux/filter.h index fdb691b520c0..109d05ccea9a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -507,6 +507,22 @@ struct xdp_buff { struct xdp_rxq_info *rxq; }; +struct sk_msg_buff { + void *data; + void *data_end; + __u32 apply_bytes; + __u32 cork_bytes; + int sg_copybreak; + int sg_start; + int sg_curr; + int sg_end; + struct scatterlist sg_data[MAX_SKB_FRAGS]; + bool sg_copy[MAX_SKB_FRAGS]; + __u32 key; + __u32 flags; + struct bpf_map *map; +}; + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) @@ -771,6 +787,7 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); +struct sock *do_msg_redirect_map(struct sk_msg_buff *md); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1e15d1724d89..ef529539abde 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SOCK_OPS, BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_PROG_TYPE_SK_MSG, }; enum bpf_attach_type { @@ -143,6 +144,7 @@ enum bpf_attach_type { BPF_SK_SKB_STREAM_PARSER, BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_DEVICE, + BPF_SK_MSG_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -718,6 +720,15 @@ union bpf_attr { * int bpf_override_return(pt_regs, rc) * @pt_regs: pointer to struct pt_regs * @rc: the return value to set + * + * int bpf_msg_redirect_map(map, key, flags) + * Redirect msg to a sock in map using key as a lookup key for the + * sock in map. + * @map: pointer to sockmap + * @key: key to lookup sock in map + * @flags: reserved for future use + * Return: SK_PASS + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -779,7 +790,8 @@ union bpf_attr { FN(perf_prog_read_value), \ FN(getsockopt), \ FN(override_return), \ - FN(sock_ops_cb_flags_set), + FN(sock_ops_cb_flags_set), \ + FN(msg_redirect_map), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -942,6 +954,14 @@ enum sk_action { SK_PASS, }; +/* user accessible metadata for SK_MSG packet hook, new fields must + * be added to the end of this structure + */ +struct sk_msg_md { + void *data; + void *data_end; +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 051b224270e3..69c5bccabd22 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -47,6 +48,7 @@ struct bpf_stab { struct bpf_map map; struct sock **sock_map; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; }; @@ -73,7 +75,16 @@ struct smap_psock { int save_off; struct sk_buff *save_skb; + /* datapath variables for tx_msg ULP */ + struct sock *sk_redir; + int apply_bytes; + int cork_bytes; + int sg_size; + int eval; + struct sk_msg_buff *cork; + struct strparser strp; + struct bpf_prog *bpf_tx_msg; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; struct list_head maps; @@ -91,6 +102,11 @@ struct smap_psock { void (*save_write_space)(struct sock *sk); }; +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags); + static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { return rcu_dereference_sk_user_data(sk); @@ -115,27 +131,41 @@ static int bpf_tcp_init(struct sock *sk) psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; + + if (psock->bpf_tx_msg) { + tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; + tcp_bpf_proto.sendpage = bpf_tcp_sendpage; + } + sk->sk_prot = &tcp_bpf_proto; rcu_read_unlock(); return 0; } +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md); + static void bpf_tcp_release(struct sock *sk) { struct smap_psock *psock; rcu_read_lock(); psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; - if (likely(psock)) { - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + psock->cork = NULL; } + + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; +out: rcu_read_unlock(); } -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); - static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); @@ -174,6 +204,7 @@ enum __sk_action { __SK_DROP = 0, __SK_PASS, __SK_REDIRECT, + __SK_NONE, }; static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { @@ -185,10 +216,621 @@ static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { .release = bpf_tcp_release, }; +static int memcopy_from_iter(struct sock *sk, + struct sk_msg_buff *md, + struct iov_iter *from, int bytes) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_curr, rc = -ENOSPC; + + do { + int copy; + char *to; + + if (md->sg_copybreak >= sg[i].length) { + md->sg_copybreak = 0; + + if (++i == MAX_SKB_FRAGS) + i = 0; + + if (i == md->sg_end) + break; + } + + copy = sg[i].length - md->sg_copybreak; + to = sg_virt(&sg[i]) + md->sg_copybreak; + md->sg_copybreak += copy; + + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) + rc = copy_from_iter_nocache(to, copy, from); + else + rc = copy_from_iter(to, copy, from); + + if (rc != copy) { + rc = -EFAULT; + goto out; + } + + bytes -= copy; + if (!bytes) + break; + + md->sg_copybreak = 0; + if (++i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +out: + md->sg_curr = i; + return rc; +} + +static int bpf_tcp_push(struct sock *sk, int apply_bytes, + struct sk_msg_buff *md, + int flags, bool uncharge) +{ + bool apply = apply_bytes; + struct scatterlist *sg; + int offset, ret = 0; + struct page *p; + size_t size; + + while (1) { + sg = md->sg_data + md->sg_start; + size = (apply && apply_bytes < sg->length) ? + apply_bytes : sg->length; + offset = sg->offset; + + tcp_rate_check_app_limited(sk); + p = sg_page(sg); +retry: + ret = do_tcp_sendpages(sk, p, offset, size, flags); + if (ret != size) { + if (ret > 0) { + if (apply) + apply_bytes -= ret; + size -= ret; + offset += ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + goto retry; + } + + sg->length = size; + sg->offset = offset; + return ret; + } + + if (apply) + apply_bytes -= ret; + sg->offset += ret; + sg->length -= ret; + if (uncharge) + sk_mem_uncharge(sk, ret); + + if (!sg->length) { + put_page(p); + md->sg_start++; + if (md->sg_start == MAX_SKB_FRAGS) + md->sg_start = 0; + memset(sg, 0, sizeof(*sg)); + + if (md->sg_start == md->sg_end) + break; + } + + if (apply && !apply_bytes) + break; + } + return 0; +} + +static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data + md->sg_start; + + if (md->sg_copy[md->sg_start]) { + md->data = md->data_end = 0; + } else { + md->data = sg_virt(sg); + md->data_end = md->data + sg->length; + } +} + +static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start; + + do { + int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length; + + sk_mem_uncharge(sk, uncharge); + bytes -= uncharge; + if (!bytes) + break; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != md->sg_end); +} + +static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = md->sg_start, free; + + while (bytes && sg[i].length) { + free = sg[i].length; + if (bytes < free) { + sg[i].length -= bytes; + sg[i].offset += bytes; + sk_mem_uncharge(sk, bytes); + break; + } + + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + bytes -= sg[i].length; + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } +} + +static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md) +{ + struct scatterlist *sg = md->sg_data; + int i = start, free = 0; + + while (sg[i].length) { + free += sg[i].length; + sk_mem_uncharge(sk, sg[i].length); + put_page(sg_page(&sg[i])); + sg[i].length = 0; + sg[i].page_link = 0; + sg[i].offset = 0; + i++; + + if (i == MAX_SKB_FRAGS) + i = 0; + } + + return free; +} + +static int free_start_sg(struct sock *sk, struct sk_msg_buff *md) +{ + int free = free_sg(sk, md->sg_start, md); + + md->sg_start = md->sg_end; + return free; +} + +static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md) +{ + return free_sg(sk, md->sg_curr, md); +} + +static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md) +{ + return ((_rc == SK_PASS) ? + (md->map ? __SK_REDIRECT : __SK_PASS) : + __SK_DROP); +} + +static unsigned int smap_do_tx_msg(struct sock *sk, + struct smap_psock *psock, + struct sk_msg_buff *md) +{ + struct bpf_prog *prog; + unsigned int rc, _rc; + + preempt_disable(); + rcu_read_lock(); + + /* If the policy was removed mid-send then default to 'accept' */ + prog = READ_ONCE(psock->bpf_tx_msg); + if (unlikely(!prog)) { + _rc = SK_PASS; + goto verdict; + } + + bpf_compute_data_pointers_sg(md); + rc = (*prog->bpf_func)(md, prog->insnsi); + psock->apply_bytes = md->apply_bytes; + + /* Moving return codes from UAPI namespace into internal namespace */ + _rc = bpf_map_msg_verdict(rc, md); + + /* The psock has a refcount on the sock but not on the map and because + * we need to drop rcu read lock here its possible the map could be + * removed between here and when we need it to execute the sock + * redirect. So do the map lookup now for future use. + */ + if (_rc == __SK_REDIRECT) { + if (psock->sk_redir) + sock_put(psock->sk_redir); + psock->sk_redir = do_msg_redirect_map(md); + if (!psock->sk_redir) { + _rc = __SK_DROP; + goto verdict; + } + sock_hold(psock->sk_redir); + } +verdict: + rcu_read_unlock(); + preempt_enable(); + + return _rc; +} + +static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, + struct sk_msg_buff *md, + int flags) +{ + struct smap_psock *psock; + struct scatterlist *sg; + int i, err, free = 0; + + sg = md->sg_data; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out_rcu; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto out_rcu; + + rcu_read_unlock(); + lock_sock(sk); + err = bpf_tcp_push(sk, send, md, flags, false); + release_sock(sk); + smap_release_sock(psock, sk); + if (unlikely(err)) + goto out; + return 0; +out_rcu: + rcu_read_unlock(); +out: + i = md->sg_start; + while (sg[i].length) { + free += sg[i].length; + put_page(sg_page(&sg[i])); + sg[i].length = 0; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } + return free; +} + +static inline void bpf_md_init(struct smap_psock *psock) +{ + if (!psock->apply_bytes) { + psock->eval = __SK_NONE; + if (psock->sk_redir) { + sock_put(psock->sk_redir); + psock->sk_redir = NULL; + } + } +} + +static void apply_bytes_dec(struct smap_psock *psock, int i) +{ + if (psock->apply_bytes) { + if (psock->apply_bytes < i) + psock->apply_bytes = 0; + else + psock->apply_bytes -= i; + } +} + +static int bpf_exec_tx_verdict(struct smap_psock *psock, + struct sk_msg_buff *m, + struct sock *sk, + int *copied, int flags) +{ + bool cork = false, enospc = (m->sg_start == m->sg_end); + struct sock *redir; + int err = 0; + int send; + +more_data: + if (psock->eval == __SK_NONE) + psock->eval = smap_do_tx_msg(sk, psock, m); + + if (m->cork_bytes && + m->cork_bytes > psock->sg_size && !enospc) { + psock->cork_bytes = m->cork_bytes - psock->sg_size; + if (!psock->cork) { + psock->cork = kcalloc(1, + sizeof(struct sk_msg_buff), + GFP_ATOMIC | __GFP_NOWARN); + + if (!psock->cork) { + err = -ENOMEM; + goto out_err; + } + } + memcpy(psock->cork, m, sizeof(*m)); + goto out_err; + } + + send = psock->sg_size; + if (psock->apply_bytes && psock->apply_bytes < send) + send = psock->apply_bytes; + + switch (psock->eval) { + case __SK_PASS: + err = bpf_tcp_push(sk, send, m, flags, true); + if (unlikely(err)) { + *copied -= free_start_sg(sk, m); + break; + } + + apply_bytes_dec(psock, send); + psock->sg_size -= send; + break; + case __SK_REDIRECT: + redir = psock->sk_redir; + apply_bytes_dec(psock, send); + + if (psock->cork) { + cork = true; + psock->cork = NULL; + } + + return_mem_sg(sk, send, m); + release_sock(sk); + + err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags); + lock_sock(sk); + + if (cork) { + free_start_sg(sk, m); + kfree(m); + m = NULL; + } + if (unlikely(err)) + *copied -= err; + else + psock->sg_size -= send; + break; + case __SK_DROP: + default: + free_bytes_sg(sk, send, m); + apply_bytes_dec(psock, send); + *copied -= send; + psock->sg_size -= send; + err = -EACCES; + break; + } + + if (likely(!err)) { + bpf_md_init(psock); + if (m && + m->sg_data[m->sg_start].page_link && + m->sg_data[m->sg_start].length) + goto more_data; + } + +out_err: + return err; +} + +static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; + struct sk_msg_buff md = {0}; + unsigned int sg_copy = 0; + struct smap_psock *psock; + int copied = 0, err = 0; + struct scatterlist *sg; + long timeo; + + /* Its possible a sock event or user removed the psock _but_ the ops + * have not been reprogrammed yet so we get here. In this case fallback + * to tcp_sendmsg. Note this only works because we _only_ ever allow + * a single ULP there is no hierarchy here. + */ + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + /* Increment the psock refcnt to ensure its not released while sending a + * message. Required because sk lookup and bpf programs are used in + * separate rcu critical sections. Its OK if we lose the map entry + * but we can't lose the sock reference. + */ + if (!refcount_inc_not_zero(&psock->refcnt)) { + rcu_read_unlock(); + return tcp_sendmsg(sk, msg, size); + } + + sg = md.sg_data; + sg_init_table(sg, MAX_SKB_FRAGS); + rcu_read_unlock(); + + lock_sock(sk); + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + while (msg_data_left(msg)) { + struct sk_msg_buff *m; + bool enospc = false; + int copy; + + if (sk->sk_err) { + err = sk->sk_err; + goto out_err; + } + + copy = msg_data_left(msg); + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + m = psock->cork_bytes ? psock->cork : &md; + m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end; + err = sk_alloc_sg(sk, copy, m->sg_data, + m->sg_start, &m->sg_end, &sg_copy, + m->sg_end - 1); + if (err) { + if (err != -ENOSPC) + goto wait_for_memory; + enospc = true; + copy = sg_copy; + } + + err = memcopy_from_iter(sk, m, &msg->msg_iter, copy); + if (err < 0) { + free_curr_sg(sk, m); + goto out_err; + } + + psock->sg_size += copy; + copied += copy; + sg_copy = 0; + + /* When bytes are being corked skip running BPF program and + * applying verdict unless there is no more buffer space. In + * the ENOSPC case simply run BPF prorgram with currently + * accumulated data. We don't have much choice at this point + * we could try extending the page frags or chaining complex + * frags but even in these cases _eventually_ we will hit an + * OOM scenario. More complex recovery schemes may be + * implemented in the future, but BPF programs must handle + * the case where apply_cork requests are not honored. The + * canonical method to verify this is to check data length. + */ + if (psock->cork_bytes) { + if (copy > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= copy; + + if (psock->cork_bytes && !enospc) + goto out_cork; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); + if (unlikely(err < 0)) + goto out_err; + continue; +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_err; + } +out_err: + if (err < 0) + err = sk_stream_error(sk, msg->msg_flags, err); +out_cork: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? copied : err; +} + +static int bpf_tcp_sendpage(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct sk_msg_buff md = {0}, *m = NULL; + int err = 0, copied = 0; + struct smap_psock *psock; + struct scatterlist *sg; + bool enospc = false; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto accept; + + if (!refcount_inc_not_zero(&psock->refcnt)) + goto accept; + rcu_read_unlock(); + + lock_sock(sk); + + if (psock->cork_bytes) + m = psock->cork; + else + m = &md; + + /* Catch case where ring is full and sendpage is stalled. */ + if (unlikely(m->sg_end == m->sg_start && + m->sg_data[m->sg_end].length)) + goto out_err; + + psock->sg_size += size; + sg = &m->sg_data[m->sg_end]; + sg_set_page(sg, page, size, offset); + get_page(page); + m->sg_copy[m->sg_end] = true; + sk_mem_charge(sk, size); + m->sg_end++; + copied = size; + + if (m->sg_end == MAX_SKB_FRAGS) + m->sg_end = 0; + + if (m->sg_end == m->sg_start) + enospc = true; + + if (psock->cork_bytes) { + if (size > psock->cork_bytes) + psock->cork_bytes = 0; + else + psock->cork_bytes -= size; + + if (psock->cork_bytes && !enospc) + goto out_err; + + /* All cork bytes accounted for re-run filter */ + psock->eval = __SK_NONE; + psock->cork_bytes = 0; + } + + err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags); +out_err: + release_sock(sk); + smap_release_sock(psock, sk); + return copied ? copied : err; +accept: + rcu_read_unlock(); + return tcp_sendpage(sk, page, offset, size, flags); +} + +static void bpf_tcp_msg_add(struct smap_psock *psock, + struct sock *sk, + struct bpf_prog *tx_msg) +{ + struct bpf_prog *orig_tx_msg; + + orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg); + if (orig_tx_msg) + bpf_prog_put(orig_tx_msg); +} + static int bpf_tcp_ulp_register(void) { tcp_bpf_proto = tcp_prot; tcp_bpf_proto.close = bpf_tcp_close; + /* Once BPF TX ULP is registered it is never unregistered. It + * will be in the ULP list for the lifetime of the system. Doing + * duplicate registers is not a problem. + */ return tcp_register_ulp(&bpf_tcp_ulp_ops); } @@ -412,7 +1054,6 @@ static int smap_parse_func_strparser(struct strparser *strp, return rc; } - static int smap_read_sock_done(struct strparser *strp, int err) { return err; @@ -482,12 +1123,22 @@ static void smap_gc_work(struct work_struct *w) bpf_prog_put(psock->bpf_parse); if (psock->bpf_verdict) bpf_prog_put(psock->bpf_verdict); + if (psock->bpf_tx_msg) + bpf_prog_put(psock->bpf_tx_msg); + + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + } list_for_each_entry_safe(e, tmp, &psock->maps, list) { list_del(&e->list); kfree(e); } + if (psock->sk_redir) + sock_put(psock->sk_redir); + sock_put(psock->sock); kfree(psock); } @@ -503,6 +1154,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, if (!psock) return ERR_PTR(-ENOMEM); + psock->eval = __SK_NONE; psock->sock = sock; skb_queue_head_init(&psock->rxqueue); INIT_WORK(&psock->tx_work, smap_tx_work); @@ -711,10 +1363,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct smap_psock_map_entry *e = NULL; - struct bpf_prog *verdict, *parse; + struct bpf_prog *verdict, *parse, *tx_msg; struct sock *osock, *sock; struct smap_psock *psock; u32 i = *(u32 *)key; + bool new = false; int err; if (unlikely(flags > BPF_EXIST)) @@ -737,6 +1390,7 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, */ verdict = READ_ONCE(stab->bpf_verdict); parse = READ_ONCE(stab->bpf_parse); + tx_msg = READ_ONCE(stab->bpf_tx_msg); if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation @@ -755,6 +1409,17 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } } + if (tx_msg) { + tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); + if (IS_ERR(tx_msg)) { + if (verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); + return PTR_ERR(tx_msg); + } + } + write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); @@ -769,7 +1434,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, err = -EBUSY; goto out_progs; } - refcount_inc(&psock->refcnt); + if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) { + err = -EBUSY; + goto out_progs; + } + if (!refcount_inc_not_zero(&psock->refcnt)) { + err = -EAGAIN; + goto out_progs; + } } else { psock = smap_init_psock(sock, stab); if (IS_ERR(psock)) { @@ -777,11 +1449,8 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto out_progs; } - err = tcp_set_ulp_id(sock, TCP_ULP_BPF); - if (err) - goto out_progs; - set_bit(SMAP_TX_RUNNING, &psock->state); + new = true; } e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); @@ -794,6 +1463,14 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, /* 3. At this point we have a reference to a valid psock that is * running. Attach any BPF programs needed. */ + if (tx_msg) + bpf_tcp_msg_add(psock, sock, tx_msg); + if (new) { + err = tcp_set_ulp_id(sock, TCP_ULP_BPF); + if (err) + goto out_free; + } + if (parse && verdict && !psock->strp_enabled) { err = smap_init_sock(psock, sock); if (err) @@ -815,8 +1492,6 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct smap_psock *opsock = smap_psock_sk(osock); write_lock_bh(&osock->sk_callback_lock); - if (osock != sock && parse) - smap_stop_sock(opsock, osock); smap_list_remove(opsock, &stab->sock_map[i]); smap_release_sock(opsock, osock); write_unlock_bh(&osock->sk_callback_lock); @@ -829,6 +1504,8 @@ out_progs: bpf_prog_put(verdict); if (parse) bpf_prog_put(parse); + if (tx_msg) + bpf_prog_put(tx_msg); write_unlock_bh(&sock->sk_callback_lock); kfree(e); return err; @@ -843,6 +1520,9 @@ int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) return -EINVAL; switch (type) { + case BPF_SK_MSG_VERDICT: + orig = xchg(&stab->bpf_tx_msg, prog); + break; case BPF_SK_SKB_STREAM_PARSER: orig = xchg(&stab->bpf_parse, prog); break; @@ -904,6 +1584,10 @@ static void sock_map_release(struct bpf_map *map, struct file *map_file) orig = xchg(&stab->bpf_verdict, NULL); if (orig) bpf_prog_put(orig); + + orig = xchg(&stab->bpf_tx_msg, NULL); + if (orig) + bpf_prog_put(orig); } const struct bpf_map_ops sock_map_ops = { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e24aa3241387..3aeb4ea2a93a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1315,7 +1315,8 @@ static int bpf_obj_get(const union bpf_attr *attr) #define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) +static int sockmap_get_from_fd(const union bpf_attr *attr, + int type, bool attach) { struct bpf_prog *prog = NULL; int ufd = attr->target_fd; @@ -1329,8 +1330,7 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach) return PTR_ERR(map); if (attach) { - prog = bpf_prog_get_type(attr->attach_bpf_fd, - BPF_PROG_TYPE_SK_SKB); + prog = bpf_prog_get_type(attr->attach_bpf_fd, type); if (IS_ERR(prog)) { fdput(f); return PTR_ERR(prog); @@ -1382,9 +1382,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, true); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, true); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, true); default: return -EINVAL; } @@ -1437,9 +1439,11 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_DEVICE: ptype = BPF_PROG_TYPE_CGROUP_DEVICE; break; + case BPF_SK_MSG_VERDICT: + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, false); case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: - return sockmap_get_from_fd(attr, false); + return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, false); default: return -EINVAL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb79a34359c0..e9f7c20691c1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1248,6 +1248,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: if (meta) return meta->pkt_access; @@ -2071,7 +2072,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_SOCKMAP: if (func_id != BPF_FUNC_sk_redirect_map && func_id != BPF_FUNC_sock_map_update && - func_id != BPF_FUNC_map_delete_elem) + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_msg_redirect_map) goto error; break; default: @@ -2109,6 +2111,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_sk_redirect_map: + case BPF_FUNC_msg_redirect_map: if (map->map_type != BPF_MAP_TYPE_SOCKMAP) goto error; break; diff --git a/net/core/filter.c b/net/core/filter.c index 33edfa8372fd..2b6c47597eab 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1890,6 +1890,44 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, + struct bpf_map *, map, u32, key, u64, flags) +{ + /* If user passes invalid input drop the packet. */ + if (unlikely(flags)) + return SK_DROP; + + msg->key = key; + msg->flags = flags; + msg->map = map; + + return SK_PASS; +} + +struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +{ + struct sock *sk = NULL; + + if (msg->map) { + sk = __sock_map_lookup_elem(msg->map, msg->key); + + msg->key = 0; + msg->map = NULL; + } + + return sk; +} + +static const struct bpf_func_proto bpf_msg_redirect_map_proto = { + .func = bpf_msg_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3591,6 +3629,16 @@ static const struct bpf_func_proto * } } +static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_msg_redirect_map: + return &bpf_msg_redirect_map_proto; + default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -3980,6 +4028,32 @@ static bool sk_skb_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, info); } +static bool sk_msg_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) + return false; + + switch (off) { + case offsetof(struct sk_msg_md, data): + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct sk_msg_md, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + if (off < 0 || off >= sizeof(struct sk_msg_md)) + return false; + if (off % size != 0) + return false; + if (size != sizeof(__u64)) + return false; + + return true; +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -4778,6 +4852,29 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct sk_msg_md, data): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, data)); + break; + case offsetof(struct sk_msg_md, data_end): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg_buff, data_end)); + break; + } + + return insn - insn_buf; +} + const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, @@ -4868,6 +4965,15 @@ const struct bpf_verifier_ops sk_skb_verifier_ops = { const struct bpf_prog_ops sk_skb_prog_ops = { }; +const struct bpf_verifier_ops sk_msg_verifier_ops = { + .get_func_proto = sk_msg_func_proto, + .is_valid_access = sk_msg_is_valid_access, + .convert_ctx_access = sk_msg_convert_ctx_access, +}; + +const struct bpf_prog_ops sk_msg_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; -- cgit v1.2.3-70-g09d2 From 2a100317c9ebc204a166f16294884fbf9da074ce Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:15 -0700 Subject: bpf: sockmap, add bpf_msg_apply_bytes() helper A single sendmsg or sendfile system call can contain multiple logical messages that a BPF program may want to read and apply a verdict. But, without an apply_bytes helper any verdict on the data applies to all bytes in the sendmsg/sendfile. Alternatively, a BPF program may only care to read the first N bytes of a msg. If the payload is large say MB or even GB setting up and calling the BPF program repeatedly for all bytes, even though the verdict is already known, creates unnecessary overhead. To allow BPF programs to control how many bytes a given verdict applies to we implement a bpf_msg_apply_bytes() helper. When called from within a BPF program this sets a counter, internal to the BPF infrastructure, that applies the last verdict to the next N bytes. If the N is smaller than the current data being processed from a sendmsg/sendfile call, the first N bytes will be sent and the BPF program will be re-run with start_data pointing to the N+1 byte. If N is larger than the current data being processed the BPF verdict will be applied to multiple sendmsg/sendfile calls until N bytes are consumed. Note1 if a socket closes with apply_bytes counter non-zero this is not a problem because data is not being buffered for N bytes and is sent as its received. Signed-off-by: John Fastabend Acked-by: David S. Miller Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 ++- net/core/filter.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ef529539abde..a557a2a5d72d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -791,7 +791,8 @@ union bpf_attr { FN(getsockopt), \ FN(override_return), \ FN(sock_ops_cb_flags_set), \ - FN(msg_redirect_map), + FN(msg_redirect_map), \ + FN(msg_apply_bytes), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 2b6c47597eab..17d6775f5431 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1928,6 +1928,20 @@ static const struct bpf_func_proto bpf_msg_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes) +{ + msg->apply_bytes = bytes; + return 0; +} + +static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { + .func = bpf_msg_apply_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3634,6 +3648,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; + case BPF_FUNC_msg_apply_bytes: + return &bpf_msg_apply_bytes_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3-70-g09d2 From 91843d540a139eb8070bcff8aa10089164436deb Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:20 -0700 Subject: bpf: sockmap, add msg_cork_bytes() helper In the case where we need a specific number of bytes before a verdict can be assigned, even if the data spans multiple sendmsg or sendfile calls. The BPF program may use msg_cork_bytes(). The extreme case is a user can call sendmsg repeatedly with 1-byte msg segments. Obviously, this is bad for performance but is still valid. If the BPF program needs N bytes to validate a header it can use msg_cork_bytes to specify N bytes and the BPF program will not be called again until N bytes have been accumulated. The infrastructure will attempt to coalesce data if possible so in many cases (most my use cases at least) the data will be in a single scatterlist element with data pointers pointing to start/end of the element. However, this is dependent on available memory so is not guaranteed. So BPF programs must validate data pointer ranges, but this is the case anyways to convince the verifier the accesses are valid. Signed-off-by: John Fastabend Acked-by: David S. Miller Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 ++- net/core/filter.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a557a2a5d72d..1765cfb16c99 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -792,7 +792,8 @@ union bpf_attr { FN(override_return), \ FN(sock_ops_cb_flags_set), \ FN(msg_redirect_map), \ - FN(msg_apply_bytes), + FN(msg_apply_bytes), \ + FN(msg_cork_bytes), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 17d6775f5431..0c9daf6ee555 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1942,6 +1942,20 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes) +{ + msg->cork_bytes = bytes; + return 0; +} + +static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { + .func = bpf_msg_cork_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3650,6 +3664,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) return &bpf_msg_redirect_map_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; + case BPF_FUNC_msg_cork_bytes: + return &bpf_msg_cork_bytes_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3-70-g09d2 From 015632bb30daaaee64e1bcac07570860e0bf3092 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:25 -0700 Subject: bpf: sk_msg program helper bpf_sk_msg_pull_data Currently, if a bpf sk msg program is run the program can only parse data that the (start,end) pointers already consumed. For sendmsg hooks this is likely the first scatterlist element. For sendpage this will be the range (0,0) because the data is shared with userspace and by default we want to avoid allowing userspace to modify data while (or after) BPF verdict is being decided. To support pulling in additional bytes for parsing use a new helper bpf_sk_msg_pull(start, end, flags) which works similar to cls tc logic. This helper will attempt to point the data start pointer at 'start' bytes offest into msg and data end pointer at 'end' bytes offset into message. After basic sanity checks to ensure 'start' <= 'end' and 'end' <= msg_length there are a few cases we need to handle. First the sendmsg hook has already copied the data from userspace and has exclusive access to it. Therefor, it is not necessesary to copy the data. However, it may be required. After finding the scatterlist element with 'start' offset byte in it there are two cases. One the range (start,end) is entirely contained in the sg element and is already linear. All that is needed is to update the data pointers, no allocate/copy is needed. The other case is (start, end) crosses sg element boundaries. In this case we allocate a block of size 'end - start' and copy the data to linearize it. Next sendpage hook has not copied any data in initial state so that data pointers are (0,0). In this case we handle it similar to the above sendmsg case except the allocation/copy must always happen. Then when sending the data we have possibly three memory regions that need to be sent, (0, start - 1), (start, end), and (end + 1, msg_length). This is required to ensure any writes by the BPF program are correctly transmitted. Lastly this operation will invalidate any previous data checks so BPF programs will have to revalidate pointers after making this BPF call. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 +- net/core/filter.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1765cfb16c99..18b7c510c511 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -793,7 +793,8 @@ union bpf_attr { FN(sock_ops_cb_flags_set), \ FN(msg_redirect_map), \ FN(msg_apply_bytes), \ - FN(msg_cork_bytes), + FN(msg_cork_bytes), \ + FN(msg_pull_data), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 0c9daf6ee555..c86f03fd9ea5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1956,6 +1956,136 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_pull_data, + struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) +{ + unsigned int len = 0, offset = 0, copy = 0; + struct scatterlist *sg = msg->sg_data; + int first_sg, last_sg, i, shift; + unsigned char *p, *to, *from; + int bytes = end - start; + struct page *page; + + if (unlikely(flags || end <= start)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg_start; + do { + len = sg[i].length; + offset += len; + if (start < offset + len) + break; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != msg->sg_end); + + if (unlikely(start >= offset + len)) + return -EINVAL; + + if (!msg->sg_copy[i] && bytes <= len) + goto out; + + first_sg = i; + + /* At this point we need to linearize multiple scatterlist + * elements or a single shared page. Either way we need to + * copy into a linear buffer exclusively owned by BPF. Then + * place the buffer in the scatterlist and fixup the original + * entries by removing the entries now in the linear buffer + * and shifting the remaining entries. For now we do not try + * to copy partial entries to avoid complexity of running out + * of sg_entry slots. The downside is reading a single byte + * will copy the entire sg entry. + */ + do { + copy += sg[i].length; + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + if (bytes < copy) + break; + } while (i != msg->sg_end); + last_sg = i; + + if (unlikely(copy < end - start)) + return -EINVAL; + + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy)); + if (unlikely(!page)) + return -ENOMEM; + p = page_address(page); + offset = 0; + + i = first_sg; + do { + from = sg_virt(&sg[i]); + len = sg[i].length; + to = p + offset; + + memcpy(to, from, len); + offset += len; + sg[i].length = 0; + put_page(sg_page(&sg[i])); + + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (i != last_sg); + + sg[first_sg].length = copy; + sg_set_page(&sg[first_sg], page, copy, 0); + + /* To repair sg ring we need to shift entries. If we only + * had a single entry though we can just replace it and + * be done. Otherwise walk the ring and shift the entries. + */ + shift = last_sg - first_sg - 1; + if (!shift) + goto out; + + i = first_sg + 1; + do { + int move_from; + + if (i + shift >= MAX_SKB_FRAGS) + move_from = i + shift - MAX_SKB_FRAGS; + else + move_from = i + shift; + + if (move_from == msg->sg_end) + break; + + sg[i] = sg[move_from]; + sg[move_from].length = 0; + sg[move_from].page_link = 0; + sg[move_from].offset = 0; + + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + } while (1); + msg->sg_end -= shift; + if (msg->sg_end < 0) + msg->sg_end += MAX_SKB_FRAGS; +out: + msg->data = sg_virt(&sg[i]) + start - offset; + msg->data_end = msg->data + bytes; + + return 0; +} + +static const struct bpf_func_proto bpf_msg_pull_data_proto = { + .func = bpf_msg_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -2897,7 +3027,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta) + func == bpf_xdp_adjust_meta || + func == bpf_msg_pull_data) return true; return false; @@ -3666,6 +3797,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: return &bpf_msg_cork_bytes_proto; + case BPF_FUNC_msg_pull_data: + return &bpf_msg_pull_data_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3-70-g09d2 From 6aec208786c2a54cbf6135a0242b224e845bef98 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Sun, 4 Mar 2018 15:29:51 -0800 Subject: netfilter: Refactor nf_conncount Remove parameter 'family' in nf_conncount_count() and count_tree(). It is because the parameter is not useful after commit 625c556118f3 ("netfilter: connlimit: split xt_connlimit into front and backend"). Signed-off-by: Yi-Hung Wei Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 1 - net/netfilter/nf_conncount.c | 4 +--- net/netfilter/xt_connlimit.c | 4 ++-- 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index adf8db44cf86..e61184fbfb71 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -11,7 +11,6 @@ void nf_conncount_destroy(struct net *net, unsigned int family, unsigned int nf_conncount_count(struct net *net, struct nf_conncount_data *data, const u32 *key, - unsigned int family, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone); #endif diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 6d65389e308f..9305a08b4422 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -158,7 +158,6 @@ static void tree_nodes_free(struct rb_root *root, static unsigned int count_tree(struct net *net, struct rb_root *root, const u32 *key, u8 keylen, - u8 family, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { @@ -246,7 +245,6 @@ count_tree(struct net *net, struct rb_root *root, unsigned int nf_conncount_count(struct net *net, struct nf_conncount_data *data, const u32 *key, - unsigned int family, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone) { @@ -259,7 +257,7 @@ unsigned int nf_conncount_count(struct net *net, spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); - count = count_tree(net, root, key, data->keylen, family, tuple, zone); + count = count_tree(net, root, key, data->keylen, tuple, zone); spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index b1b17b9353e1..6275106ccf50 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -67,8 +67,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) key[1] = zone->id; } - connections = nf_conncount_count(net, info->data, key, - xt_family(par), tuple_ptr, zone); + connections = nf_conncount_count(net, info->data, key, tuple_ptr, + zone); if (connections == 0) /* kmalloc failed, drop it entirely */ goto hotdrop; -- cgit v1.2.3-70-g09d2 From 35d8deb80c30fdb2dee3e2dac71eab00d8a6fed5 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Sun, 4 Mar 2018 15:29:52 -0800 Subject: netfilter: conncount: Support count only use case Currently, nf_conncount_count() counts the number of connections that matches key and inserts a conntrack 'tuple' with the same key into the accounting data structure. This patch supports another use case that only counts the number of connections where 'tuple' is not provided. Therefore, proper changes are made on nf_conncount_count() to support the case where 'tuple' is NULL. This could be useful for querying statistics or debugging purpose. Signed-off-by: Yi-Hung Wei Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 9305a08b4422..153e690e2893 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -104,7 +104,7 @@ static unsigned int check_hlist(struct net *net, struct nf_conn *found_ct; unsigned int length = 0; - *addit = true; + *addit = tuple ? true : false; /* check the saved connections */ hlist_for_each_entry_safe(conn, n, head, node) { @@ -117,7 +117,7 @@ static unsigned int check_hlist(struct net *net, found_ct = nf_ct_tuplehash_to_ctrack(found); - if (nf_ct_tuple_equal(&conn->tuple, tuple)) { + if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple)) { /* * Just to be sure we have it only once in the list. * We should not see tuples twice unless someone hooks @@ -220,6 +220,9 @@ count_tree(struct net *net, struct rb_root *root, goto restart; } + if (!tuple) + return 0; + /* no match, need to insert new node */ rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); if (rbconn == NULL) @@ -242,6 +245,9 @@ count_tree(struct net *net, struct rb_root *root, return 1; } +/* Count and return number of conntrack entries in 'net' with particular 'key'. + * If 'tuple' is not null, insert it into the accounting data structure. + */ unsigned int nf_conncount_count(struct net *net, struct nf_conncount_data *data, const u32 *key, -- cgit v1.2.3-70-g09d2 From d719e3f21cf91d3f82bd827d46199ba41af2f73a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Mar 2018 11:57:20 +0100 Subject: netfilter: nft_ct: add NFT_CT_{SRC,DST}_{IP,IP6} All existing keys, except the NFT_CT_SRC and NFT_CT_DST are assumed to have strict datatypes. This is causing problems with sets and concatenations given the specific length of these keys is not known. Signed-off-by: Pablo Neira Ayuso Acked-by: Florian Westphal --- include/uapi/linux/netfilter/nf_tables.h | 12 ++++++++-- net/netfilter/nft_ct.c | 38 ++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 66dceee0ae30..6a3d653d5b27 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -909,8 +909,8 @@ enum nft_rt_attributes { * @NFT_CT_EXPIRATION: relative conntrack expiration time in ms * @NFT_CT_HELPER: connection tracking helper assigned to conntrack * @NFT_CT_L3PROTOCOL: conntrack layer 3 protocol - * @NFT_CT_SRC: conntrack layer 3 protocol source (IPv4/IPv6 address) - * @NFT_CT_DST: conntrack layer 3 protocol destination (IPv4/IPv6 address) + * @NFT_CT_SRC: conntrack layer 3 protocol source (IPv4/IPv6 address, deprecated) + * @NFT_CT_DST: conntrack layer 3 protocol destination (IPv4/IPv6 address, deprecated) * @NFT_CT_PROTOCOL: conntrack layer 4 protocol * @NFT_CT_PROTO_SRC: conntrack layer 4 protocol source * @NFT_CT_PROTO_DST: conntrack layer 4 protocol destination @@ -920,6 +920,10 @@ enum nft_rt_attributes { * @NFT_CT_AVGPKT: conntrack average bytes per packet * @NFT_CT_ZONE: conntrack zone * @NFT_CT_EVENTMASK: ctnetlink events to be generated for this conntrack + * @NFT_CT_SRC_IP: conntrack layer 3 protocol source (IPv4 address) + * @NFT_CT_DST_IP: conntrack layer 3 protocol destination (IPv4 address) + * @NFT_CT_SRC_IP6: conntrack layer 3 protocol source (IPv6 address) + * @NFT_CT_DST_IP6: conntrack layer 3 protocol destination (IPv6 address) */ enum nft_ct_keys { NFT_CT_STATE, @@ -941,6 +945,10 @@ enum nft_ct_keys { NFT_CT_AVGPKT, NFT_CT_ZONE, NFT_CT_EVENTMASK, + NFT_CT_SRC_IP, + NFT_CT_DST_IP, + NFT_CT_SRC_IP6, + NFT_CT_DST_IP6, }; /** diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 6ab274b14484..ea737fd789e8 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -196,6 +196,26 @@ static void nft_ct_get_eval(const struct nft_expr *expr, case NFT_CT_PROTO_DST: nft_reg_store16(dest, (__force u16)tuple->dst.u.all); return; + case NFT_CT_SRC_IP: + if (nf_ct_l3num(ct) != NFPROTO_IPV4) + goto err; + *dest = tuple->src.u3.ip; + return; + case NFT_CT_DST_IP: + if (nf_ct_l3num(ct) != NFPROTO_IPV4) + goto err; + *dest = tuple->dst.u3.ip; + return; + case NFT_CT_SRC_IP6: + if (nf_ct_l3num(ct) != NFPROTO_IPV6) + goto err; + memcpy(dest, tuple->src.u3.ip6, sizeof(struct in6_addr)); + return; + case NFT_CT_DST_IP6: + if (nf_ct_l3num(ct) != NFPROTO_IPV6) + goto err; + memcpy(dest, tuple->dst.u3.ip6, sizeof(struct in6_addr)); + return; default: break; } @@ -419,6 +439,20 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, return -EAFNOSUPPORT; } break; + case NFT_CT_SRC_IP: + case NFT_CT_DST_IP: + if (tb[NFTA_CT_DIRECTION] == NULL) + return -EINVAL; + + len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip); + break; + case NFT_CT_SRC_IP6: + case NFT_CT_DST_IP6: + if (tb[NFTA_CT_DIRECTION] == NULL) + return -EINVAL; + + len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip6); + break; case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (tb[NFTA_CT_DIRECTION] == NULL) @@ -588,6 +622,10 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) switch (priv->key) { case NFT_CT_SRC: case NFT_CT_DST: + case NFT_CT_SRC_IP: + case NFT_CT_DST_IP: + case NFT_CT_SRC_IP6: + case NFT_CT_DST_IP6: case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) -- cgit v1.2.3-70-g09d2 From 8039ab43eeac029a9c47c0411918ea82c9ce87cd Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 12 Mar 2018 18:14:42 -0500 Subject: netfilter: cttimeout: remove VLA usage In preparation to enabling -Wvla, remove VLA and replace it with dynamic memory allocation. >From a security viewpoint, the use of Variable Length Arrays can be a vector for stack overflow attacks. Also, in general, as the code evolves it is easy to lose track of how big a VLA can get. Thus, we can end up having segfaults that are hard to debug. Also, fixed as part of the directive to remove all VLAs from the kernel: https://lkml.org/lkml/2018/3/7/621 While at it, remove likely() notation which is not necessary from the control plane code. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cttimeout.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 95b04702a655..9ee5fa551fa6 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -51,19 +51,27 @@ ctnl_timeout_parse_policy(void *timeouts, const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { + struct nlattr **tb; int ret = 0; - if (likely(l4proto->ctnl_timeout.nlattr_to_obj)) { - struct nlattr *tb[l4proto->ctnl_timeout.nlattr_max+1]; + if (!l4proto->ctnl_timeout.nlattr_to_obj) + return 0; - ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max, - attr, l4proto->ctnl_timeout.nla_policy, - NULL); - if (ret < 0) - return ret; + tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), + GFP_KERNEL); - ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); - } + if (!tb) + return -ENOMEM; + + ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max, attr, + l4proto->ctnl_timeout.nla_policy, NULL); + if (ret < 0) + goto err; + + ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); + +err: + kfree(tb); return ret; } -- cgit v1.2.3-70-g09d2 From 1446385904add0e89f990ee0518434365e50ce86 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 12 Mar 2018 19:21:38 -0500 Subject: netfilter: nfnetlink_cthelper: Remove VLA usage In preparation to enabling -Wvla, remove VLA and replace it with dynamic memory allocation. >From a security viewpoint, the use of Variable Length Arrays can be a vector for stack overflow attacks. Also, in general, as the code evolves it is easy to lose track of how big a VLA can get. Thus, we can end up having segfaults that are hard to debug. Also, fixed as part of the directive to remove all VLAs from the kernel: https://lkml.org/lkml/2018/3/7/621 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_cthelper.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index d33ce6d5ebce..4a4b293fb2e5 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -314,23 +314,30 @@ nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy *policy, static int nfnl_cthelper_update_policy_all(struct nlattr *tb[], struct nf_conntrack_helper *helper) { - struct nf_conntrack_expect_policy new_policy[helper->expect_class_max + 1]; + struct nf_conntrack_expect_policy *new_policy; struct nf_conntrack_expect_policy *policy; - int i, err; + int i, ret = 0; + + new_policy = kmalloc_array(helper->expect_class_max + 1, + sizeof(*new_policy), GFP_KERNEL); + if (!new_policy) + return -ENOMEM; /* Check first that all policy attributes are well-formed, so we don't * leave things in inconsistent state on errors. */ for (i = 0; i < helper->expect_class_max + 1; i++) { - if (!tb[NFCTH_POLICY_SET + i]) - return -EINVAL; + if (!tb[NFCTH_POLICY_SET + i]) { + ret = -EINVAL; + goto err; + } - err = nfnl_cthelper_update_policy_one(&helper->expect_policy[i], + ret = nfnl_cthelper_update_policy_one(&helper->expect_policy[i], &new_policy[i], tb[NFCTH_POLICY_SET + i]); - if (err < 0) - return err; + if (ret < 0) + goto err; } /* Now we can safely update them. */ for (i = 0; i < helper->expect_class_max + 1; i++) { @@ -340,7 +347,9 @@ static int nfnl_cthelper_update_policy_all(struct nlattr *tb[], policy->timeout = new_policy->timeout; } - return 0; +err: + kfree(new_policy); + return ret; } static int nfnl_cthelper_update_policy(struct nf_conntrack_helper *helper, -- cgit v1.2.3-70-g09d2 From 5b4c6e3860daaf089c28e0161dffef7b5ad8000f Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 12 Mar 2018 22:16:17 -0500 Subject: netfilter: nf_tables: remove VLA usage In preparation to enabling -Wvla, remove VLA and replace it with dynamic memory allocation. >From a security viewpoint, the use of Variable Length Arrays can be a vector for stack overflow attacks. Also, in general, as the code evolves it is easy to lose track of how big a VLA can get. Thus, we can end up having segfaults that are hard to debug. Also, fixed as part of the directive to remove all VLAs from the kernel: https://lkml.org/lkml/2018/3/7/621 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8cc7fc970f0c..92f5606b0dea 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4357,16 +4357,20 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, const struct nft_object_type *type, const struct nlattr *attr) { - struct nlattr *tb[type->maxattr + 1]; + struct nlattr **tb; const struct nft_object_ops *ops; struct nft_object *obj; - int err; + int err = -ENOMEM; + + tb = kmalloc_array(type->maxattr + 1, sizeof(*tb), GFP_KERNEL); + if (!tb) + goto err1; if (attr) { err = nla_parse_nested(tb, type->maxattr, attr, type->policy, NULL); if (err < 0) - goto err1; + goto err2; } else { memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1)); } @@ -4375,7 +4379,7 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, ops = type->select_ops(ctx, (const struct nlattr * const *)tb); if (IS_ERR(ops)) { err = PTR_ERR(ops); - goto err1; + goto err2; } } else { ops = type->ops; @@ -4383,18 +4387,21 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, err = -ENOMEM; obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL); - if (obj == NULL) - goto err1; + if (!obj) + goto err2; err = ops->init(ctx, (const struct nlattr * const *)tb, obj); if (err < 0) - goto err2; + goto err3; obj->ops = ops; + kfree(tb); return obj; -err2: +err3: kfree(obj); +err2: + kfree(tb); err1: return ERR_PTR(err); } -- cgit v1.2.3-70-g09d2 From d72133e6288030121e425b89584ab3dfb68871cc Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 14 Mar 2018 23:36:53 +0900 Subject: netfilter: ebtables: use ADD_COUNTER macro xtables uses ADD_COUNTER macro to increase packet and byte count. ebtables also can use this. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 217aa79f7b2a..9a26d2b7420f 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -223,9 +223,7 @@ unsigned int ebt_do_table(struct sk_buff *skb, return NF_DROP; } - /* increase counter */ - (*(counter_base + i)).pcnt++; - (*(counter_base + i)).bcnt += skb->len; + ADD_COUNTER(*(counter_base + i), 1, skb->len); /* these should only watch: not modify, nor tell us * what to do with the packet @@ -968,10 +966,9 @@ static void get_counters(const struct ebt_counter *oldcounters, if (cpu == 0) continue; counter_base = COUNTER_BASE(oldcounters, nentries, cpu); - for (i = 0; i < nentries; i++) { - counters[i].pcnt += counter_base[i].pcnt; - counters[i].bcnt += counter_base[i].bcnt; - } + for (i = 0; i < nentries; i++) + ADD_COUNTER(counters[i], counter_base[i].pcnt, + counter_base[i].bcnt); } } @@ -1324,10 +1321,8 @@ static int do_update_counters(struct net *net, const char *name, write_lock_bh(&t->lock); /* we add to the counters of the first cpu */ - for (i = 0; i < num_counters; i++) { - t->private->counters[i].pcnt += tmp[i].pcnt; - t->private->counters[i].bcnt += tmp[i].bcnt; - } + for (i = 0; i < num_counters; i++) + ADD_COUNTER(t->private->counters[i], tmp[i].pcnt, tmp[i].bcnt); write_unlock_bh(&t->lock); ret = 0; -- cgit v1.2.3-70-g09d2 From 472a73e00757b971d613d796374d2727b2e4954d Mon Sep 17 00:00:00 2001 From: Jack Ma Date: Mon, 19 Mar 2018 09:41:59 +1300 Subject: netfilter: xt_conntrack: Support bit-shifting for CONNMARK & MARK targets. This patch introduces a new feature that allows bitshifting (left and right) operations to co-operate with existing iptables options. Reviewed-by: Florian Westphal Signed-off-by: Jack Ma Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_connmark.h | 10 ++++ net/netfilter/xt_connmark.c | 77 +++++++++++++++++++++++------- 2 files changed, 70 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/xt_connmark.h b/include/uapi/linux/netfilter/xt_connmark.h index 408a9654f05c..1aa5c955ee1e 100644 --- a/include/uapi/linux/netfilter/xt_connmark.h +++ b/include/uapi/linux/netfilter/xt_connmark.h @@ -19,11 +19,21 @@ enum { XT_CONNMARK_RESTORE }; +enum { + D_SHIFT_LEFT = 0, + D_SHIFT_RIGHT, +}; + struct xt_connmark_tginfo1 { __u32 ctmark, ctmask, nfmask; __u8 mode; }; +struct xt_connmark_tginfo2 { + __u32 ctmark, ctmask, nfmask; + __u8 shift_dir, shift_bits, mode; +}; + struct xt_connmark_mtinfo1 { __u32 mark, mask; __u8 invert; diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 809639ce6f5a..773da82190dc 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -36,9 +36,10 @@ MODULE_ALIAS("ipt_connmark"); MODULE_ALIAS("ip6t_connmark"); static unsigned int -connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) +connmark_tg_shift(struct sk_buff *skb, + const struct xt_connmark_tginfo1 *info, + u8 shift_bits, u8 shift_dir) { - const struct xt_connmark_tginfo1 *info = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; u_int32_t newmark; @@ -50,6 +51,10 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) switch (info->mode) { case XT_CONNMARK_SET: newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; + if (shift_dir == D_SHIFT_RIGHT) + newmark >>= shift_bits; + else + newmark <<= shift_bits; if (ct->mark != newmark) { ct->mark = newmark; nf_conntrack_event_cache(IPCT_MARK, ct); @@ -57,7 +62,11 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) break; case XT_CONNMARK_SAVE: newmark = (ct->mark & ~info->ctmask) ^ - (skb->mark & info->nfmask); + (skb->mark & info->nfmask); + if (shift_dir == D_SHIFT_RIGHT) + newmark >>= shift_bits; + else + newmark <<= shift_bits; if (ct->mark != newmark) { ct->mark = newmark; nf_conntrack_event_cache(IPCT_MARK, ct); @@ -65,14 +74,34 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) break; case XT_CONNMARK_RESTORE: newmark = (skb->mark & ~info->nfmask) ^ - (ct->mark & info->ctmask); + (ct->mark & info->ctmask); + if (shift_dir == D_SHIFT_RIGHT) + newmark >>= shift_bits; + else + newmark <<= shift_bits; skb->mark = newmark; break; } - return XT_CONTINUE; } +static unsigned int +connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_connmark_tginfo1 *info = par->targinfo; + + return connmark_tg_shift(skb, info, 0, 0); +} + +static unsigned int +connmark_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_connmark_tginfo2 *info = par->targinfo; + + return connmark_tg_shift(skb, (const struct xt_connmark_tginfo1 *)info, + info->shift_bits, info->shift_dir); +} + static int connmark_tg_check(const struct xt_tgchk_param *par) { int ret; @@ -119,15 +148,27 @@ static void connmark_mt_destroy(const struct xt_mtdtor_param *par) nf_ct_netns_put(par->net, par->family); } -static struct xt_target connmark_tg_reg __read_mostly = { - .name = "CONNMARK", - .revision = 1, - .family = NFPROTO_UNSPEC, - .checkentry = connmark_tg_check, - .target = connmark_tg, - .targetsize = sizeof(struct xt_connmark_tginfo1), - .destroy = connmark_tg_destroy, - .me = THIS_MODULE, +static struct xt_target connmark_tg_reg[] __read_mostly = { + { + .name = "CONNMARK", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_tg_check, + .target = connmark_tg, + .targetsize = sizeof(struct xt_connmark_tginfo1), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + }, + { + .name = "CONNMARK", + .revision = 2, + .family = NFPROTO_UNSPEC, + .checkentry = connmark_tg_check, + .target = connmark_tg_v2, + .targetsize = sizeof(struct xt_connmark_tginfo2), + .destroy = connmark_tg_destroy, + .me = THIS_MODULE, + } }; static struct xt_match connmark_mt_reg __read_mostly = { @@ -145,12 +186,14 @@ static int __init connmark_mt_init(void) { int ret; - ret = xt_register_target(&connmark_tg_reg); + ret = xt_register_targets(connmark_tg_reg, + ARRAY_SIZE(connmark_tg_reg)); if (ret < 0) return ret; ret = xt_register_match(&connmark_mt_reg); if (ret < 0) { - xt_unregister_target(&connmark_tg_reg); + xt_unregister_targets(connmark_tg_reg, + ARRAY_SIZE(connmark_tg_reg)); return ret; } return 0; @@ -159,7 +202,7 @@ static int __init connmark_mt_init(void) static void __exit connmark_mt_exit(void) { xt_unregister_match(&connmark_mt_reg); - xt_unregister_target(&connmark_tg_reg); + xt_unregister_target(connmark_tg_reg); } module_init(connmark_mt_init); -- cgit v1.2.3-70-g09d2 From 5191d70f83fd1878c40029cffe69f6a2bf65fa0e Mon Sep 17 00:00:00 2001 From: Arushi Singhal Date: Mon, 12 Mar 2018 18:36:29 +0530 Subject: netfilter: Replace printk() with pr_*() and define pr_fmt() Using pr_() is more concise than printk(KERN_). This patch: * Replace printks having a log level with the appropriate pr_*() macros. * Define pr_fmt() to include relevant name. * Remove redundant prefixes from pr_*() calls. * Indent the code where possible. * Remove the useless output messages. * Remove periods from messages. Signed-off-by: Arushi Singhal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_acct.c | 6 ++++-- net/netfilter/nf_conntrack_ecache.c | 6 ++++-- net/netfilter/nf_conntrack_timestamp.c | 6 ++++-- net/netfilter/nf_nat_core.c | 4 +++- net/netfilter/nf_nat_ftp.c | 7 ++++--- net/netfilter/nf_nat_irc.c | 7 ++++--- net/netfilter/nfnetlink_queue.c | 14 +++++++------- net/netfilter/xt_time.c | 13 +++++++------ 8 files changed, 37 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c index 866916712905..1d66de5151b2 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -8,6 +8,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -80,7 +82,7 @@ static int nf_conntrack_acct_init_sysctl(struct net *net) net->ct.acct_sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!net->ct.acct_sysctl_header) { - printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n"); + pr_err("can't register to sysctl\n"); goto out_register; } return 0; @@ -125,7 +127,7 @@ int nf_conntrack_acct_init(void) { int ret = nf_ct_extend_register(&acct_extend); if (ret < 0) - pr_err("nf_conntrack_acct: Unable to register extension\n"); + pr_err("Unable to register extension\n"); return ret; } diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index caac41ad9483..c11822a7d2bf 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -11,6 +11,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -372,7 +374,7 @@ static int nf_conntrack_event_init_sysctl(struct net *net) net->ct.event_sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!net->ct.event_sysctl_header) { - printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n"); + pr_err("can't register to sysctl\n"); goto out_register; } return 0; @@ -419,7 +421,7 @@ int nf_conntrack_ecache_init(void) { int ret = nf_ct_extend_register(&event_extend); if (ret < 0) - pr_err("nf_ct_event: Unable to register event extension.\n"); + pr_err("Unable to register event extension\n"); BUILD_BUG_ON(__IPCT_MAX >= 16); /* ctmask, missed use u16 */ diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c index 4c4734b78318..56766cb26e40 100644 --- a/net/netfilter/nf_conntrack_timestamp.c +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -6,6 +6,8 @@ * published by the Free Software Foundation (or any later at your option). */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -58,7 +60,7 @@ static int nf_conntrack_tstamp_init_sysctl(struct net *net) net->ct.tstamp_sysctl_header = register_net_sysctl(net, "net/netfilter", table); if (!net->ct.tstamp_sysctl_header) { - printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n"); + pr_err("can't register to sysctl\n"); goto out_register; } return 0; @@ -104,7 +106,7 @@ int nf_conntrack_tstamp_init(void) int ret; ret = nf_ct_extend_register(&tstamp_extend); if (ret < 0) - pr_err("nf_ct_tstamp: Unable to register extension\n"); + pr_err("Unable to register extension\n"); return ret; } diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 6c38421e31f9..617693ff9f4c 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -8,6 +8,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -814,7 +816,7 @@ static int __init nf_nat_init(void) ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); - printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); + pr_err("Unable to register extension\n"); return ret; } diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c index d76afafdc699..5063cbf1689c 100644 --- a/net/netfilter/nf_nat_ftp.c +++ b/net/netfilter/nf_nat_ftp.c @@ -8,6 +8,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -71,7 +73,7 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb, char buffer[sizeof("|1||65535|") + INET6_ADDRSTRLEN]; unsigned int buflen; - pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); + pr_debug("type %i, off %u len %u\n", type, matchoff, matchlen); /* Connection will come from wherever this packet goes, hence !dir */ newaddr = ct->tuplehash[!dir].tuple.dst.u3; @@ -136,8 +138,7 @@ static int __init nf_nat_ftp_init(void) /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ static int warn_set(const char *val, const struct kernel_param *kp) { - printk(KERN_INFO KBUILD_MODNAME - ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + pr_info("kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); return 0; } module_param_call(ports, warn_set, NULL, NULL, 0); diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c index dcb5f6375d9d..3aa35a43100d 100644 --- a/net/netfilter/nf_nat_irc.c +++ b/net/netfilter/nf_nat_irc.c @@ -10,6 +10,8 @@ * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -79,7 +81,7 @@ static unsigned int help(struct sk_buff *skb, */ /* AAA = "us", ie. where server normally talks to. */ snprintf(buffer, sizeof(buffer), "%u %u", ntohl(newaddr.ip), port); - pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n", + pr_debug("inserting '%s' == %pI4, port %u\n", buffer, &newaddr.ip, port); if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff, @@ -108,8 +110,7 @@ static int __init nf_nat_irc_init(void) /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ static int warn_set(const char *val, const struct kernel_param *kp) { - printk(KERN_INFO KBUILD_MODNAME - ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + pr_info("kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); return 0; } module_param_call(ports, warn_set, NULL, NULL, 0); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 8bba23160a68..74a04638ef03 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -14,6 +14,9 @@ * published by the Free Software Foundation. * */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -833,11 +836,8 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) if (diff > skb_tailroom(e->skb)) { nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), diff, GFP_ATOMIC); - if (!nskb) { - printk(KERN_WARNING "nf_queue: OOM " - "in mangle, dropping packet\n"); + if (!nskb) return -ENOMEM; - } kfree_skb(e->skb); e->skb = nskb; } @@ -1536,20 +1536,20 @@ static int __init nfnetlink_queue_init(void) status = register_pernet_subsys(&nfnl_queue_net_ops); if (status < 0) { - pr_err("nf_queue: failed to register pernet ops\n"); + pr_err("failed to register pernet ops\n"); goto out; } netlink_register_notifier(&nfqnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfqnl_subsys); if (status < 0) { - pr_err("nf_queue: failed to create netlink socket\n"); + pr_err("failed to create netlink socket\n"); goto cleanup_netlink_notifier; } status = register_netdevice_notifier(&nfqnl_dev_notifier); if (status < 0) { - pr_err("nf_queue: failed to register netdevice notifier\n"); + pr_err("failed to register netdevice notifier\n"); goto cleanup_netlink_subsys; } diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 0160f505e337..c13bcd0ab491 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -9,6 +9,9 @@ * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from gnu.org/gpl. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -266,13 +269,11 @@ static int __init time_mt_init(void) int minutes = sys_tz.tz_minuteswest; if (minutes < 0) /* east of Greenwich */ - printk(KERN_INFO KBUILD_MODNAME - ": kernel timezone is +%02d%02d\n", - -minutes / 60, -minutes % 60); + pr_info("kernel timezone is +%02d%02d\n", + -minutes / 60, -minutes % 60); else /* west of Greenwich */ - printk(KERN_INFO KBUILD_MODNAME - ": kernel timezone is -%02d%02d\n", - minutes / 60, minutes % 60); + pr_info("kernel timezone is -%02d%02d\n", + minutes / 60, minutes % 60); return xt_register_match(&xt_time_mt_reg); } -- cgit v1.2.3-70-g09d2 From 20710b3b81895c89e92bcc32ce85c0bede1171f8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 20 Mar 2018 12:33:51 +0100 Subject: netfilter: ctnetlink: synproxy support This patch exposes synproxy information per-conntrack. Moreover, send sequence adjustment events once server sends us the SYN,ACK packet, so we can synchronize the sequence adjustment too for packets going as reply from the server, as part of the synproxy logic. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_common.h | 1 + include/uapi/linux/netfilter/nfnetlink_conntrack.h | 10 +++ net/ipv4/netfilter/ipt_SYNPROXY.c | 8 +- net/ipv6/netfilter/ip6t_SYNPROXY.c | 8 +- net/netfilter/nf_conntrack_netlink.c | 87 +++++++++++++++++++++- 5 files changed, 109 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 9574bd40870b..c712eb6879f1 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -129,6 +129,7 @@ enum ip_conntrack_events { IPCT_NATSEQADJ = IPCT_SEQADJ, IPCT_SECMARK, /* new security mark has been set */ IPCT_LABEL, /* new connlabel has been set */ + IPCT_SYNPROXY, /* synproxy has been set */ #ifdef __KERNEL__ __IPCT_MAX #endif diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 7397e022ce6e..77987111cab0 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -54,6 +54,7 @@ enum ctattr_type { CTA_MARK_MASK, CTA_LABELS, CTA_LABELS_MASK, + CTA_SYNPROXY, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) @@ -190,6 +191,15 @@ enum ctattr_natseq { }; #define CTA_NAT_SEQ_MAX (__CTA_NAT_SEQ_MAX - 1) +enum ctattr_synproxy { + CTA_SYNPROXY_UNSPEC, + CTA_SYNPROXY_ISN, + CTA_SYNPROXY_ITS, + CTA_SYNPROXY_TSOFF, + __CTA_SYNPROXY_MAX, +}; +#define CTA_SYNPROXY_MAX (__CTA_SYNPROXY_MAX - 1) + enum ctattr_expect { CTA_EXPECT_UNSPEC, CTA_EXPECT_MASTER, diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index f75fc6b53115..690b17ef6a44 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -16,6 +16,7 @@ #include #include #include +#include static struct iphdr * synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, @@ -384,6 +385,8 @@ static unsigned int ipv4_synproxy_hook(void *priv, synproxy->isn = ntohl(th->ack_seq); if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy->its = opts.tsecr; + + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); break; case TCP_CONNTRACK_SYN_RECV: if (!th->syn || !th->ack) @@ -392,8 +395,10 @@ static unsigned int ipv4_synproxy_hook(void *priv, if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } opts.options &= ~(XT_SYNPROXY_OPT_MSS | XT_SYNPROXY_OPT_WSCALE | @@ -403,6 +408,7 @@ static unsigned int ipv4_synproxy_hook(void *priv, synproxy_send_server_ack(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); swap(opts.tsval, opts.tsecr); synproxy_send_client_ack(net, skb, th, &opts); diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 437af8c95277..cb6d42b03cb5 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -18,6 +18,7 @@ #include #include #include +#include static struct ipv6hdr * synproxy_build_ip(struct net *net, struct sk_buff *skb, @@ -405,6 +406,8 @@ static unsigned int ipv6_synproxy_hook(void *priv, synproxy->isn = ntohl(th->ack_seq); if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy->its = opts.tsecr; + + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); break; case TCP_CONNTRACK_SYN_RECV: if (!th->syn || !th->ack) @@ -413,8 +416,10 @@ static unsigned int ipv6_synproxy_hook(void *priv, if (!synproxy_parse_options(skb, thoff, th, &opts)) return NF_DROP; - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) + if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { synproxy->tsoff = opts.tsval - synproxy->its; + nf_conntrack_event_cache(IPCT_SYNPROXY, ct); + } opts.options &= ~(XT_SYNPROXY_OPT_MSS | XT_SYNPROXY_OPT_WSCALE | @@ -424,6 +429,7 @@ static unsigned int ipv6_synproxy_hook(void *priv, synproxy_send_server_ack(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); + nf_conntrack_event_cache(IPCT_SEQADJ, ct); swap(opts.tsval, opts.tsecr); synproxy_send_client_ack(net, skb, th, &opts); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 8884d302d33a..11ef85a57244 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -440,6 +440,31 @@ err: return -1; } +static int ctnetlink_dump_ct_synproxy(struct sk_buff *skb, struct nf_conn *ct) +{ + struct nf_conn_synproxy *synproxy = nfct_synproxy(ct); + struct nlattr *nest_parms; + + if (!synproxy) + return 0; + + nest_parms = nla_nest_start(skb, CTA_SYNPROXY | NLA_F_NESTED); + if (!nest_parms) + goto nla_put_failure; + + if (nla_put_be32(skb, CTA_SYNPROXY_ISN, htonl(synproxy->isn)) || + nla_put_be32(skb, CTA_SYNPROXY_ITS, htonl(synproxy->its)) || + nla_put_be32(skb, CTA_SYNPROXY_TSOFF, htonl(synproxy->tsoff))) + goto nla_put_failure; + + nla_nest_end(skb, nest_parms); + + return 0; + +nla_put_failure: + return -1; +} + static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct))) @@ -518,7 +543,8 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, ctnetlink_dump_id(skb, ct) < 0 || ctnetlink_dump_use(skb, ct) < 0 || ctnetlink_dump_master(skb, ct) < 0 || - ctnetlink_dump_ct_seq_adj(skb, ct) < 0) + ctnetlink_dump_ct_seq_adj(skb, ct) < 0 || + ctnetlink_dump_ct_synproxy(skb, ct) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -730,6 +756,10 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) if (events & (1 << IPCT_SEQADJ) && ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; + + if (events & (1 << IPCT_SYNPROXY) && + ctnetlink_dump_ct_synproxy(skb, ct) < 0) + goto nla_put_failure; } #ifdef CONFIG_NF_CONNTRACK_MARK @@ -1689,6 +1719,39 @@ err: return ret; } +static const struct nla_policy synproxy_policy[CTA_SYNPROXY_MAX + 1] = { + [CTA_SYNPROXY_ISN] = { .type = NLA_U32 }, + [CTA_SYNPROXY_ITS] = { .type = NLA_U32 }, + [CTA_SYNPROXY_TSOFF] = { .type = NLA_U32 }, +}; + +static int ctnetlink_change_synproxy(struct nf_conn *ct, + const struct nlattr * const cda[]) +{ + struct nf_conn_synproxy *synproxy = nfct_synproxy(ct); + struct nlattr *tb[CTA_SYNPROXY_MAX + 1]; + int err; + + if (!synproxy) + return 0; + + err = nla_parse_nested(tb, CTA_SYNPROXY_MAX, cda[CTA_SYNPROXY], + synproxy_policy, NULL); + if (err < 0) + return err; + + if (!tb[CTA_SYNPROXY_ISN] || + !tb[CTA_SYNPROXY_ITS] || + !tb[CTA_SYNPROXY_TSOFF]) + return -EINVAL; + + synproxy->isn = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ISN])); + synproxy->its = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ITS])); + synproxy->tsoff = ntohl(nla_get_be32(tb[CTA_SYNPROXY_TSOFF])); + + return 0; +} + static int ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[]) { @@ -1759,6 +1822,12 @@ ctnetlink_change_conntrack(struct nf_conn *ct, return err; } + if (cda[CTA_SYNPROXY]) { + err = ctnetlink_change_synproxy(ct, cda); + if (err < 0) + return err; + } + if (cda[CTA_LABELS]) { err = ctnetlink_attach_labels(ct, cda); if (err < 0) @@ -1880,6 +1949,12 @@ ctnetlink_create_conntrack(struct net *net, goto err2; } + if (cda[CTA_SYNPROXY]) { + err = ctnetlink_change_synproxy(ct, cda); + if (err < 0) + goto err2; + } + #if defined(CONFIG_NF_CONNTRACK_MARK) if (cda[CTA_MARK]) ct->mark = ntohl(nla_get_be32(cda[CTA_MARK])); @@ -1991,7 +2066,9 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, (1 << IPCT_HELPER) | (1 << IPCT_PROTOINFO) | (1 << IPCT_SEQADJ) | - (1 << IPCT_MARK) | events, + (1 << IPCT_MARK) | + (1 << IPCT_SYNPROXY) | + events, ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); nf_ct_put(ct); @@ -2012,7 +2089,8 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, (1 << IPCT_LABEL) | (1 << IPCT_PROTOINFO) | (1 << IPCT_SEQADJ) | - (1 << IPCT_MARK), + (1 << IPCT_MARK) | + (1 << IPCT_SYNPROXY), ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); } @@ -2282,6 +2360,9 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) ctnetlink_dump_ct_seq_adj(skb, ct) < 0) goto nla_put_failure; + if (ctnetlink_dump_ct_synproxy(skb, ct) < 0) + goto nla_put_failure; + #ifdef CONFIG_NF_CONNTRACK_MARK if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0) goto nla_put_failure; -- cgit v1.2.3-70-g09d2 From 5adc1668ddc42bb44fd6d006cacad74ed0cbf49d Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Sun, 4 Mar 2018 09:28:53 +0100 Subject: netfilter: ebtables: add support for matching ICMP type and code We already have ICMPv6 type/code matches. This adds support for IPv4 ICMP matches in the same way. Signed-off-by: Matthias Schiffer Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_bridge/ebt_ip.h | 13 +++++++-- net/bridge/netfilter/ebt_ip.c | 43 +++++++++++++++++++++------- 2 files changed, 43 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter_bridge/ebt_ip.h b/include/uapi/linux/netfilter_bridge/ebt_ip.h index 8e462fb1983f..4ed7fbb0a482 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_ip.h +++ b/include/uapi/linux/netfilter_bridge/ebt_ip.h @@ -24,8 +24,9 @@ #define EBT_IP_PROTO 0x08 #define EBT_IP_SPORT 0x10 #define EBT_IP_DPORT 0x20 +#define EBT_IP_ICMP 0x40 #define EBT_IP_MASK (EBT_IP_SOURCE | EBT_IP_DEST | EBT_IP_TOS | EBT_IP_PROTO |\ - EBT_IP_SPORT | EBT_IP_DPORT ) + EBT_IP_SPORT | EBT_IP_DPORT | EBT_IP_ICMP) #define EBT_IP_MATCH "ip" /* the same values are used for the invflags */ @@ -38,8 +39,14 @@ struct ebt_ip_info { __u8 protocol; __u8 bitmask; __u8 invflags; - __u16 sport[2]; - __u16 dport[2]; + union { + __u16 sport[2]; + __u8 icmp_type[2]; + }; + union { + __u16 dport[2]; + __u8 icmp_code[2]; + }; }; #endif diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c index 2b46c50abce0..8cb8f8395768 100644 --- a/net/bridge/netfilter/ebt_ip.c +++ b/net/bridge/netfilter/ebt_ip.c @@ -19,9 +19,15 @@ #include #include -struct tcpudphdr { - __be16 src; - __be16 dst; +union pkthdr { + struct { + __be16 src; + __be16 dst; + } tcpudphdr; + struct { + u8 type; + u8 code; + } icmphdr; }; static bool @@ -30,8 +36,8 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct ebt_ip_info *info = par->matchinfo; const struct iphdr *ih; struct iphdr _iph; - const struct tcpudphdr *pptr; - struct tcpudphdr _ports; + const union pkthdr *pptr; + union pkthdr _pkthdr; ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (ih == NULL) @@ -50,29 +56,38 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) if (info->bitmask & EBT_IP_PROTO) { if (NF_INVF(info, EBT_IP_PROTO, info->protocol != ih->protocol)) return false; - if (!(info->bitmask & EBT_IP_DPORT) && - !(info->bitmask & EBT_IP_SPORT)) + if (!(info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT | + EBT_IP_ICMP))) return true; if (ntohs(ih->frag_off) & IP_OFFSET) return false; + + /* min icmp headersize is 4, so sizeof(_pkthdr) is ok. */ pptr = skb_header_pointer(skb, ih->ihl*4, - sizeof(_ports), &_ports); + sizeof(_pkthdr), &_pkthdr); if (pptr == NULL) return false; if (info->bitmask & EBT_IP_DPORT) { - u32 dst = ntohs(pptr->dst); + u32 dst = ntohs(pptr->tcpudphdr.dst); if (NF_INVF(info, EBT_IP_DPORT, dst < info->dport[0] || dst > info->dport[1])) return false; } if (info->bitmask & EBT_IP_SPORT) { - u32 src = ntohs(pptr->src); + u32 src = ntohs(pptr->tcpudphdr.src); if (NF_INVF(info, EBT_IP_SPORT, src < info->sport[0] || src > info->sport[1])) return false; } + if ((info->bitmask & EBT_IP_ICMP) && + NF_INVF(info, EBT_IP_ICMP, + pptr->icmphdr.type < info->icmp_type[0] || + pptr->icmphdr.type > info->icmp_type[1] || + pptr->icmphdr.code < info->icmp_code[0] || + pptr->icmphdr.code > info->icmp_code[1])) + return false; } return true; } @@ -101,6 +116,14 @@ static int ebt_ip_mt_check(const struct xt_mtchk_param *par) return -EINVAL; if (info->bitmask & EBT_IP_SPORT && info->sport[0] > info->sport[1]) return -EINVAL; + if (info->bitmask & EBT_IP_ICMP) { + if ((info->invflags & EBT_IP_PROTO) || + info->protocol != IPPROTO_ICMP) + return -EINVAL; + if (info->icmp_type[0] > info->icmp_type[1] || + info->icmp_code[0] > info->icmp_code[1]) + return -EINVAL; + } return 0; } -- cgit v1.2.3-70-g09d2 From 78d9f4d49bbecd101b4e5faf19f8f70719fee2ca Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Sun, 4 Mar 2018 09:28:54 +0100 Subject: netfilter: ebtables: add support for matching IGMP type We already have ICMPv6 type/code matches (which can be used to distinguish different types of MLD packets). Add support for IPv4 IGMP matches in the same way. Signed-off-by: Matthias Schiffer Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_bridge/ebt_ip.h | 4 +++- net/bridge/netfilter/ebt_ip.c | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter_bridge/ebt_ip.h b/include/uapi/linux/netfilter_bridge/ebt_ip.h index 4ed7fbb0a482..46d6261370b0 100644 --- a/include/uapi/linux/netfilter_bridge/ebt_ip.h +++ b/include/uapi/linux/netfilter_bridge/ebt_ip.h @@ -25,8 +25,9 @@ #define EBT_IP_SPORT 0x10 #define EBT_IP_DPORT 0x20 #define EBT_IP_ICMP 0x40 +#define EBT_IP_IGMP 0x80 #define EBT_IP_MASK (EBT_IP_SOURCE | EBT_IP_DEST | EBT_IP_TOS | EBT_IP_PROTO |\ - EBT_IP_SPORT | EBT_IP_DPORT | EBT_IP_ICMP) + EBT_IP_SPORT | EBT_IP_DPORT | EBT_IP_ICMP | EBT_IP_IGMP) #define EBT_IP_MATCH "ip" /* the same values are used for the invflags */ @@ -42,6 +43,7 @@ struct ebt_ip_info { union { __u16 sport[2]; __u8 icmp_type[2]; + __u8 igmp_type[2]; }; union { __u16 dport[2]; diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c index 8cb8f8395768..ffaa8ce2e724 100644 --- a/net/bridge/netfilter/ebt_ip.c +++ b/net/bridge/netfilter/ebt_ip.c @@ -28,6 +28,9 @@ union pkthdr { u8 type; u8 code; } icmphdr; + struct { + u8 type; + } igmphdr; }; static bool @@ -57,12 +60,12 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) if (NF_INVF(info, EBT_IP_PROTO, info->protocol != ih->protocol)) return false; if (!(info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT | - EBT_IP_ICMP))) + EBT_IP_ICMP | EBT_IP_IGMP))) return true; if (ntohs(ih->frag_off) & IP_OFFSET) return false; - /* min icmp headersize is 4, so sizeof(_pkthdr) is ok. */ + /* min icmp/igmp headersize is 4, so sizeof(_pkthdr) is ok. */ pptr = skb_header_pointer(skb, ih->ihl*4, sizeof(_pkthdr), &_pkthdr); if (pptr == NULL) @@ -88,6 +91,11 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par) pptr->icmphdr.code < info->icmp_code[0] || pptr->icmphdr.code > info->icmp_code[1])) return false; + if ((info->bitmask & EBT_IP_IGMP) && + NF_INVF(info, EBT_IP_IGMP, + pptr->igmphdr.type < info->igmp_type[0] || + pptr->igmphdr.type > info->igmp_type[1])) + return false; } return true; } @@ -124,6 +132,13 @@ static int ebt_ip_mt_check(const struct xt_mtchk_param *par) info->icmp_code[0] > info->icmp_code[1]) return -EINVAL; } + if (info->bitmask & EBT_IP_IGMP) { + if ((info->invflags & EBT_IP_PROTO) || + info->protocol != IPPROTO_IGMP) + return -EINVAL; + if (info->igmp_type[0] > info->igmp_type[1]) + return -EINVAL; + } return 0; } -- cgit v1.2.3-70-g09d2 From 1ad22fb5bb53ce6bf5377f25529c0a007c61c6f5 Mon Sep 17 00:00:00 2001 From: Tosoni Date: Wed, 14 Mar 2018 16:58:34 +0000 Subject: mac80211: inform wireless layer when frame RSSI is invalid When the low-level driver returns an invalid RSSI indication, set the signal value to 0 as an indication to the upper layer. Also, skip average level computation if signal is invalid. Signed-off-by: Jean Pierre TOSONI Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 160 ++++++++++++++++++++++++++++------------------------ net/mac80211/rx.c | 6 +- net/mac80211/scan.c | 4 +- 3 files changed, 93 insertions(+), 77 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0024eff9bb84..ea3ba03fb952 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3306,82 +3306,14 @@ static const u64 care_about_ies = (1ULL << WLAN_EID_HT_OPERATION) | (1ULL << WLAN_EID_EXT_CHANSWITCH_ANN); -static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, - struct ieee80211_mgmt *mgmt, size_t len, - struct ieee80211_rx_status *rx_status) +static void ieee80211_handle_beacon_sig(struct ieee80211_sub_if_data *sdata, + struct ieee80211_if_managed *ifmgd, + struct ieee80211_bss_conf *bss_conf, + struct ieee80211_local *local, + struct ieee80211_rx_status *rx_status) { - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; - size_t baselen; - struct ieee802_11_elems elems; - struct ieee80211_local *local = sdata->local; - struct ieee80211_chanctx_conf *chanctx_conf; - struct ieee80211_channel *chan; - struct sta_info *sta; - u32 changed = 0; - bool erp_valid; - u8 erp_value = 0; - u32 ncrc; - u8 *bssid; - u8 deauth_buf[IEEE80211_DEAUTH_FRAME_LEN]; - - sdata_assert_lock(sdata); - - /* Process beacon from the current BSS */ - baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt; - if (baselen > len) - return; - - rcu_read_lock(); - chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); - if (!chanctx_conf) { - rcu_read_unlock(); - return; - } - - if (rx_status->freq != chanctx_conf->def.chan->center_freq) { - rcu_read_unlock(); - return; - } - chan = chanctx_conf->def.chan; - rcu_read_unlock(); - - if (ifmgd->assoc_data && ifmgd->assoc_data->need_beacon && - ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->bss->bssid)) { - ieee802_11_parse_elems(mgmt->u.beacon.variable, - len - baselen, false, &elems); - - ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems); - if (elems.tim && !elems.parse_error) { - const struct ieee80211_tim_ie *tim_ie = elems.tim; - ifmgd->dtim_period = tim_ie->dtim_period; - } - ifmgd->have_beacon = true; - ifmgd->assoc_data->need_beacon = false; - if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { - sdata->vif.bss_conf.sync_tsf = - le64_to_cpu(mgmt->u.beacon.timestamp); - sdata->vif.bss_conf.sync_device_ts = - rx_status->device_timestamp; - if (elems.tim) - sdata->vif.bss_conf.sync_dtim_count = - elems.tim->dtim_count; - else - sdata->vif.bss_conf.sync_dtim_count = 0; - } - /* continue assoc process */ - ifmgd->assoc_data->timeout = jiffies; - ifmgd->assoc_data->timeout_started = true; - run_again(sdata, ifmgd->assoc_data->timeout); - return; - } - - if (!ifmgd->associated || - !ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) - return; - bssid = ifmgd->associated->bssid; - /* Track average RSSI from the Beacon frames of the current AP */ + if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) { ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE; ewma_beacon_signal_init(&ifmgd->ave_beacon_signal); @@ -3468,6 +3400,86 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, sig, GFP_KERNEL); } } +} + +static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len, + struct ieee80211_rx_status *rx_status) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; + size_t baselen; + struct ieee802_11_elems elems; + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *chanctx_conf; + struct ieee80211_channel *chan; + struct sta_info *sta; + u32 changed = 0; + bool erp_valid; + u8 erp_value = 0; + u32 ncrc; + u8 *bssid; + u8 deauth_buf[IEEE80211_DEAUTH_FRAME_LEN]; + + sdata_assert_lock(sdata); + + /* Process beacon from the current BSS */ + baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt; + if (baselen > len) + return; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + if (!chanctx_conf) { + rcu_read_unlock(); + return; + } + + if (rx_status->freq != chanctx_conf->def.chan->center_freq) { + rcu_read_unlock(); + return; + } + chan = chanctx_conf->def.chan; + rcu_read_unlock(); + + if (ifmgd->assoc_data && ifmgd->assoc_data->need_beacon && + ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->bss->bssid)) { + ieee802_11_parse_elems(mgmt->u.beacon.variable, + len - baselen, false, &elems); + + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems); + if (elems.tim && !elems.parse_error) { + const struct ieee80211_tim_ie *tim_ie = elems.tim; + ifmgd->dtim_period = tim_ie->dtim_period; + } + ifmgd->have_beacon = true; + ifmgd->assoc_data->need_beacon = false; + if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { + sdata->vif.bss_conf.sync_tsf = + le64_to_cpu(mgmt->u.beacon.timestamp); + sdata->vif.bss_conf.sync_device_ts = + rx_status->device_timestamp; + if (elems.tim) + sdata->vif.bss_conf.sync_dtim_count = + elems.tim->dtim_count; + else + sdata->vif.bss_conf.sync_dtim_count = 0; + } + /* continue assoc process */ + ifmgd->assoc_data->timeout = jiffies; + ifmgd->assoc_data->timeout_started = true; + run_again(sdata, ifmgd->assoc_data->timeout); + return; + } + + if (!ifmgd->associated || + !ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid)) + return; + bssid = ifmgd->associated->bssid; + + if (!(rx_status->flag & RX_FLAG_NO_SIGNAL_VAL)) + ieee80211_handle_beacon_sig(sdata, ifmgd, bss_conf, + local, rx_status); if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) { mlme_dbg_ratelimited(sdata, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 9c898a3688c6..27bb1f0b5e52 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2804,7 +2804,8 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) !(rx->flags & IEEE80211_RX_BEACON_REPORTED)) { int sig = 0; - if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM)) + if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM) && + !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) sig = status->signal; cfg80211_report_obss_beacon(rx->local->hw.wiphy, @@ -3145,7 +3146,8 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) * it transmitted were processed or returned. */ - if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM)) + if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM) && + !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) sig = status->signal; if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index ef2becaade50..a3b1bcc2b461 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -73,7 +73,9 @@ ieee80211_bss_info_update(struct ieee80211_local *local, bool signal_valid; struct ieee80211_sub_if_data *scan_sdata; - if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) + if (rx_status->flag & RX_FLAG_NO_SIGNAL_VAL) + bss_meta.signal = 0; /* invalid signal indication */ + else if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) bss_meta.signal = rx_status->signal * 100; else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) bss_meta.signal = (rx_status->signal * 100) / local->hw.max_signal; -- cgit v1.2.3-70-g09d2 From 2cb021f5de55b1d158fa18b0215a4613c3289a82 Mon Sep 17 00:00:00 2001 From: Dmitry Lebed Date: Thu, 1 Mar 2018 12:39:16 +0300 Subject: cfg80211/nl80211: add CAC_STARTED event CAC_STARTED event is needed for DFS offload feature and should be generated by driver/HW if DFS_OFFLOAD is enabled. Signed-off-by: Dmitry Lebed Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 +++ net/wireless/mlme.c | 7 +++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c13c84304be3..b8e989073cbb 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5204,6 +5204,8 @@ enum nl80211_smps_mode { * non-operating channel is expired and no longer valid. New CAC must * be done on this channel before starting the operation. This is not * applicable for ETSI dfs domain where pre-CAC is valid for ever. + * @NL80211_RADAR_CAC_STARTED: Channel Availability Check has been started, + * should be generated by HW if NL80211_EXT_FEATURE_DFS_OFFLOAD is enabled. */ enum nl80211_radar_event { NL80211_RADAR_DETECTED, @@ -5211,6 +5213,7 @@ enum nl80211_radar_event { NL80211_RADAR_CAC_ABORTED, NL80211_RADAR_NOP_FINISHED, NL80211_RADAR_PRE_CAC_EXPIRED, + NL80211_RADAR_CAC_STARTED, }; /** diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index bbb9907bfa86..6b6818dd76bd 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -888,14 +888,17 @@ void cfg80211_cac_event(struct net_device *netdev, sizeof(struct cfg80211_chan_def)); queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk); cfg80211_sched_dfs_chan_update(rdev); - break; + /* fall through */ case NL80211_RADAR_CAC_ABORTED: + wdev->cac_started = false; + break; + case NL80211_RADAR_CAC_STARTED: + wdev->cac_started = true; break; default: WARN_ON(1); return; } - wdev->cac_started = false; nl80211_radar_notify(rdev, chandef, event, netdev, gfp); } -- cgit v1.2.3-70-g09d2 From 13cf6dec93e021ebd297619ea1926aea31b6430b Mon Sep 17 00:00:00 2001 From: Dmitry Lebed Date: Thu, 1 Mar 2018 12:39:15 +0300 Subject: cfg80211/nl80211: add DFS offload flag Add wiphy EXT_FEATURE flag to indicate that HW or driver does all DFS actions by itself. User-space functionality already implemented in hostapd using vendor-specific (QCA) OUI to advertise DFS offload support. Need to introduce generic flag to inform about DFS offload support. For devices with DFS_OFFLOAD flag set user-space will no longer need to issue CAC or do any actions in response to "radar detected" events. HW will do everything by itself and send events to user-space to indicate that CAC was started/finished, etc. Signed-off-by: Dmitrii Lebed Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 7 +++++++ net/wireless/nl80211.c | 12 ++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index b8e989073cbb..60fefc5a2ea4 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4999,6 +4999,12 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_LOW_SPAN_SCAN: Driver supports low span scan. * @NL80211_EXT_FEATURE_LOW_POWER_SCAN: Driver supports low power scan. * @NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN: Driver supports high accuracy scan. + * @NL80211_EXT_FEATURE_DFS_OFFLOAD: HW/driver will offload DFS actions. + * Device or driver will do all DFS-related actions by itself, + * informing user-space about CAC progress, radar detection event, + * channel change triggered by radar detection event. + * No need to start CAC from user-space, no need to react to + * "radar detected" event. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -5029,6 +5035,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_LOW_SPAN_SCAN, NL80211_EXT_FEATURE_LOW_POWER_SCAN, NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN, + NL80211_EXT_FEATURE_DFS_OFFLOAD, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a910150f8169..fe27ab443d01 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7551,12 +7551,13 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; struct cfg80211_chan_def chandef; enum nl80211_dfs_regions dfs_region; unsigned int cac_time_ms; int err; - dfs_region = reg_get_dfs_region(wdev->wiphy); + dfs_region = reg_get_dfs_region(wiphy); if (dfs_region == NL80211_DFS_UNSET) return -EINVAL; @@ -7570,17 +7571,20 @@ static int nl80211_start_radar_detection(struct sk_buff *skb, if (wdev->cac_started) return -EBUSY; - err = cfg80211_chandef_dfs_required(wdev->wiphy, &chandef, - wdev->iftype); + err = cfg80211_chandef_dfs_required(wiphy, &chandef, wdev->iftype); if (err < 0) return err; if (err == 0) return -EINVAL; - if (!cfg80211_chandef_dfs_usable(wdev->wiphy, &chandef)) + if (!cfg80211_chandef_dfs_usable(wiphy, &chandef)) return -EINVAL; + /* CAC start is offloaded to HW and can't be started manually */ + if (wiphy_ext_feature_isset(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD)) + return -EOPNOTSUPP; + if (!rdev->ops->start_radar_detection) return -EOPNOTSUPP; -- cgit v1.2.3-70-g09d2 From 1ae7762760736d4f7e4ea43e9ed03a608685c3d9 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Mar 2018 14:39:05 +0300 Subject: net: Convert can_pernet_ops These pernet_operations create and destroy /proc entries and cancel per-net timer. Also, there are unneed iterations over empty list of net devices, since all net devices must be already moved to init_net or unregistered by default_device_ops. This already was mentioned here: https://marc.info/?l=linux-can&m=150169589119335&w=2 So, it looks safe to make them async. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/can/af_can.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index 6da324550eec..e899970398a1 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -954,6 +954,7 @@ static struct notifier_block can_netdev_notifier __read_mostly = { static struct pernet_operations can_pernet_ops __read_mostly = { .init = can_pernet_init, .exit = can_pernet_exit, + .async = true, }; static __init int can_init(void) -- cgit v1.2.3-70-g09d2 From 08012631d6277cebe388971cd84fe14d1cc49a00 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Mar 2018 14:45:37 +0300 Subject: net: Convert lowpan_frags_ops These pernet_operations register and unregister sysctl. Also, there is inet_frags_exit_net() called in exit method, which has to be safe after a560002437d3 "net: Fix hlist corruptions in inet_evict_bucket()". Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ieee802154/6lowpan/reassembly.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 85bf86ad6b18..a9ccb1322f69 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -603,6 +603,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net) static struct pernet_operations lowpan_frags_ops = { .init = lowpan_frags_init_net, .exit = lowpan_frags_exit_net, + .async = true, }; int __init lowpan_net_frag_init(void) -- cgit v1.2.3-70-g09d2 From aa65f636540539e2e1fd77bdcd8fc7060d19d47b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Mar 2018 14:45:46 +0300 Subject: net: Convert nf_ct_net_ops These pernet_operations register and unregister sysctl. Also, there is inet_frags_exit_net() called in exit method, which has to be safe after a560002437d3 "net: Fix hlist corruptions in inet_evict_bucket()". Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index b84ce3e6d728..34136fe80ed5 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -646,6 +646,7 @@ static void nf_ct_net_exit(struct net *net) static struct pernet_operations nf_ct_net_ops = { .init = nf_ct_net_init, .exit = nf_ct_net_exit, + .async = true, }; int nf_ct_frag6_init(void) -- cgit v1.2.3-70-g09d2 From bdf5bd7f21323493dbe5f2c723dc33f2fbb0241a Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 19 Mar 2018 06:52:48 -0700 Subject: rds: tcp: remove register_netdevice_notifier infrastructure. The netns deletion path does not need to wait for all net_devices to be unregistered before dismantling rds_tcp state for the netns (we are able to dismantle this state on module unload even when all net_devices are active so there is no dependency here). This patch removes code related to netdevice notifiers and refactors all the code needed to dismantle rds_tcp state into a ->exit callback for the pernet_operations used with register_pernet_device(). Signed-off-by: Sowmini Varadhan Reviewed-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/rds/tcp.c | 93 +++++++++++++++-------------------------------------------- 1 file changed, 23 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 08ea9cd5c2f6..4f3a32c38bf5 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -485,40 +485,6 @@ fail: return err; } -static void __net_exit rds_tcp_exit_net(struct net *net) -{ - struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); - - if (rtn->rds_tcp_sysctl) - unregister_net_sysctl_table(rtn->rds_tcp_sysctl); - - if (net != &init_net && rtn->ctl_table) - kfree(rtn->ctl_table); - - /* If rds_tcp_exit_net() is called as a result of netns deletion, - * the rds_tcp_kill_sock() device notifier would already have cleaned - * up the listen socket, thus there is no work to do in this function. - * - * If rds_tcp_exit_net() is called as a result of module unload, - * i.e., due to rds_tcp_exit() -> unregister_pernet_subsys(), then - * we do need to clean up the listen socket here. - */ - if (rtn->rds_tcp_listen_sock) { - struct socket *lsock = rtn->rds_tcp_listen_sock; - - rtn->rds_tcp_listen_sock = NULL; - rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); - } -} - -static struct pernet_operations rds_tcp_net_ops = { - .init = rds_tcp_init_net, - .exit = rds_tcp_exit_net, - .id = &rds_tcp_netid, - .size = sizeof(struct rds_tcp_net), - .async = true, -}; - static void rds_tcp_kill_sock(struct net *net) { struct rds_tcp_connection *tc, *_tc; @@ -546,40 +512,38 @@ static void rds_tcp_kill_sock(struct net *net) rds_conn_destroy(tc->t_cpath->cp_conn); } -void *rds_tcp_listen_sock_def_readable(struct net *net) +static void __net_exit rds_tcp_exit_net(struct net *net) { struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); - struct socket *lsock = rtn->rds_tcp_listen_sock; - if (!lsock) - return NULL; + rds_tcp_kill_sock(net); - return lsock->sk->sk_user_data; + if (rtn->rds_tcp_sysctl) + unregister_net_sysctl_table(rtn->rds_tcp_sysctl); + + if (net != &init_net && rtn->ctl_table) + kfree(rtn->ctl_table); } -static int rds_tcp_dev_event(struct notifier_block *this, - unsigned long event, void *ptr) +static struct pernet_operations rds_tcp_net_ops = { + .init = rds_tcp_init_net, + .exit = rds_tcp_exit_net, + .id = &rds_tcp_netid, + .size = sizeof(struct rds_tcp_net), + .async = true, +}; + +void *rds_tcp_listen_sock_def_readable(struct net *net) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + struct socket *lsock = rtn->rds_tcp_listen_sock; - /* rds-tcp registers as a pernet subys, so the ->exit will only - * get invoked after network acitivity has quiesced. We need to - * clean up all sockets to quiesce network activity, and use - * the unregistration of the per-net loopback device as a trigger - * to start that cleanup. - */ - if (event == NETDEV_UNREGISTER_FINAL && - dev->ifindex == LOOPBACK_IFINDEX) - rds_tcp_kill_sock(dev_net(dev)); + if (!lsock) + return NULL; - return NOTIFY_DONE; + return lsock->sk->sk_user_data; } -static struct notifier_block rds_tcp_dev_notifier = { - .notifier_call = rds_tcp_dev_event, - .priority = -10, /* must be called after other network notifiers */ -}; - /* when sysctl is used to modify some kernel socket parameters,this * function resets the RDS connections in that netns so that we can * restart with new parameters. The assumption is that such reset @@ -625,9 +589,7 @@ static void rds_tcp_exit(void) rds_tcp_set_unloading(); synchronize_rcu(); rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); - unregister_pernet_subsys(&rds_tcp_net_ops); - if (unregister_netdevice_notifier(&rds_tcp_dev_notifier)) - pr_warn("could not unregister rds_tcp_dev_notifier\n"); + unregister_pernet_device(&rds_tcp_net_ops); rds_tcp_destroy_conns(); rds_trans_unregister(&rds_tcp_transport); rds_tcp_recv_exit(); @@ -651,24 +613,15 @@ static int rds_tcp_init(void) if (ret) goto out_slab; - ret = register_pernet_subsys(&rds_tcp_net_ops); + ret = register_pernet_device(&rds_tcp_net_ops); if (ret) goto out_recv; - ret = register_netdevice_notifier(&rds_tcp_dev_notifier); - if (ret) { - pr_warn("could not register rds_tcp_dev_notifier\n"); - goto out_pernet; - } - rds_trans_register(&rds_tcp_transport); rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); goto out; - -out_pernet: - unregister_pernet_subsys(&rds_tcp_net_ops); out_recv: rds_tcp_recv_exit(); out_slab: -- cgit v1.2.3-70-g09d2 From 145307460ba9c11489807de7acd3f4c7395f60b7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 20 Mar 2018 19:31:14 -0700 Subject: devlink: Remove top_hierarchy arg to devlink_resource_register top_hierarchy arg can be determined by comparing parent_resource_id to DEVLINK_RESOURCE_ID_PARENT_TOP so it does not need to be a separate argument. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 9 ++++----- drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c | 6 +++--- include/net/devlink.h | 1 - net/core/devlink.c | 4 +++- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index a120602bca26..83886a9df206 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3876,8 +3876,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core) kvd_size = MLXSW_CORE_RES_GET(mlxsw_core, KVD_SIZE); err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD, - true, kvd_size, - MLXSW_SP_RESOURCE_KVD, + kvd_size, MLXSW_SP_RESOURCE_KVD, DEVLINK_RESOURCE_ID_PARENT_TOP, &kvd_size_params, NULL); @@ -3886,7 +3885,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core) linear_size = profile->kvd_linear_size; err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_LINEAR, - false, linear_size, + linear_size, MLXSW_SP_RESOURCE_KVD_LINEAR, MLXSW_SP_RESOURCE_KVD, &linear_size_params, @@ -3904,7 +3903,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core) profile->kvd_hash_single_parts; double_size = rounddown(double_size, profile->kvd_hash_granularity); err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_HASH_DOUBLE, - false, double_size, + double_size, MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE, MLXSW_SP_RESOURCE_KVD, &hash_double_size_params, @@ -3914,7 +3913,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core) single_size = kvd_size - double_size - linear_size; err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_HASH_SINGLE, - false, single_size, + single_size, MLXSW_SP_RESOURCE_KVD_HASH_SINGLE, MLXSW_SP_RESOURCE_KVD, &hash_single_size_params, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c index 4c9bff2fa055..85503e93b93f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c @@ -459,7 +459,7 @@ int mlxsw_sp_kvdl_resources_register(struct devlink *devlink) mlxsw_sp_kvdl_resource_size_params_prepare(devlink); err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_SINGLES, - false, MLXSW_SP_KVDL_SINGLE_SIZE, + MLXSW_SP_KVDL_SINGLE_SIZE, MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE, MLXSW_SP_RESOURCE_KVD_LINEAR, &mlxsw_sp_kvdl_single_size_params, @@ -468,7 +468,7 @@ int mlxsw_sp_kvdl_resources_register(struct devlink *devlink) return err; err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_CHUNKS, - false, MLXSW_SP_KVDL_CHUNKS_SIZE, + MLXSW_SP_KVDL_CHUNKS_SIZE, MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS, MLXSW_SP_RESOURCE_KVD_LINEAR, &mlxsw_sp_kvdl_chunks_size_params, @@ -477,7 +477,7 @@ int mlxsw_sp_kvdl_resources_register(struct devlink *devlink) return err; err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD_LINEAR_LARGE_CHUNKS, - false, MLXSW_SP_KVDL_LARGE_CHUNKS_SIZE, + MLXSW_SP_KVDL_LARGE_CHUNKS_SIZE, MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS, MLXSW_SP_RESOURCE_KVD_LINEAR, &mlxsw_sp_kvdl_large_chunks_size_params, diff --git a/include/net/devlink.h b/include/net/devlink.h index c83125ad20ff..d5b707375e48 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -406,7 +406,6 @@ extern struct devlink_dpipe_header devlink_dpipe_header_ipv6; int devlink_resource_register(struct devlink *devlink, const char *resource_name, - bool top_hierarchy, u64 resource_size, u64 resource_id, u64 parent_resource_id, diff --git a/net/core/devlink.c b/net/core/devlink.c index f23e5ed7c90f..d03b96f87c25 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3174,7 +3174,6 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); */ int devlink_resource_register(struct devlink *devlink, const char *resource_name, - bool top_hierarchy, u64 resource_size, u64 resource_id, u64 parent_resource_id, @@ -3183,8 +3182,11 @@ int devlink_resource_register(struct devlink *devlink, { struct devlink_resource *resource; struct list_head *resource_list; + bool top_hierarchy; int err = 0; + top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP; + mutex_lock(&devlink->lock); resource = devlink_resource_find(devlink, NULL, resource_id); if (resource) { -- cgit v1.2.3-70-g09d2 From dfde331e757fd792e1c9579b72a8370ca665e5ed Mon Sep 17 00:00:00 2001 From: GhantaKrishnamurthy MohanKrishna Date: Wed, 21 Mar 2018 14:37:43 +0100 Subject: tipc: modify socket iterator for sock_diag The current socket iterator function tipc_nl_sk_dump, handles socket locks and calls __tipc_nl_add_sk for each socket. To reuse this logic in sock_diag implementation, we do minor modifications to make these functions generic as described below. In this commit, we add a two new functions __tipc_nl_sk_walk, __tipc_nl_add_sk_info and modify tipc_nl_sk_dump, __tipc_nl_add_sk accordingly. In __tipc_nl_sk_walk we: 1. acquire and release socket locks 2. for each socket, execute the specified callback function In __tipc_nl_add_sk we: - Move the netlink attribute insertion to __tipc_nl_add_sk_info. tipc_nl_sk_dump calls tipc_nl_sk_walk with __tipc_nl_add_sk as argument. sock_diag will use these generic functions in a later commit. There is no functional change in this commit. Acked-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: GhantaKrishnamurthy MohanKrishna Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- net/tipc/socket.c | 65 +++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index a4a9148d4629..35ac0f5b9529 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3160,16 +3160,33 @@ msg_full: return -EMSGSIZE; } +static int __tipc_nl_add_sk_info(struct sk_buff *skb, struct tipc_sock + *tsk) +{ + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = tipc_net(net); + struct sock *sk = &tsk->sk; + + if (nla_put_u32(skb, TIPC_NLA_SOCK_REF, tsk->portid) || + nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tn->own_addr)) + return -EMSGSIZE; + + if (tipc_sk_connected(sk)) { + if (__tipc_nl_add_sk_con(skb, tsk)) + return -EMSGSIZE; + } else if (!list_empty(&tsk->publications)) { + if (nla_put_flag(skb, TIPC_NLA_SOCK_HAS_PUBL)) + return -EMSGSIZE; + } + return 0; +} + /* Caller should hold socket lock for the passed tipc socket. */ static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb, struct tipc_sock *tsk) { - int err; - void *hdr; struct nlattr *attrs; - struct net *net = sock_net(skb->sk); - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct sock *sk = &tsk->sk; + void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_SOCK_GET); @@ -3179,19 +3196,10 @@ static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb, attrs = nla_nest_start(skb, TIPC_NLA_SOCK); if (!attrs) goto genlmsg_cancel; - if (nla_put_u32(skb, TIPC_NLA_SOCK_REF, tsk->portid)) - goto attr_msg_cancel; - if (nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tn->own_addr)) + + if (__tipc_nl_add_sk_info(skb, tsk)) goto attr_msg_cancel; - if (tipc_sk_connected(sk)) { - err = __tipc_nl_add_sk_con(skb, tsk); - if (err) - goto attr_msg_cancel; - } else if (!list_empty(&tsk->publications)) { - if (nla_put_flag(skb, TIPC_NLA_SOCK_HAS_PUBL)) - goto attr_msg_cancel; - } nla_nest_end(skb, attrs); genlmsg_end(skb, hdr); @@ -3205,16 +3213,19 @@ msg_cancel: return -EMSGSIZE; } -int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb) +static int __tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, + int (*skb_handler)(struct sk_buff *skb, + struct netlink_callback *cb, + struct tipc_sock *tsk)) { - int err; - struct tipc_sock *tsk; - const struct bucket_table *tbl; - struct rhash_head *pos; struct net *net = sock_net(skb->sk); - struct tipc_net *tn = net_generic(net, tipc_net_id); - u32 tbl_id = cb->args[0]; + struct tipc_net *tn = tipc_net(net); + const struct bucket_table *tbl; u32 prev_portid = cb->args[1]; + u32 tbl_id = cb->args[0]; + struct rhash_head *pos; + struct tipc_sock *tsk; + int err; rcu_read_lock(); tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht); @@ -3226,12 +3237,13 @@ int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; } - err = __tipc_nl_add_sk(skb, cb, tsk); + err = skb_handler(skb, cb, tsk); if (err) { prev_portid = tsk->portid; spin_unlock_bh(&tsk->sk.sk_lock.slock); goto out; } + prev_portid = 0; spin_unlock_bh(&tsk->sk.sk_lock.slock); } @@ -3244,6 +3256,11 @@ out: return skb->len; } +int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + return __tipc_nl_sk_walk(skb, cb, __tipc_nl_add_sk); +} + /* Caller should hold socket lock for the passed tipc socket. */ static int __tipc_nl_add_sk_publ(struct sk_buff *skb, struct netlink_callback *cb, -- cgit v1.2.3-70-g09d2 From c30b70deb5f4861f590031c33fd3ec6cc63f1df1 Mon Sep 17 00:00:00 2001 From: GhantaKrishnamurthy MohanKrishna Date: Wed, 21 Mar 2018 14:37:44 +0100 Subject: tipc: implement socket diagnostics for AF_TIPC This commit adds socket diagnostics capability for AF_TIPC in netlink family NETLINK_SOCK_DIAG in a new kernel module (diag.ko). The following are key design considerations: - config TIPC_DIAG has default y, like INET_DIAG. - only requests with flag NLM_F_DUMP is supported (dump all). - tipc_sock_diag_req message is introduced to send filter parameters. - the response attributes are of TLV, some nested. To avoid exposing data structures between diag and tipc modules and avoid code duplication, the following additions are required: - export tipc_nl_sk_walk function to reuse socket iterator. - export tipc_sk_fill_sock_diag to fill the tipc diag attributes. - create a sock_diag response message in __tipc_add_sock_diag defined in diag.c and use the above exported tipc_sk_fill_sock_diag to fill response. Acked-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: GhantaKrishnamurthy MohanKrishna Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 18 ++++++ include/uapi/linux/tipc_sockets_diag.h | 17 +++++ net/tipc/Kconfig | 8 +++ net/tipc/Makefile | 5 ++ net/tipc/diag.c | 114 +++++++++++++++++++++++++++++++++ net/tipc/socket.c | 72 +++++++++++++++++++-- net/tipc/socket.h | 10 ++- 7 files changed, 238 insertions(+), 6 deletions(-) create mode 100644 include/uapi/linux/tipc_sockets_diag.h create mode 100644 net/tipc/diag.c (limited to 'net') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 469aa67a5ecb..d7cec0480d70 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -114,6 +114,13 @@ enum { TIPC_NLA_SOCK_REF, /* u32 */ TIPC_NLA_SOCK_CON, /* nest */ TIPC_NLA_SOCK_HAS_PUBL, /* flag */ + TIPC_NLA_SOCK_STAT, /* nest */ + TIPC_NLA_SOCK_TYPE, /* u32 */ + TIPC_NLA_SOCK_INO, /* u32 */ + TIPC_NLA_SOCK_UID, /* u32 */ + TIPC_NLA_SOCK_TIPC_STATE, /* u32 */ + TIPC_NLA_SOCK_COOKIE, /* u64 */ + TIPC_NLA_SOCK_PAD, /* flag */ __TIPC_NLA_SOCK_MAX, TIPC_NLA_SOCK_MAX = __TIPC_NLA_SOCK_MAX - 1 @@ -238,6 +245,17 @@ enum { TIPC_NLA_CON_MAX = __TIPC_NLA_CON_MAX - 1 }; +/* Nest, socket statistics info */ +enum { + TIPC_NLA_SOCK_STAT_RCVQ, /* u32 */ + TIPC_NLA_SOCK_STAT_SENDQ, /* u32 */ + TIPC_NLA_SOCK_STAT_LINK_CONG, /* flag */ + TIPC_NLA_SOCK_STAT_CONN_CONG, /* flag */ + + __TIPC_NLA_SOCK_STAT_MAX, + TIPC_NLA_SOCK_STAT_MAX = __TIPC_NLA_SOCK_STAT_MAX - 1 +}; + /* Nest, link propreties. Valid for link, media and bearer */ enum { TIPC_NLA_PROP_UNSPEC, diff --git a/include/uapi/linux/tipc_sockets_diag.h b/include/uapi/linux/tipc_sockets_diag.h new file mode 100644 index 000000000000..7678cf2f0dcc --- /dev/null +++ b/include/uapi/linux/tipc_sockets_diag.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* AF_TIPC sock_diag interface for querying open sockets */ + +#ifndef _UAPI__TIPC_SOCKETS_DIAG_H__ +#define _UAPI__TIPC_SOCKETS_DIAG_H__ + +#include +#include + +/* Request */ +struct tipc_sock_diag_req { + __u8 sdiag_family; /* must be AF_TIPC */ + __u8 sdiag_protocol; /* must be 0 */ + __u16 pad; /* must be 0 */ + __u32 tidiag_states; /* query*/ +}; +#endif /* _UAPI__TIPC_SOCKETS_DIAG_H__ */ diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig index c25a3a149dc4..e450212121d2 100644 --- a/net/tipc/Kconfig +++ b/net/tipc/Kconfig @@ -34,3 +34,11 @@ config TIPC_MEDIA_UDP Saying Y here will enable support for running TIPC over IP/UDP bool default y + +config TIPC_DIAG + tristate "TIPC: socket monitoring interface" + depends on TIPC + default y + ---help--- + Support for TIPC socket monitoring interface used by ss tool. + If unsure, say Y. diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 1edb7192aa2f..aca168f2abb1 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -14,3 +14,8 @@ tipc-y += addr.o bcast.o bearer.o \ tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o tipc-$(CONFIG_SYSCTL) += sysctl.o + + +obj-$(CONFIG_TIPC_DIAG) += diag.o + +tipc_diag-y := diag.o diff --git a/net/tipc/diag.c b/net/tipc/diag.c new file mode 100644 index 000000000000..46d9cd62f781 --- /dev/null +++ b/net/tipc/diag.c @@ -0,0 +1,114 @@ +/* + * net/tipc/diag.c: TIPC socket diag + * + * Copyright (c) 2018, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "socket.h" +#include +#include + +static u64 __tipc_diag_gen_cookie(struct sock *sk) +{ + u32 res[2]; + + sock_diag_save_cookie(sk, res); + return *((u64 *)res); +} + +static int __tipc_add_sock_diag(struct sk_buff *skb, + struct netlink_callback *cb, + struct tipc_sock *tsk) +{ + struct tipc_sock_diag_req *req = nlmsg_data(cb->nlh); + struct nlmsghdr *nlh; + int err; + + nlh = nlmsg_put_answer(skb, cb, SOCK_DIAG_BY_FAMILY, 0, + NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + err = tipc_sk_fill_sock_diag(skb, tsk, req->tidiag_states, + __tipc_diag_gen_cookie); + if (err) + return err; + + nlmsg_end(skb, nlh); + return 0; +} + +static int tipc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + return tipc_nl_sk_walk(skb, cb, __tipc_add_sock_diag); +} + +static int tipc_sock_diag_handler_dump(struct sk_buff *skb, + struct nlmsghdr *h) +{ + int hdrlen = sizeof(struct tipc_sock_diag_req); + struct net *net = sock_net(skb->sk); + + if (nlmsg_len(h) < hdrlen) + return -EINVAL; + + if (h->nlmsg_flags & NLM_F_DUMP) { + struct netlink_dump_control c = { + .dump = tipc_diag_dump, + }; + netlink_dump_start(net->diag_nlsk, skb, h, &c); + return 0; + } + return -EOPNOTSUPP; +} + +static const struct sock_diag_handler tipc_sock_diag_handler = { + .family = AF_TIPC, + .dump = tipc_sock_diag_handler_dump, +}; + +static int __init tipc_diag_init(void) +{ + return sock_diag_register(&tipc_sock_diag_handler); +} + +static void __exit tipc_diag_exit(void) +{ + sock_diag_unregister(&tipc_sock_diag_handler); +} + +module_init(tipc_diag_init); +module_exit(tipc_diag_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_TIPC); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 35ac0f5b9529..07559ce4b8ba 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3213,10 +3213,10 @@ msg_cancel: return -EMSGSIZE; } -static int __tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, - int (*skb_handler)(struct sk_buff *skb, - struct netlink_callback *cb, - struct tipc_sock *tsk)) +int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, + int (*skb_handler)(struct sk_buff *skb, + struct netlink_callback *cb, + struct tipc_sock *tsk)) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); @@ -3255,10 +3255,72 @@ out: return skb->len; } +EXPORT_SYMBOL(tipc_nl_sk_walk); + +int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk, + u32 sk_filter_state, + u64 (*tipc_diag_gen_cookie)(struct sock *sk)) +{ + struct sock *sk = &tsk->sk; + struct nlattr *attrs; + struct nlattr *stat; + + /*filter response w.r.t sk_state*/ + if (!(sk_filter_state & (1 << sk->sk_state))) + return 0; + + attrs = nla_nest_start(skb, TIPC_NLA_SOCK); + if (!attrs) + goto msg_cancel; + + if (__tipc_nl_add_sk_info(skb, tsk)) + goto attr_msg_cancel; + + if (nla_put_u32(skb, TIPC_NLA_SOCK_TYPE, (u32)sk->sk_type) || + nla_put_u32(skb, TIPC_NLA_SOCK_TIPC_STATE, (u32)sk->sk_state) || + nla_put_u32(skb, TIPC_NLA_SOCK_INO, sock_i_ino(sk)) || + nla_put_u32(skb, TIPC_NLA_SOCK_UID, + from_kuid_munged(sk_user_ns(sk), sock_i_uid(sk))) || + nla_put_u64_64bit(skb, TIPC_NLA_SOCK_COOKIE, + tipc_diag_gen_cookie(sk), + TIPC_NLA_SOCK_PAD)) + goto attr_msg_cancel; + + stat = nla_nest_start(skb, TIPC_NLA_SOCK_STAT); + if (!stat) + goto attr_msg_cancel; + + if (nla_put_u32(skb, TIPC_NLA_SOCK_STAT_RCVQ, + skb_queue_len(&sk->sk_receive_queue)) || + nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ, + skb_queue_len(&sk->sk_write_queue))) + goto stat_msg_cancel; + + if (tsk->cong_link_cnt && + nla_put_flag(skb, TIPC_NLA_SOCK_STAT_LINK_CONG)) + goto stat_msg_cancel; + + if (tsk_conn_cong(tsk) && + nla_put_flag(skb, TIPC_NLA_SOCK_STAT_CONN_CONG)) + goto stat_msg_cancel; + + nla_nest_end(skb, stat); + nla_nest_end(skb, attrs); + + return 0; + +stat_msg_cancel: + nla_nest_cancel(skb, stat); +attr_msg_cancel: + nla_nest_cancel(skb, attrs); +msg_cancel: + return -EMSGSIZE; +} +EXPORT_SYMBOL(tipc_sk_fill_sock_diag); int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb) { - return __tipc_nl_sk_walk(skb, cb, __tipc_nl_add_sk); + return tipc_nl_sk_walk(skb, cb, __tipc_nl_add_sk); } /* Caller should hold socket lock for the passed tipc socket. */ diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 06fb5944cf76..aae3fd4cd06c 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -49,6 +49,8 @@ #define RCVBUF_DEF (FLOWCTL_BLK_SZ * 1024 * 2) #define RCVBUF_MAX (FLOWCTL_BLK_SZ * 1024 * 16) +struct tipc_sock; + int tipc_socket_init(void); void tipc_socket_stop(void); void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq); @@ -59,5 +61,11 @@ int tipc_sk_rht_init(struct net *net); void tipc_sk_rht_destroy(struct net *net); int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb); - +int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk, + u32 sk_filter_state, + u64 (*tipc_diag_gen_cookie)(struct sock *sk)); +int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, + int (*skb_handler)(struct sk_buff *skb, + struct netlink_callback *cb, + struct tipc_sock *tsk)); #endif -- cgit v1.2.3-70-g09d2 From 872619d8cf810c17279335ef531a2a34f3b4e589 Mon Sep 17 00:00:00 2001 From: GhantaKrishnamurthy MohanKrishna Date: Wed, 21 Mar 2018 14:37:45 +0100 Subject: tipc: step sk->sk_drops when rcv buffer is full Currently when tipc is unable to queue a received message on a socket, the message is rejected back to the sender with error TIPC_ERR_OVERLOAD. However, the application on this socket has no knowledge about these discards. In this commit, we try to step the sk_drops counter when tipc is unable to queue a received message. Export sk_drops using tipc socket diagnostics. Acked-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: GhantaKrishnamurthy MohanKrishna Signed-off-by: Parthasarathy Bhuvaragan Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 1 + net/tipc/socket.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index d7cec0480d70..d896ded51bcb 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -251,6 +251,7 @@ enum { TIPC_NLA_SOCK_STAT_SENDQ, /* u32 */ TIPC_NLA_SOCK_STAT_LINK_CONG, /* flag */ TIPC_NLA_SOCK_STAT_CONN_CONG, /* flag */ + TIPC_NLA_SOCK_STAT_DROP, /* u32 */ __TIPC_NLA_SOCK_STAT_MAX, TIPC_NLA_SOCK_STAT_MAX = __TIPC_NLA_SOCK_STAT_MAX - 1 diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 07559ce4b8ba..732ec894f69f 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2122,8 +2122,10 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, (!sk_conn && msg_connected(hdr)) || (!grp && msg_in_group(hdr))) err = TIPC_ERR_NO_PORT; - else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) + else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) { + atomic_inc(&sk->sk_drops); err = TIPC_ERR_OVERLOAD; + } if (unlikely(err)) { tipc_skb_reject(net, err, skb, xmitq); @@ -2202,6 +2204,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, /* Overload => reject message back to sender */ onode = tipc_own_addr(sock_net(sk)); + atomic_inc(&sk->sk_drops); if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) __skb_queue_tail(xmitq, skb); break; @@ -3293,7 +3296,9 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk, if (nla_put_u32(skb, TIPC_NLA_SOCK_STAT_RCVQ, skb_queue_len(&sk->sk_receive_queue)) || nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ, - skb_queue_len(&sk->sk_write_queue))) + skb_queue_len(&sk->sk_write_queue)) || + nla_put_u32(skb, TIPC_NLA_SOCK_STAT_DROP, + atomic_read(&sk->sk_drops))) goto stat_msg_cancel; if (tsk->cong_link_cnt && -- cgit v1.2.3-70-g09d2 From 1574639411b3ce311b846cae7fda56bf1af7d027 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 21 Mar 2018 19:34:58 +0000 Subject: gre: fix TUNNEL_SEQ bit check on sequence numbering The current logic of flags | TUNNEL_SEQ is always non-zero and hence sequence numbers are always incremented no matter the setting of the TUNNEL_SEQ bit. Fix this by using & instead of |. Detected by CoverityScan, CID#1466039 ("Operands don't affect result") Fixes: 77a5196a804e ("gre: add sequence number for collect md mode.") Signed-off-by: Colin Ian King Acked-by: William Tu Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 2 +- net/ipv6/ip6_gre.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2fa2ef2e2af9..9ab1aa2f7660 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -550,7 +550,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ); gre_build_header(skb, tunnel_hlen, flags, proto, tunnel_id_to_key32(tun_info->key.tun_id), - (flags | TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); + (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 7d8775c9570d..6adbcf40cf8c 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -724,7 +724,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb, gre_build_header(skb, tunnel->tun_hlen, flags, protocol, tunnel_id_to_key32(tun_info->key.tun_id), - (flags | TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) + (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0); } else { -- cgit v1.2.3-70-g09d2 From 76d3e153d0d16032ab68e3b698c4df05acef48c5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 22 Mar 2018 12:45:02 +0300 Subject: net: Revert "ipv4: get rid of ip_ra_lock" This reverts commit ba3f571d5dde. The commit was made after 1215e51edad1 "ipv4: fix a deadlock in ip_ra_control", and killed ip_ra_lock, which became useless after rtnl_lock() made used to destroy every raw ipv4 socket. This scales very bad, and next patch in series reverts 1215e51edad1. ip_ra_lock will be used again. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 74c962b9b09c..be7c3b71914d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -334,6 +334,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, sent to multicast group to reach destination designated router. */ struct ip_ra_chain __rcu *ip_ra_chain; +static DEFINE_SPINLOCK(ip_ra_lock); static void ip_ra_destroy_rcu(struct rcu_head *head) @@ -355,17 +356,21 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + spin_lock_bh(&ip_ra_lock); for (rap = &ip_ra_chain; - (ra = rtnl_dereference(*rap)) != NULL; + (ra = rcu_dereference_protected(*rap, + lockdep_is_held(&ip_ra_lock))) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (on) { + spin_unlock_bh(&ip_ra_lock); kfree(new_ra); return -EADDRINUSE; } /* dont let ip_call_ra_chain() use sk again */ ra->sk = NULL; RCU_INIT_POINTER(*rap, ra->next); + spin_unlock_bh(&ip_ra_lock); if (ra->destructor) ra->destructor(sk); @@ -379,14 +384,17 @@ int ip_ra_control(struct sock *sk, unsigned char on, return 0; } } - if (!new_ra) + if (!new_ra) { + spin_unlock_bh(&ip_ra_lock); return -ENOBUFS; + } new_ra->sk = sk; new_ra->destructor = destructor; RCU_INIT_POINTER(new_ra->next, ra); rcu_assign_pointer(*rap, new_ra); sock_hold(sk); + spin_unlock_bh(&ip_ra_lock); return 0; } -- cgit v1.2.3-70-g09d2 From 0526947f9dd019da8d550ed20177585a84b3460e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 22 Mar 2018 12:45:12 +0300 Subject: net: Move IP_ROUTER_ALERT out of lock_sock(sk) ip_ra_control() does not need sk_lock. Who are the another users of ip_ra_chain? ip_mroute_setsockopt() doesn't take sk_lock, while parallel IP_ROUTER_ALERT syscalls are synchronized by ip_ra_lock. So, we may move this command out of sk_lock. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index be7c3b71914d..dcbf6afe27e7 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -647,6 +647,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, /* If optlen==0, it is equivalent to val == 0 */ + if (optname == IP_ROUTER_ALERT) + return ip_ra_control(sk, val ? 1 : 0, NULL); if (ip_mroute_opt(optname)) return ip_mroute_setsockopt(sk, optname, optval, optlen); @@ -1157,9 +1159,6 @@ mc_msf_out: goto e_inval; inet->mc_all = val; break; - case IP_ROUTER_ALERT: - err = ip_ra_control(sk, val ? 1 : 0, NULL); - break; case IP_FREEBIND: if (optlen < 1) -- cgit v1.2.3-70-g09d2 From 128aaa98ad1422eacc4c74879840cb3e579cd934 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 22 Mar 2018 12:45:22 +0300 Subject: net: Revert "ipv4: fix a deadlock in ip_ra_control" This reverts commit 1215e51edad1. Since raw_close() is used on every RAW socket destruction, the changes made by 1215e51edad1 scale sadly. This clearly seen on endless unshare(CLONE_NEWNET) test, and cleanup_net() kwork spends a lot of time waiting for rtnl_lock() introduced by this commit. Previous patch moved IP_ROUTER_ALERT out of rtnl_lock(), so we revert this patch. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 1 - net/ipv4/ipmr.c | 11 +++++++++-- net/ipv4/raw.c | 2 -- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index dcbf6afe27e7..bf5f44b27b7e 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -594,7 +594,6 @@ static bool setsockopt_needs_rtnl(int optname) case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_UNBLOCK_SOURCE: - case IP_ROUTER_ALERT: return true; } return false; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d752a70855d8..f6be5db16da2 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1399,7 +1399,7 @@ static void mrtsock_destruct(struct sock *sk) struct net *net = sock_net(sk); struct mr_table *mrt; - ASSERT_RTNL(); + rtnl_lock(); ipmr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; @@ -1411,6 +1411,7 @@ static void mrtsock_destruct(struct sock *sk) mroute_clean_tables(mrt, false); } } + rtnl_unlock(); } /* Socket options and virtual interface manipulation. The whole @@ -1475,8 +1476,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, if (sk != rcu_access_pointer(mrt->mroute_sk)) { ret = -EACCES; } else { + /* We need to unlock here because mrtsock_destruct takes + * care of rtnl itself and we can't change that due to + * the IP_ROUTER_ALERT setsockopt which runs without it. + */ + rtnl_unlock(); ret = ip_ra_control(sk, 0, NULL); - goto out_unlock; + goto out; } break; case MRT_ADD_VIF: @@ -1588,6 +1594,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, } out_unlock: rtnl_unlock(); +out: return ret; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 54648d20bf0f..720bef7da2f6 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -711,9 +711,7 @@ static void raw_close(struct sock *sk, long timeout) /* * Raw sockets may have direct kernel references. Kill them. */ - rtnl_lock(); ip_ra_control(sk, 0, NULL); - rtnl_unlock(); sk_common_release(sk); } -- cgit v1.2.3-70-g09d2 From 5796ef75ec7b6019eac88f66751d663d537a5cd3 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 22 Mar 2018 12:45:32 +0300 Subject: net: Make ip_ra_chain per struct net This is optimization, which makes ip_call_ra_chain() iterate less sockets to find the sockets it's looking for. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/net/ip.h | 13 +++++++++++-- include/net/netns/ipv4.h | 1 + net/ipv4/ip_input.c | 5 ++--- net/ipv4/ip_sockglue.c | 15 ++------------- 4 files changed, 16 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index fe63ba95d12b..d53b5a9eae34 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -91,6 +91,17 @@ static inline int inet_sdif(struct sk_buff *skb) return 0; } +/* Special input handler for packets caught by router alert option. + They are selected only by protocol field, and then processed likely + local ones; but only if someone wants them! Otherwise, router + not running rsvpd will kill RSVP. + + It is user level problem, what it will make with them. + I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), + but receiver should be enough clever f.e. to forward mtrace requests, + sent to multicast group to reach destination designated router. + */ + struct ip_ra_chain { struct ip_ra_chain __rcu *next; struct sock *sk; @@ -101,8 +112,6 @@ struct ip_ra_chain { struct rcu_head rcu; }; -extern struct ip_ra_chain __rcu *ip_ra_chain; - /* IP flags. */ #define IP_CE 0x8000 /* Flag: "Congestion" */ #define IP_DF 0x4000 /* Flag: "Don't Fragment" */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 382bfd7583cf..97d7ee6667c7 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -49,6 +49,7 @@ struct netns_ipv4 { #endif struct ipv4_devconf *devconf_all; struct ipv4_devconf *devconf_dflt; + struct ip_ra_chain __rcu *ra_chain; #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; bool fib_has_custom_rules; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 57fc13c6ab2b..7582713dd18f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -159,7 +159,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) struct net_device *dev = skb->dev; struct net *net = dev_net(dev); - for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { + for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report @@ -167,8 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb) */ if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || - sk->sk_bound_dev_if == dev->ifindex) && - net_eq(sock_net(sk), net)) { + sk->sk_bound_dev_if == dev->ifindex)) { if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index bf5f44b27b7e..f36d35fe924b 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -322,18 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, return 0; } - -/* Special input handler for packets caught by router alert option. - They are selected only by protocol field, and then processed likely - local ones; but only if someone wants them! Otherwise, router - not running rsvpd will kill RSVP. - - It is user level problem, what it will make with them. - I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), - but receiver should be enough clever f.e. to forward mtrace requests, - sent to multicast group to reach destination designated router. - */ -struct ip_ra_chain __rcu *ip_ra_chain; static DEFINE_SPINLOCK(ip_ra_lock); @@ -350,6 +338,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, { struct ip_ra_chain *ra, *new_ra; struct ip_ra_chain __rcu **rap; + struct net *net = sock_net(sk); if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) return -EINVAL; @@ -357,7 +346,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; spin_lock_bh(&ip_ra_lock); - for (rap = &ip_ra_chain; + for (rap = &net->ipv4.ra_chain; (ra = rcu_dereference_protected(*rap, lockdep_is_held(&ip_ra_lock))) != NULL; rap = &ra->next) { -- cgit v1.2.3-70-g09d2 From d9ff3049739e349b5380b96226f9ad766741773d Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 22 Mar 2018 12:45:40 +0300 Subject: net: Replace ip_ra_lock with per-net mutex Since ra_chain is per-net, we may use per-net mutexes to protect them in ip_ra_control(). This improves scalability. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + net/core/net_namespace.c | 1 + net/ipv4/ip_sockglue.c | 15 ++++++--------- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 97d7ee6667c7..8491bc9c86b1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -50,6 +50,7 @@ struct netns_ipv4 { struct ipv4_devconf *devconf_all; struct ipv4_devconf *devconf_dflt; struct ip_ra_chain __rcu *ra_chain; + struct mutex ra_mutex; #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; bool fib_has_custom_rules; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index c340d5cfbdec..95ba2c53bd9a 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -301,6 +301,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) net->user_ns = user_ns; idr_init(&net->netns_ids); spin_lock_init(&net->nsid_lock); + mutex_init(&net->ipv4.ra_mutex); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f36d35fe924b..5ad2d8ed3a3f 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -322,9 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, return 0; } -static DEFINE_SPINLOCK(ip_ra_lock); - - static void ip_ra_destroy_rcu(struct rcu_head *head) { struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu); @@ -345,21 +342,21 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; - spin_lock_bh(&ip_ra_lock); + mutex_lock(&net->ipv4.ra_mutex); for (rap = &net->ipv4.ra_chain; (ra = rcu_dereference_protected(*rap, - lockdep_is_held(&ip_ra_lock))) != NULL; + lockdep_is_held(&net->ipv4.ra_mutex))) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (on) { - spin_unlock_bh(&ip_ra_lock); + mutex_unlock(&net->ipv4.ra_mutex); kfree(new_ra); return -EADDRINUSE; } /* dont let ip_call_ra_chain() use sk again */ ra->sk = NULL; RCU_INIT_POINTER(*rap, ra->next); - spin_unlock_bh(&ip_ra_lock); + mutex_unlock(&net->ipv4.ra_mutex); if (ra->destructor) ra->destructor(sk); @@ -374,7 +371,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, } } if (!new_ra) { - spin_unlock_bh(&ip_ra_lock); + mutex_unlock(&net->ipv4.ra_mutex); return -ENOBUFS; } new_ra->sk = sk; @@ -383,7 +380,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, RCU_INIT_POINTER(new_ra->next, ra); rcu_assign_pointer(*rap, new_ra); sock_hold(sk); - spin_unlock_bh(&ip_ra_lock); + mutex_unlock(&net->ipv4.ra_mutex); return 0; } -- cgit v1.2.3-70-g09d2 From dcbe73ca55a42712bfd0e9966cd2d5a48355ace3 Mon Sep 17 00:00:00 2001 From: Pradeep Kumar Chitrapu Date: Thu, 22 Mar 2018 12:18:03 -0700 Subject: mac80211: notify driver for change in multicast rates With drivers implementing rate control in driver or firmware rate_control_send_low() may not get called, and thus the driver needs to know about changes in the multicast rate. Add and use a new BSS change flag for this. Signed-off-by: Pradeep Kumar Chitrapu [rewrite commit message] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 3 +++ net/mac80211/cfg.c | 2 ++ net/mac80211/ibss.c | 2 +- net/mac80211/mesh.c | 3 ++- net/mac80211/util.c | 3 ++- 5 files changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 2fd59ed3be00..d39fd6838f41 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -302,6 +302,8 @@ struct ieee80211_vif_chanctx_switch { * @BSS_CHANGED_MU_GROUPS: VHT MU-MIMO group id or user position changed * @BSS_CHANGED_KEEP_ALIVE: keep alive options (idle period or protected * keep alive) changed. + * @BSS_CHANGED_MCAST_RATE: Multicast Rate setting changed for this interface + * */ enum ieee80211_bss_change { BSS_CHANGED_ASSOC = 1<<0, @@ -329,6 +331,7 @@ enum ieee80211_bss_change { BSS_CHANGED_OCB = 1<<22, BSS_CHANGED_MU_GROUPS = 1<<23, BSS_CHANGED_KEEP_ALIVE = 1<<24, + BSS_CHANGED_MCAST_RATE = 1<<25, /* when adding here, make sure to change ieee80211_reconfig */ }; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index fd68f6fb02d7..5c4b105ca398 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2313,6 +2313,8 @@ static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev, memcpy(sdata->vif.bss_conf.mcast_rate, rate, sizeof(int) * NUM_NL80211_BANDS); + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MCAST_RATE); + return 0; } diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index db07e0de9a03..dc582aa35c89 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1839,7 +1839,7 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED | IEEE80211_HT_PARAM_RIFS_MODE; - changed |= BSS_CHANGED_HT; + changed |= BSS_CHANGED_HT | BSS_CHANGED_MCAST_RATE; ieee80211_bss_info_change_notify(sdata, changed); sdata->smps_mode = IEEE80211_SMPS_OFF; diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 6a381cbe1e33..d51da26e9c18 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -880,7 +880,8 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata) BSS_CHANGED_BEACON_ENABLED | BSS_CHANGED_HT | BSS_CHANGED_BASIC_RATES | - BSS_CHANGED_BEACON_INT; + BSS_CHANGED_BEACON_INT | + BSS_CHANGED_MCAST_RATE; local->fif_other_bss++; /* mesh ifaces must set allmulti to forward mcast traffic */ diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 1f82191ce601..55cd2922627a 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1968,7 +1968,8 @@ int ieee80211_reconfig(struct ieee80211_local *local) BSS_CHANGED_CQM | BSS_CHANGED_QOS | BSS_CHANGED_IDLE | - BSS_CHANGED_TXPOWER; + BSS_CHANGED_TXPOWER | + BSS_CHANGED_MCAST_RATE; if (sdata->vif.mu_mimo_owner) changed |= BSS_CHANGED_MU_GROUPS; -- cgit v1.2.3-70-g09d2 From 419d14af9e07fb5ca32b1b1614793c6b1e242152 Mon Sep 17 00:00:00 2001 From: Chas Williams <3chas3@gmail.com> Date: Thu, 22 Mar 2018 11:34:06 -0400 Subject: bridge: Allow max MTU when multiple VLANs present If the bridge is allowing multiple VLANs, some VLANs may have different MTUs. Instead of choosing the minimum MTU for the bridge interface, choose the maximum MTU of the bridge members. With this the user only needs to set a larger MTU on the member ports that are participating in the large MTU VLANS. Signed-off-by: Chas Williams <3chas3@gmail.com> Reviewed-by: Nikolay Aleksandrov Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/bridge/br.c | 2 +- net/bridge/br_device.c | 2 +- net/bridge/br_if.c | 26 ++++++++++++++++++++++---- net/bridge/br_private.h | 2 +- 4 files changed, 25 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index 7770481a6506..a3f95ab9d6a3 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -52,7 +52,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v switch (event) { case NETDEV_CHANGEMTU: - dev_set_mtu(br->dev, br_min_mtu(br)); + dev_set_mtu(br->dev, br_mtu(br)); break; case NETDEV_CHANGEADDR: diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 1285ca30ab0a..278fc999d355 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -224,7 +224,7 @@ static void br_get_stats64(struct net_device *dev, static int br_change_mtu(struct net_device *dev, int new_mtu) { struct net_bridge *br = netdev_priv(dev); - if (new_mtu > br_min_mtu(br)) + if (new_mtu > br_mtu(br)) return -EINVAL; dev->mtu = new_mtu; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 9ba4ed65c52b..48dc4d2e2be3 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -424,8 +424,18 @@ int br_del_bridge(struct net *net, const char *name) return ret; } +static bool min_mtu(int a, int b) +{ + return a < b ? 1 : 0; +} + +static bool max_mtu(int a, int b) +{ + return a > b ? 1 : 0; +} + /* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */ -int br_min_mtu(const struct net_bridge *br) +static int __br_mtu(const struct net_bridge *br, bool (compare_fn)(int, int)) { const struct net_bridge_port *p; int mtu = 0; @@ -436,13 +446,21 @@ int br_min_mtu(const struct net_bridge *br) mtu = ETH_DATA_LEN; else { list_for_each_entry(p, &br->port_list, list) { - if (!mtu || p->dev->mtu < mtu) + if (!mtu || compare_fn(p->dev->mtu, mtu)) mtu = p->dev->mtu; } } return mtu; } +int br_mtu(const struct net_bridge *br) +{ + if (br->vlan_enabled) + return __br_mtu(br, max_mtu); + else + return __br_mtu(br, min_mtu); +} + static void br_set_gso_limits(struct net_bridge *br) { unsigned int gso_max_size = GSO_MAX_SIZE; @@ -594,7 +612,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); - dev_set_mtu(br->dev, br_min_mtu(br)); + dev_set_mtu(br->dev, br_mtu(br)); br_set_gso_limits(br); kobject_uevent(&p->kobj, KOBJ_ADD); @@ -641,7 +659,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) */ del_nbp(p); - dev_set_mtu(br->dev, br_min_mtu(br)); + dev_set_mtu(br->dev, br_mtu(br)); br_set_gso_limits(br); spin_lock_bh(&br->lock); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 8e13a64d8c99..048d5b51813b 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -578,7 +578,7 @@ int br_del_bridge(struct net *net, const char *name); int br_add_if(struct net_bridge *br, struct net_device *dev, struct netlink_ext_ack *extack); int br_del_if(struct net_bridge *br, struct net_device *dev); -int br_min_mtu(const struct net_bridge *br); +int br_mtu(const struct net_bridge *br); netdev_features_t br_features_recompute(struct net_bridge *br, netdev_features_t features); void br_port_flags_change(struct net_bridge_port *port, unsigned long mask); -- cgit v1.2.3-70-g09d2 From 69ca9293e8dd9323c6cde579e1855d6ce9489a46 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 22 Mar 2018 10:09:53 -0700 Subject: tls: Generalize zerocopy_from_iter Refactor zerocopy_from_iter to take arguments for pages and size, such that it can be used for both tx and rx. RX will also support zerocopy direct to output iter, as long as the full message can be copied at once (a large enough userspace buffer was provided). Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 057a558ed6d7..ca1d20de3d2c 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -226,23 +226,24 @@ static int tls_sw_push_pending_record(struct sock *sk, int flags) } static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, - int length) + int length, int *pages_used, + unsigned int *size_used, + struct scatterlist *to, int to_max_pages, + bool charge) { - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); struct page *pages[MAX_SKB_FRAGS]; size_t offset; ssize_t copied, use; int i = 0; - unsigned int size = ctx->sg_plaintext_size; - int num_elem = ctx->sg_plaintext_num_elem; + unsigned int size = *size_used; + int num_elem = *pages_used; int rc = 0; int maxpages; while (length > 0) { i = 0; - maxpages = ARRAY_SIZE(ctx->sg_plaintext_data) - num_elem; + maxpages = to_max_pages - num_elem; if (maxpages == 0) { rc = -EFAULT; goto out; @@ -262,10 +263,11 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, while (copied) { use = min_t(int, copied, PAGE_SIZE - offset); - sg_set_page(&ctx->sg_plaintext_data[num_elem], + sg_set_page(&to[num_elem], pages[i], use, offset); - sg_unmark_end(&ctx->sg_plaintext_data[num_elem]); - sk_mem_charge(sk, use); + sg_unmark_end(&to[num_elem]); + if (charge) + sk_mem_charge(sk, use); offset = 0; copied -= use; @@ -276,8 +278,9 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from, } out: - ctx->sg_plaintext_size = size; - ctx->sg_plaintext_num_elem = num_elem; + *size_used = size; + *pages_used = num_elem; + return rc; } @@ -374,7 +377,11 @@ alloc_encrypted: if (full_record || eor) { ret = zerocopy_from_iter(sk, &msg->msg_iter, - try_to_copy); + try_to_copy, &ctx->sg_plaintext_num_elem, + &ctx->sg_plaintext_size, + ctx->sg_plaintext_data, + ARRAY_SIZE(ctx->sg_plaintext_data), + true); if (ret) goto fallback_to_reg_send; -- cgit v1.2.3-70-g09d2 From dbe425599ba05c7415f632e6f5f018453098eb69 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 22 Mar 2018 10:10:06 -0700 Subject: tls: Move cipher info to a separate struct Separate tx crypto parameters to a separate cipher_context struct. The same parameters will be used for rx using the same struct. tls_advance_record_sn is modified to only take the cipher info. Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 26 +++++++++++++----------- net/tls/tls_main.c | 8 ++++---- net/tls/tls_sw.c | 58 ++++++++++++++++++++++++++++-------------------------- 3 files changed, 49 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 4913430ab807..019e52db1817 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -81,6 +81,16 @@ enum { TLS_PENDING_CLOSED_RECORD }; +struct cipher_context { + u16 prepend_size; + u16 tag_size; + u16 overhead_size; + u16 iv_size; + char *iv; + u16 rec_seq_size; + char *rec_seq; +}; + struct tls_context { union { struct tls_crypto_info crypto_send; @@ -91,13 +101,7 @@ struct tls_context { u8 tx_conf:2; - u16 prepend_size; - u16 tag_size; - u16 overhead_size; - u16 iv_size; - char *iv; - u16 rec_seq_size; - char *rec_seq; + struct cipher_context tx; struct scatterlist *partially_sent_record; u16 partially_sent_offset; @@ -190,7 +194,7 @@ static inline bool tls_bigint_increment(unsigned char *seq, int len) } static inline void tls_advance_record_sn(struct sock *sk, - struct tls_context *ctx) + struct cipher_context *ctx) { if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size)) tls_err_abort(sk); @@ -203,9 +207,9 @@ static inline void tls_fill_prepend(struct tls_context *ctx, size_t plaintext_len, unsigned char record_type) { - size_t pkt_len, iv_size = ctx->iv_size; + size_t pkt_len, iv_size = ctx->tx.iv_size; - pkt_len = plaintext_len + iv_size + ctx->tag_size; + pkt_len = plaintext_len + iv_size + ctx->tx.tag_size; /* we cover nonce explicit here as well, so buf should be of * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE @@ -217,7 +221,7 @@ static inline void tls_fill_prepend(struct tls_context *ctx, buf[3] = pkt_len >> 8; buf[4] = pkt_len & 0xFF; memcpy(buf + TLS_NONCE_OFFSET, - ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size); + ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size); } static inline void tls_make_aad(char *buf, diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index d824d548447e..c671560b832b 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -259,8 +259,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) } } - kfree(ctx->rec_seq); - kfree(ctx->iv); + kfree(ctx->tx.rec_seq); + kfree(ctx->tx.iv); if (ctx->tx_conf == TLS_SW_TX) tls_sw_free_tx_resources(sk); @@ -319,9 +319,9 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, } lock_sock(sk); memcpy(crypto_info_aes_gcm_128->iv, - ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, TLS_CIPHER_AES_GCM_128_IV_SIZE); - memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->rec_seq, + memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->tx.rec_seq, TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE); release_sock(sk); if (copy_to_user(optval, diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index ca1d20de3d2c..338d743bcc21 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -79,7 +79,7 @@ static void trim_both_sgl(struct sock *sk, int target_size) target_size); if (target_size > 0) - target_size += tls_ctx->overhead_size; + target_size += tls_ctx->tx.overhead_size; trim_sg(sk, ctx->sg_encrypted_data, &ctx->sg_encrypted_num_elem, @@ -152,21 +152,21 @@ static int tls_do_encryption(struct tls_context *tls_ctx, if (!aead_req) return -ENOMEM; - ctx->sg_encrypted_data[0].offset += tls_ctx->prepend_size; - ctx->sg_encrypted_data[0].length -= tls_ctx->prepend_size; + ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size; + ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size; aead_request_set_tfm(aead_req, ctx->aead_send); aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out, - data_len, tls_ctx->iv); + data_len, tls_ctx->tx.iv); aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &ctx->async_wait); rc = crypto_wait_req(crypto_aead_encrypt(aead_req), &ctx->async_wait); - ctx->sg_encrypted_data[0].offset -= tls_ctx->prepend_size; - ctx->sg_encrypted_data[0].length += tls_ctx->prepend_size; + ctx->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size; + ctx->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size; kfree(aead_req); return rc; @@ -183,7 +183,7 @@ static int tls_push_record(struct sock *sk, int flags, sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1); tls_make_aad(ctx->aad_space, ctx->sg_plaintext_size, - tls_ctx->rec_seq, tls_ctx->rec_seq_size, + tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size, record_type); tls_fill_prepend(tls_ctx, @@ -216,7 +216,7 @@ static int tls_push_record(struct sock *sk, int flags, if (rc < 0 && rc != -EAGAIN) tls_err_abort(sk); - tls_advance_record_sn(sk, tls_ctx); + tls_advance_record_sn(sk, &tls_ctx->tx); return rc; } @@ -357,7 +357,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } required_size = ctx->sg_plaintext_size + try_to_copy + - tls_ctx->overhead_size; + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -420,7 +420,7 @@ alloc_plaintext: &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, ctx->sg_plaintext_size + - tls_ctx->overhead_size); + tls_ctx->tx.overhead_size); } ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy); @@ -512,7 +512,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, full_record = true; } required_size = ctx->sg_plaintext_size + copy + - tls_ctx->overhead_size; + tls_ctx->tx.overhead_size; if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -644,24 +644,26 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) goto free_priv; } - ctx->prepend_size = TLS_HEADER_SIZE + nonce_size; - ctx->tag_size = tag_size; - ctx->overhead_size = ctx->prepend_size + ctx->tag_size; - ctx->iv_size = iv_size; - ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, GFP_KERNEL); - if (!ctx->iv) { + ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size; + ctx->tx.tag_size = tag_size; + ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size; + ctx->tx.iv_size = iv_size; + ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + GFP_KERNEL); + if (!ctx->tx.iv) { rc = -ENOMEM; goto free_priv; } - memcpy(ctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); - memcpy(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); - ctx->rec_seq_size = rec_seq_size; - ctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); - if (!ctx->rec_seq) { + memcpy(ctx->tx.iv, gcm_128_info->salt, + TLS_CIPHER_AES_GCM_128_SALT_SIZE); + memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); + ctx->tx.rec_seq_size = rec_seq_size; + ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); + if (!ctx->tx.rec_seq) { rc = -ENOMEM; goto free_iv; } - memcpy(ctx->rec_seq, rec_seq, rec_seq_size); + memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size); sg_init_table(sw_ctx->sg_encrypted_data, ARRAY_SIZE(sw_ctx->sg_encrypted_data)); @@ -697,7 +699,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) if (rc) goto free_aead; - rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tag_size); + rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tx.tag_size); if (!rc) return 0; @@ -705,11 +707,11 @@ free_aead: crypto_free_aead(sw_ctx->aead_send); sw_ctx->aead_send = NULL; free_rec_seq: - kfree(ctx->rec_seq); - ctx->rec_seq = NULL; + kfree(ctx->tx.rec_seq); + ctx->tx.rec_seq = NULL; free_iv: - kfree(ctx->iv); - ctx->iv = NULL; + kfree(ctx->tx.iv); + ctx->tx.iv = NULL; free_priv: kfree(ctx->priv_ctx); ctx->priv_ctx = NULL; -- cgit v1.2.3-70-g09d2 From f4a8e43f1f0abc0e93ed5ee132288ee4142afde1 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 22 Mar 2018 10:10:15 -0700 Subject: tls: Pass error code explicitly to tls_err_abort Pass EBADMSG explicitly to tls_err_abort. Receive path will pass additional codes - EMSGSIZE if framing is larger than max TLS record size, EINVAL if TLS version mismatch. Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 6 +++--- net/tls/tls_sw.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 019e52db1817..6b44875a78e5 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -174,9 +174,9 @@ static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) return tls_ctx->pending_open_record_frags; } -static inline void tls_err_abort(struct sock *sk) +static inline void tls_err_abort(struct sock *sk, int err) { - sk->sk_err = EBADMSG; + sk->sk_err = err; sk->sk_error_report(sk); } @@ -197,7 +197,7 @@ static inline void tls_advance_record_sn(struct sock *sk, struct cipher_context *ctx) { if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size)) - tls_err_abort(sk); + tls_err_abort(sk, EBADMSG); tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, ctx->iv_size); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 338d743bcc21..1c79d9ad1731 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -214,7 +214,7 @@ static int tls_push_record(struct sock *sk, int flags, /* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */ rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags); if (rc < 0 && rc != -EAGAIN) - tls_err_abort(sk); + tls_err_abort(sk, EBADMSG); tls_advance_record_sn(sk, &tls_ctx->tx); return rc; -- cgit v1.2.3-70-g09d2 From 583715853a25b4f2720b847e4fb8e37727299152 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 22 Mar 2018 10:10:26 -0700 Subject: tls: Refactor variable names Several config variables are prefixed with tx, drop the prefix since these will be used for both tx and rx. Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 2 +- net/tls/tls_main.c | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 6b44875a78e5..095b72283861 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -99,7 +99,7 @@ struct tls_context { void *priv_ctx; - u8 tx_conf:2; + u8 conf:2; struct cipher_context tx; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index c671560b832b..c405beeda765 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -52,7 +52,7 @@ enum { }; enum { - TLS_BASE_TX, + TLS_BASE, TLS_SW_TX, TLS_NUM_CONFIG, }; @@ -65,7 +65,7 @@ static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) { int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; - sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf]; + sk->sk_prot = &tls_prots[ip_ver][ctx->conf]; } int wait_on_pending_writer(struct sock *sk, long *timeo) @@ -238,7 +238,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) lock_sock(sk); sk_proto_close = ctx->sk_proto_close; - if (ctx->tx_conf == TLS_BASE_TX) { + if (ctx->conf == TLS_BASE) { kfree(ctx); goto skip_tx_cleanup; } @@ -262,7 +262,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) kfree(ctx->tx.rec_seq); kfree(ctx->tx.iv); - if (ctx->tx_conf == TLS_SW_TX) + if (ctx->conf == TLS_SW_TX) tls_sw_free_tx_resources(sk); skip_tx_cleanup: @@ -371,7 +371,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, struct tls_crypto_info *crypto_info; struct tls_context *ctx = tls_get_ctx(sk); int rc = 0; - int tx_conf; + int conf; if (!optval || (optlen < sizeof(*crypto_info))) { rc = -EINVAL; @@ -418,11 +418,11 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, /* currently SW is default, we will have ethtool in future */ rc = tls_set_sw_offload(sk, ctx); - tx_conf = TLS_SW_TX; + conf = TLS_SW_TX; if (rc) goto err_crypto_info; - ctx->tx_conf = tx_conf; + ctx->conf = conf; update_sk_prot(sk, ctx); ctx->sk_write_space = sk->sk_write_space; sk->sk_write_space = tls_write_space; @@ -465,12 +465,12 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, static void build_protos(struct proto *prot, struct proto *base) { - prot[TLS_BASE_TX] = *base; - prot[TLS_BASE_TX].setsockopt = tls_setsockopt; - prot[TLS_BASE_TX].getsockopt = tls_getsockopt; - prot[TLS_BASE_TX].close = tls_sk_proto_close; + prot[TLS_BASE] = *base; + prot[TLS_BASE].setsockopt = tls_setsockopt; + prot[TLS_BASE].getsockopt = tls_getsockopt; + prot[TLS_BASE].close = tls_sk_proto_close; - prot[TLS_SW_TX] = prot[TLS_BASE_TX]; + prot[TLS_SW_TX] = prot[TLS_BASE]; prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; prot[TLS_SW_TX].sendpage = tls_sw_sendpage; } @@ -513,7 +513,7 @@ static int tls_init(struct sock *sk) mutex_unlock(&tcpv6_prot_mutex); } - ctx->tx_conf = TLS_BASE_TX; + ctx->conf = TLS_BASE; update_sk_prot(sk, ctx); out: return rc; -- cgit v1.2.3-70-g09d2 From c46234ebb4d1eee5e09819f49169e51cfc6eb909 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 22 Mar 2018 10:10:35 -0700 Subject: tls: RX path for ktls Add rx path for tls software implementation. recvmsg, splice_read, and poll implemented. An additional sockopt TLS_RX is added, with the same interface as TLS_TX. Either TLX_RX or TLX_TX may be provided separately, or together (with two different setsockopt calls with appropriate keys). Control messages are passed via CMSG in a similar way to transmit. If no cmsg buffer is passed, then only application data records will be passed to userspace, and EIO is returned for other types of alerts. EBADMSG is passed for decryption errors, and EMSGSIZE is passed for framing too big, and EBADMSG for framing too small (matching openssl semantics). EINVAL is returned for TLS versions that do not match the original setsockopt call. All are unrecoverable. strparser is used to parse TLS framing. Decryption is done directly in to userspace buffers if they are large enough to support it, otherwise sk_cow_data is called (similar to ipsec), and buffers are decrypted in place and copied. splice_read always decrypts in place, since no buffers are provided to decrypt in to. sk_poll is overridden, and only returns POLLIN if a full TLS message is received. Otherwise we wait for strparser to finish reading a full frame. Actual decryption is only done during recvmsg or splice_read calls. Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 27 ++- include/uapi/linux/tls.h | 2 + net/tls/Kconfig | 1 + net/tls/tls_main.c | 62 ++++- net/tls/tls_sw.c | 587 ++++++++++++++++++++++++++++++++++++++++++----- 5 files changed, 609 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 095b72283861..437a746300bf 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -40,6 +40,7 @@ #include #include #include +#include #include @@ -58,8 +59,18 @@ struct tls_sw_context { struct crypto_aead *aead_send; + struct crypto_aead *aead_recv; struct crypto_wait async_wait; + /* Receive context */ + struct strparser strp; + void (*saved_data_ready)(struct sock *sk); + unsigned int (*sk_poll)(struct file *file, struct socket *sock, + struct poll_table_struct *wait); + struct sk_buff *recv_pkt; + u8 control; + bool decrypted; + /* Sending context */ char aad_space[TLS_AAD_SPACE_SIZE]; @@ -96,12 +107,17 @@ struct tls_context { struct tls_crypto_info crypto_send; struct tls12_crypto_info_aes_gcm_128 crypto_send_aes_gcm_128; }; + union { + struct tls_crypto_info crypto_recv; + struct tls12_crypto_info_aes_gcm_128 crypto_recv_aes_gcm_128; + }; void *priv_ctx; u8 conf:2; struct cipher_context tx; + struct cipher_context rx; struct scatterlist *partially_sent_record; u16 partially_sent_offset; @@ -128,12 +144,19 @@ int tls_sk_attach(struct sock *sk, int optname, char __user *optval, unsigned int optlen); -int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx); +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_close(struct sock *sk, long timeout); -void tls_sw_free_tx_resources(struct sock *sk); +void tls_sw_free_resources(struct sock *sk); +int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len); +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait); +ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); void tls_icsk_clean_acked(struct sock *sk); diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index 293b2cdad88d..c6633e97eca4 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -38,6 +38,7 @@ /* TLS socket options */ #define TLS_TX 1 /* Set transmit parameters */ +#define TLS_RX 2 /* Set receive parameters */ /* Supported versions */ #define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) @@ -59,6 +60,7 @@ #define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE 8 #define TLS_SET_RECORD_TYPE 1 +#define TLS_GET_RECORD_TYPE 2 struct tls_crypto_info { __u16 version; diff --git a/net/tls/Kconfig b/net/tls/Kconfig index eb583038c67e..89b8745a986f 100644 --- a/net/tls/Kconfig +++ b/net/tls/Kconfig @@ -7,6 +7,7 @@ config TLS select CRYPTO select CRYPTO_AES select CRYPTO_GCM + select STREAM_PARSER default n ---help--- Enable kernel support for TLS protocol. This allows symmetric diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index c405beeda765..6f5c1146da4a 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -54,12 +54,15 @@ enum { enum { TLS_BASE, TLS_SW_TX, + TLS_SW_RX, + TLS_SW_RXTX, TLS_NUM_CONFIG, }; static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG]; +static struct proto_ops tls_sw_proto_ops; static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx) { @@ -261,9 +264,14 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) kfree(ctx->tx.rec_seq); kfree(ctx->tx.iv); + kfree(ctx->rx.rec_seq); + kfree(ctx->rx.iv); - if (ctx->conf == TLS_SW_TX) - tls_sw_free_tx_resources(sk); + if (ctx->conf == TLS_SW_TX || + ctx->conf == TLS_SW_RX || + ctx->conf == TLS_SW_RXTX) { + tls_sw_free_resources(sk); + } skip_tx_cleanup: release_sock(sk); @@ -365,8 +373,8 @@ static int tls_getsockopt(struct sock *sk, int level, int optname, return do_tls_getsockopt(sk, optname, optval, optlen); } -static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, - unsigned int optlen) +static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, + unsigned int optlen, int tx) { struct tls_crypto_info *crypto_info; struct tls_context *ctx = tls_get_ctx(sk); @@ -378,7 +386,11 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, goto out; } - crypto_info = &ctx->crypto_send; + if (tx) + crypto_info = &ctx->crypto_send; + else + crypto_info = &ctx->crypto_recv; + /* Currently we don't support set crypto info more than one time */ if (TLS_CRYPTO_INFO_READY(crypto_info)) { rc = -EBUSY; @@ -417,15 +429,31 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval, } /* currently SW is default, we will have ethtool in future */ - rc = tls_set_sw_offload(sk, ctx); - conf = TLS_SW_TX; + if (tx) { + rc = tls_set_sw_offload(sk, ctx, 1); + if (ctx->conf == TLS_SW_RX) + conf = TLS_SW_RXTX; + else + conf = TLS_SW_TX; + } else { + rc = tls_set_sw_offload(sk, ctx, 0); + if (ctx->conf == TLS_SW_TX) + conf = TLS_SW_RXTX; + else + conf = TLS_SW_RX; + } + if (rc) goto err_crypto_info; ctx->conf = conf; update_sk_prot(sk, ctx); - ctx->sk_write_space = sk->sk_write_space; - sk->sk_write_space = tls_write_space; + if (tx) { + ctx->sk_write_space = sk->sk_write_space; + sk->sk_write_space = tls_write_space; + } else { + sk->sk_socket->ops = &tls_sw_proto_ops; + } goto out; err_crypto_info: @@ -441,8 +469,10 @@ static int do_tls_setsockopt(struct sock *sk, int optname, switch (optname) { case TLS_TX: + case TLS_RX: lock_sock(sk); - rc = do_tls_setsockopt_tx(sk, optval, optlen); + rc = do_tls_setsockopt_conf(sk, optval, optlen, + optname == TLS_TX); release_sock(sk); break; default: @@ -473,6 +503,14 @@ static void build_protos(struct proto *prot, struct proto *base) prot[TLS_SW_TX] = prot[TLS_BASE]; prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg; prot[TLS_SW_TX].sendpage = tls_sw_sendpage; + + prot[TLS_SW_RX] = prot[TLS_BASE]; + prot[TLS_SW_RX].recvmsg = tls_sw_recvmsg; + prot[TLS_SW_RX].close = tls_sk_proto_close; + + prot[TLS_SW_RXTX] = prot[TLS_SW_TX]; + prot[TLS_SW_RXTX].recvmsg = tls_sw_recvmsg; + prot[TLS_SW_RXTX].close = tls_sk_proto_close; } static int tls_init(struct sock *sk) @@ -531,6 +569,10 @@ static int __init tls_register(void) { build_protos(tls_prots[TLSV4], &tcp_prot); + tls_sw_proto_ops = inet_stream_ops; + tls_sw_proto_ops.poll = tls_sw_poll; + tls_sw_proto_ops.splice_read = tls_sw_splice_read; + tcp_register_ulp(&tcp_tls_ulp_ops); return 0; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 1c79d9ad1731..4dc766b03f00 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -34,11 +34,60 @@ * SOFTWARE. */ +#include #include #include +#include #include +static int tls_do_decryption(struct sock *sk, + struct scatterlist *sgin, + struct scatterlist *sgout, + char *iv_recv, + size_t data_len, + struct sk_buff *skb, + gfp_t flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + struct aead_request *aead_req; + + int ret; + unsigned int req_size = sizeof(struct aead_request) + + crypto_aead_reqsize(ctx->aead_recv); + + aead_req = kzalloc(req_size, flags); + if (!aead_req) + return -ENOMEM; + + aead_request_set_tfm(aead_req, ctx->aead_recv); + aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE); + aead_request_set_crypt(aead_req, sgin, sgout, + data_len + tls_ctx->rx.tag_size, + (u8 *)iv_recv); + aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &ctx->async_wait); + + ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait); + + if (ret < 0) + goto out; + + rxm->offset += tls_ctx->rx.prepend_size; + rxm->full_len -= tls_ctx->rx.overhead_size; + tls_advance_record_sn(sk, &tls_ctx->rx); + + ctx->decrypted = true; + + ctx->saved_data_ready(sk); + +out: + kfree(aead_req); + return ret; +} + static void trim_sg(struct sock *sk, struct scatterlist *sg, int *sg_num_elem, unsigned int *sg_size, int target_size) { @@ -581,13 +630,404 @@ sendpage_end: return ret; } -void tls_sw_free_tx_resources(struct sock *sk) +static struct sk_buff *tls_wait_data(struct sock *sk, int flags, + long timeo, int *err) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct sk_buff *skb; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + while (!(skb = ctx->recv_pkt)) { + if (sk->sk_err) { + *err = sock_error(sk); + return NULL; + } + + if (sock_flag(sk, SOCK_DONE)) + return NULL; + + if ((flags & MSG_DONTWAIT) || !timeo) { + *err = -EAGAIN; + return NULL; + } + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + sk_wait_event(sk, &timeo, ctx->recv_pkt != skb, &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + + /* Handle signals */ + if (signal_pending(current)) { + *err = sock_intr_errno(timeo); + return NULL; + } + } + + return skb; +} + +static int decrypt_skb(struct sock *sk, struct sk_buff *skb, + struct scatterlist *sgout) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + tls_ctx->rx.iv_size]; + struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2]; + struct scatterlist *sgin = &sgin_arr[0]; + struct strp_msg *rxm = strp_msg(skb); + int ret, nsg = ARRAY_SIZE(sgin_arr); + char aad_recv[TLS_AAD_SPACE_SIZE]; + struct sk_buff *unused; + + ret = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE, + iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + tls_ctx->rx.iv_size); + if (ret < 0) + return ret; + + memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + if (!sgout) { + nsg = skb_cow_data(skb, 0, &unused) + 1; + sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation); + if (!sgout) + sgout = sgin; + } + + sg_init_table(sgin, nsg); + sg_set_buf(&sgin[0], aad_recv, sizeof(aad_recv)); + + nsg = skb_to_sgvec(skb, &sgin[1], + rxm->offset + tls_ctx->rx.prepend_size, + rxm->full_len - tls_ctx->rx.prepend_size); + + tls_make_aad(aad_recv, + rxm->full_len - tls_ctx->rx.overhead_size, + tls_ctx->rx.rec_seq, + tls_ctx->rx.rec_seq_size, + ctx->control); + + ret = tls_do_decryption(sk, sgin, sgout, iv, + rxm->full_len - tls_ctx->rx.overhead_size, + skb, sk->sk_allocation); + + if (sgin != &sgin_arr[0]) + kfree(sgin); + + return ret; +} + +static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb, + unsigned int len) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm = strp_msg(skb); + + if (len < rxm->full_len) { + rxm->offset += len; + rxm->full_len -= len; + + return false; + } + + /* Finished with message */ + ctx->recv_pkt = NULL; + kfree_skb(skb); + strp_unpause(&ctx->strp); + + return true; +} + +int tls_sw_recvmsg(struct sock *sk, + struct msghdr *msg, + size_t len, + int nonblock, + int flags, + int *addr_len) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + unsigned char control; + struct strp_msg *rxm; + struct sk_buff *skb; + ssize_t copied = 0; + bool cmsg = false; + int err = 0; + long timeo; + + flags |= nonblock; + + if (unlikely(flags & MSG_ERRQUEUE)) + return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR); + + lock_sock(sk); + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + do { + bool zc = false; + int chunk = 0; + + skb = tls_wait_data(sk, flags, timeo, &err); + if (!skb) + goto recv_end; + + rxm = strp_msg(skb); + if (!cmsg) { + int cerr; + + cerr = put_cmsg(msg, SOL_TLS, TLS_GET_RECORD_TYPE, + sizeof(ctx->control), &ctx->control); + cmsg = true; + control = ctx->control; + if (ctx->control != TLS_RECORD_TYPE_DATA) { + if (cerr || msg->msg_flags & MSG_CTRUNC) { + err = -EIO; + goto recv_end; + } + } + } else if (control != ctx->control) { + goto recv_end; + } + + if (!ctx->decrypted) { + int page_count; + int to_copy; + + page_count = iov_iter_npages(&msg->msg_iter, + MAX_SKB_FRAGS); + to_copy = rxm->full_len - tls_ctx->rx.overhead_size; + if (to_copy <= len && page_count < MAX_SKB_FRAGS && + likely(!(flags & MSG_PEEK))) { + struct scatterlist sgin[MAX_SKB_FRAGS + 1]; + char unused[21]; + int pages = 0; + + zc = true; + sg_init_table(sgin, MAX_SKB_FRAGS + 1); + sg_set_buf(&sgin[0], unused, 13); + + err = zerocopy_from_iter(sk, &msg->msg_iter, + to_copy, &pages, + &chunk, &sgin[1], + MAX_SKB_FRAGS, false); + if (err < 0) + goto fallback_to_reg_recv; + + err = decrypt_skb(sk, skb, sgin); + for (; pages > 0; pages--) + put_page(sg_page(&sgin[pages])); + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto recv_end; + } + } else { +fallback_to_reg_recv: + err = decrypt_skb(sk, skb, NULL); + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto recv_end; + } + } + ctx->decrypted = true; + } + + if (!zc) { + chunk = min_t(unsigned int, rxm->full_len, len); + err = skb_copy_datagram_msg(skb, rxm->offset, msg, + chunk); + if (err < 0) + goto recv_end; + } + + copied += chunk; + len -= chunk; + if (likely(!(flags & MSG_PEEK))) { + u8 control = ctx->control; + + if (tls_sw_advance_skb(sk, skb, chunk)) { + /* Return full control message to + * userspace before trying to parse + * another message type + */ + msg->msg_flags |= MSG_EOR; + if (control != TLS_RECORD_TYPE_DATA) + goto recv_end; + } + } + } while (len); + +recv_end: + release_sock(sk); + return copied ? : err; +} + +ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct tls_context *tls_ctx = tls_get_ctx(sock->sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm = NULL; + struct sock *sk = sock->sk; + struct sk_buff *skb; + ssize_t copied = 0; + int err = 0; + long timeo; + int chunk; + + lock_sock(sk); + + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + skb = tls_wait_data(sk, flags, timeo, &err); + if (!skb) + goto splice_read_end; + + /* splice does not support reading control messages */ + if (ctx->control != TLS_RECORD_TYPE_DATA) { + err = -ENOTSUPP; + goto splice_read_end; + } + + if (!ctx->decrypted) { + err = decrypt_skb(sk, skb, NULL); + + if (err < 0) { + tls_err_abort(sk, EBADMSG); + goto splice_read_end; + } + ctx->decrypted = true; + } + rxm = strp_msg(skb); + + chunk = min_t(unsigned int, rxm->full_len, len); + copied = skb_splice_bits(skb, sk, rxm->offset, pipe, chunk, flags); + if (copied < 0) + goto splice_read_end; + + if (likely(!(flags & MSG_PEEK))) + tls_sw_advance_skb(sk, skb, copied); + +splice_read_end: + release_sock(sk); + return copied ? : err; +} + +unsigned int tls_sw_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + unsigned int ret; + struct sock *sk = sock->sk; + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + + /* Grab POLLOUT and POLLHUP from the underlying socket */ + ret = ctx->sk_poll(file, sock, wait); + + /* Clear POLLIN bits, and set based on recv_pkt */ + ret &= ~(POLLIN | POLLRDNORM); + if (ctx->recv_pkt) + ret |= POLLIN | POLLRDNORM; + + return ret; +} + +static int tls_read_size(struct strparser *strp, struct sk_buff *skb) +{ + struct tls_context *tls_ctx = tls_get_ctx(strp->sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + char header[tls_ctx->rx.prepend_size]; + struct strp_msg *rxm = strp_msg(skb); + size_t cipher_overhead; + size_t data_len = 0; + int ret; + + /* Verify that we have a full TLS header, or wait for more data */ + if (rxm->offset + tls_ctx->rx.prepend_size > skb->len) + return 0; + + /* Linearize header to local buffer */ + ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size); + + if (ret < 0) + goto read_failure; + + ctx->control = header[0]; + + data_len = ((header[4] & 0xFF) | (header[3] << 8)); + + cipher_overhead = tls_ctx->rx.tag_size + tls_ctx->rx.iv_size; + + if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead) { + ret = -EMSGSIZE; + goto read_failure; + } + if (data_len < cipher_overhead) { + ret = -EBADMSG; + goto read_failure; + } + + if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.version) || + header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.version)) { + ret = -EINVAL; + goto read_failure; + } + + return data_len + TLS_HEADER_SIZE; + +read_failure: + tls_err_abort(strp->sk, ret); + + return ret; +} + +static void tls_queue(struct strparser *strp, struct sk_buff *skb) +{ + struct tls_context *tls_ctx = tls_get_ctx(strp->sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + struct strp_msg *rxm; + + rxm = strp_msg(skb); + + ctx->decrypted = false; + + ctx->recv_pkt = skb; + strp_pause(strp); + + strp->sk->sk_state_change(strp->sk); +} + +static void tls_data_ready(struct sock *sk) +{ + struct tls_context *tls_ctx = tls_get_ctx(sk); + struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); + + strp_data_ready(&ctx->strp); +} + +void tls_sw_free_resources(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); if (ctx->aead_send) crypto_free_aead(ctx->aead_send); + if (ctx->aead_recv) { + if (ctx->recv_pkt) { + kfree_skb(ctx->recv_pkt); + ctx->recv_pkt = NULL; + } + crypto_free_aead(ctx->aead_recv); + strp_stop(&ctx->strp); + write_lock_bh(&sk->sk_callback_lock); + sk->sk_data_ready = ctx->saved_data_ready; + write_unlock_bh(&sk->sk_callback_lock); + release_sock(sk); + strp_done(&ctx->strp); + lock_sock(sk); + } tls_free_both_sg(sk); @@ -595,12 +1035,15 @@ void tls_sw_free_tx_resources(struct sock *sk) kfree(tls_ctx); } -int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) +int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) { char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; struct tls_crypto_info *crypto_info; struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; struct tls_sw_context *sw_ctx; + struct cipher_context *cctx; + struct crypto_aead **aead; + struct strp_callbacks cb; u16 nonce_size, tag_size, iv_size, rec_seq_size; char *iv, *rec_seq; int rc = 0; @@ -610,22 +1053,29 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) goto out; } - if (ctx->priv_ctx) { - rc = -EEXIST; - goto out; - } - - sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL); - if (!sw_ctx) { - rc = -ENOMEM; - goto out; + if (!ctx->priv_ctx) { + sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL); + if (!sw_ctx) { + rc = -ENOMEM; + goto out; + } + crypto_init_wait(&sw_ctx->async_wait); + } else { + sw_ctx = ctx->priv_ctx; } - crypto_init_wait(&sw_ctx->async_wait); - ctx->priv_ctx = (struct tls_offload_context *)sw_ctx; - crypto_info = &ctx->crypto_send; + if (tx) { + crypto_info = &ctx->crypto_send; + cctx = &ctx->tx; + aead = &sw_ctx->aead_send; + } else { + crypto_info = &ctx->crypto_recv; + cctx = &ctx->rx; + aead = &sw_ctx->aead_recv; + } + switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: { nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; @@ -644,48 +1094,49 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) goto free_priv; } - ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size; - ctx->tx.tag_size = tag_size; - ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size; - ctx->tx.iv_size = iv_size; - ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - GFP_KERNEL); - if (!ctx->tx.iv) { + cctx->prepend_size = TLS_HEADER_SIZE + nonce_size; + cctx->tag_size = tag_size; + cctx->overhead_size = cctx->prepend_size + cctx->tag_size; + cctx->iv_size = iv_size; + cctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + GFP_KERNEL); + if (!cctx->iv) { rc = -ENOMEM; goto free_priv; } - memcpy(ctx->tx.iv, gcm_128_info->salt, - TLS_CIPHER_AES_GCM_128_SALT_SIZE); - memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); - ctx->tx.rec_seq_size = rec_seq_size; - ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); - if (!ctx->tx.rec_seq) { + memcpy(cctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); + cctx->rec_seq_size = rec_seq_size; + cctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL); + if (!cctx->rec_seq) { rc = -ENOMEM; goto free_iv; } - memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size); - - sg_init_table(sw_ctx->sg_encrypted_data, - ARRAY_SIZE(sw_ctx->sg_encrypted_data)); - sg_init_table(sw_ctx->sg_plaintext_data, - ARRAY_SIZE(sw_ctx->sg_plaintext_data)); - - sg_init_table(sw_ctx->sg_aead_in, 2); - sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space, - sizeof(sw_ctx->aad_space)); - sg_unmark_end(&sw_ctx->sg_aead_in[1]); - sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data); - sg_init_table(sw_ctx->sg_aead_out, 2); - sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space, - sizeof(sw_ctx->aad_space)); - sg_unmark_end(&sw_ctx->sg_aead_out[1]); - sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data); - - if (!sw_ctx->aead_send) { - sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0); - if (IS_ERR(sw_ctx->aead_send)) { - rc = PTR_ERR(sw_ctx->aead_send); - sw_ctx->aead_send = NULL; + memcpy(cctx->rec_seq, rec_seq, rec_seq_size); + + if (tx) { + sg_init_table(sw_ctx->sg_encrypted_data, + ARRAY_SIZE(sw_ctx->sg_encrypted_data)); + sg_init_table(sw_ctx->sg_plaintext_data, + ARRAY_SIZE(sw_ctx->sg_plaintext_data)); + + sg_init_table(sw_ctx->sg_aead_in, 2); + sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space, + sizeof(sw_ctx->aad_space)); + sg_unmark_end(&sw_ctx->sg_aead_in[1]); + sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data); + sg_init_table(sw_ctx->sg_aead_out, 2); + sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space, + sizeof(sw_ctx->aad_space)); + sg_unmark_end(&sw_ctx->sg_aead_out[1]); + sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data); + } + + if (!*aead) { + *aead = crypto_alloc_aead("gcm(aes)", 0, 0); + if (IS_ERR(*aead)) { + rc = PTR_ERR(*aead); + *aead = NULL; goto free_rec_seq; } } @@ -694,21 +1145,41 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx) memcpy(keyval, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE); - rc = crypto_aead_setkey(sw_ctx->aead_send, keyval, + rc = crypto_aead_setkey(*aead, keyval, TLS_CIPHER_AES_GCM_128_KEY_SIZE); if (rc) goto free_aead; - rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tx.tag_size); - if (!rc) - return 0; + rc = crypto_aead_setauthsize(*aead, cctx->tag_size); + if (rc) + goto free_aead; + + if (!tx) { + /* Set up strparser */ + memset(&cb, 0, sizeof(cb)); + cb.rcv_msg = tls_queue; + cb.parse_msg = tls_read_size; + + strp_init(&sw_ctx->strp, sk, &cb); + + write_lock_bh(&sk->sk_callback_lock); + sw_ctx->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = tls_data_ready; + write_unlock_bh(&sk->sk_callback_lock); + + sw_ctx->sk_poll = sk->sk_socket->ops->poll; + + strp_check_rcv(&sw_ctx->strp); + } + + goto out; free_aead: - crypto_free_aead(sw_ctx->aead_send); - sw_ctx->aead_send = NULL; + crypto_free_aead(*aead); + *aead = NULL; free_rec_seq: - kfree(ctx->tx.rec_seq); - ctx->tx.rec_seq = NULL; + kfree(cctx->rec_seq); + cctx->rec_seq = NULL; free_iv: kfree(ctx->tx.iv); ctx->tx.iv = NULL; -- cgit v1.2.3-70-g09d2 From 82792a070b168a9659c78e6a19efe5b6d1a7ce73 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 23 Mar 2018 18:27:06 +0200 Subject: net: bridge: fix direct access to bridge vlan_enabled and use helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to use br_vlan_enabled() helper otherwise we'll break builds without bridge vlans: net/bridge//br_if.c: In function ‘br_mtu’: net/bridge//br_if.c:458:8: error: ‘const struct net_bridge’ has no member named ‘vlan_enabled’ if (br->vlan_enabled) ^ net/bridge//br_if.c:462:1: warning: control reaches end of non-void function [-Wreturn-type] } ^ scripts/Makefile.build:324: recipe for target 'net/bridge//br_if.o' failed Fixes: 419d14af9e07 ("bridge: Allow max MTU when multiple VLANs present") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 48dc4d2e2be3..87b2afd455c7 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -455,7 +455,7 @@ static int __br_mtu(const struct net_bridge *br, bool (compare_fn)(int, int)) int br_mtu(const struct net_bridge *br) { - if (br->vlan_enabled) + if (br_vlan_enabled(br->dev)) return __br_mtu(br, max_mtu); else return __br_mtu(br, min_mtu); -- cgit v1.2.3-70-g09d2 From fc18999ed2a217f249e9880363b28a89c839737e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 22 Mar 2018 21:34:46 +0300 Subject: net: Convert udp_sysctl_ops These pernet_operations just initialize udp4 defaults. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/ipv4/udp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 908fc02fb4f8..c6dc019bc64b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2842,7 +2842,8 @@ static int __net_init udp_sysctl_init(struct net *net) } static struct pernet_operations __net_initdata udp_sysctl_ops = { - .init = udp_sysctl_init, + .init = udp_sysctl_init, + .async = true, }; void __init udp_init(void) -- cgit v1.2.3-70-g09d2 From b2864fbdc5ab6bb4750b00580b67070bd0ab3762 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 22 Mar 2018 21:34:55 +0300 Subject: net: Convert rxrpc_net_ops These pernet_operations modifies rxrpc_net_id-pointed per-net entities. There is external link to AF_RXRPC in fs/afs/Kconfig, but it seems there is no other pernet_operations interested in that per-net entities. Signed-off-by: Kirill Tkhai Acked-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/net_ns.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index f18c9248e0d4..5fd939dabf41 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -106,4 +106,5 @@ struct pernet_operations rxrpc_net_ops = { .exit = rxrpc_exit_net, .id = &rxrpc_net_id, .size = sizeof(struct rxrpc_net), + .async = true, }; -- cgit v1.2.3-70-g09d2 From cb30a63384bc91d5da06e1cede1115f666a29271 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 22 Mar 2018 20:42:45 +0100 Subject: tipc: refactor function tipc_enable_bearer() As a preparation for the next commits we try to reduce the footprint of the function tipc_enable_bearer(), while hopefully making is simpler to follow. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bearer.c | 136 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 70 insertions(+), 66 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index f3d2e83313e1..e18cb271b005 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -230,88 +230,90 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) * tipc_enable_bearer - enable bearer with the given name */ static int tipc_enable_bearer(struct net *net, const char *name, - u32 disc_domain, u32 priority, + u32 disc_domain, u32 prio, struct nlattr *attr[]) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); + struct tipc_bearer_names b_names; + u32 self = tipc_own_addr(net); + int with_this_prio = 1; struct tipc_bearer *b; struct tipc_media *m; - struct tipc_bearer_names b_names; struct sk_buff *skb; char addr_string[16]; - u32 bearer_id; - u32 with_this_prio; - u32 i; + int bearer_id = 0; int res = -EINVAL; + char *errstr = ""; - if (!tn->own_addr) { - pr_warn("Bearer <%s> rejected, not supported in standalone mode\n", - name); - return -ENOPROTOOPT; + if (!self) { + errstr = "not supported in standalone mode"; + res = -ENOPROTOOPT; + goto rejected; } + if (!bearer_name_validate(name, &b_names)) { - pr_warn("Bearer <%s> rejected, illegal name\n", name); - return -EINVAL; + errstr = "illegal name"; + goto rejected; } - if (tipc_addr_domain_valid(disc_domain) && - (disc_domain != tn->own_addr)) { - if (tipc_in_scope(disc_domain, tn->own_addr)) { - disc_domain = tn->own_addr & TIPC_ZONE_CLUSTER_MASK; - res = 0; /* accept any node in own cluster */ - } else if (in_own_cluster_exact(net, disc_domain)) - res = 0; /* accept specified node in own cluster */ + + if (tipc_addr_domain_valid(disc_domain) && disc_domain != self) { + if (tipc_in_scope(disc_domain, self)) { + /* Accept any node in own cluster */ + disc_domain = self & TIPC_ZONE_CLUSTER_MASK; + res = 0; + } else if (in_own_cluster_exact(net, disc_domain)) { + /* Accept specified node in own cluster */ + res = 0; + } } if (res) { - pr_warn("Bearer <%s> rejected, illegal discovery domain\n", - name); - return -EINVAL; + errstr = "illegal discovery domain"; + goto rejected; } - if ((priority > TIPC_MAX_LINK_PRI) && - (priority != TIPC_MEDIA_LINK_PRI)) { - pr_warn("Bearer <%s> rejected, illegal priority\n", name); - return -EINVAL; + + if (prio > TIPC_MAX_LINK_PRI && prio != TIPC_MEDIA_LINK_PRI) { + errstr = "illegal priority"; + goto rejected; } m = tipc_media_find(b_names.media_name); if (!m) { - pr_warn("Bearer <%s> rejected, media <%s> not registered\n", - name, b_names.media_name); - return -EINVAL; + errstr = "media not registered"; + goto rejected; } - if (priority == TIPC_MEDIA_LINK_PRI) - priority = m->priority; + if (prio == TIPC_MEDIA_LINK_PRI) + prio = m->priority; -restart: - bearer_id = MAX_BEARERS; - with_this_prio = 1; - for (i = MAX_BEARERS; i-- != 0; ) { - b = rtnl_dereference(tn->bearer_list[i]); - if (!b) { - bearer_id = i; - continue; - } + /* Check new bearer vs existing ones and find free bearer id if any */ + while (bearer_id < MAX_BEARERS) { + b = rtnl_dereference(tn->bearer_list[bearer_id]); + if (!b) + break; if (!strcmp(name, b->name)) { - pr_warn("Bearer <%s> rejected, already enabled\n", - name); - return -EINVAL; + errstr = "already enabled"; + goto rejected; } - if ((b->priority == priority) && - (++with_this_prio > 2)) { - if (priority-- == 0) { - pr_warn("Bearer <%s> rejected, duplicate priority\n", - name); - return -EINVAL; - } - pr_warn("Bearer <%s> priority adjustment required %u->%u\n", - name, priority + 1, priority); - goto restart; + bearer_id++; + if (b->priority != prio) + continue; + if (++with_this_prio <= 2) + continue; + pr_warn("Bearer <%s>: already 2 bearers with priority %u\n", + name, prio); + if (prio == TIPC_MIN_LINK_PRI) { + errstr = "cannot adjust to lower"; + goto rejected; } + pr_warn("Bearer <%s>: trying with adjusted priority\n", name); + prio--; + bearer_id = 0; + with_this_prio = 1; } + if (bearer_id >= MAX_BEARERS) { - pr_warn("Bearer <%s> rejected, bearer limit reached (%u)\n", - name, MAX_BEARERS); - return -EINVAL; + errstr = "max 3 bearers permitted"; + goto rejected; } b = kzalloc(sizeof(*b), GFP_ATOMIC); @@ -322,10 +324,9 @@ restart: b->media = m; res = m->enable_media(net, b, attr); if (res) { - pr_warn("Bearer <%s> rejected, enable failure (%d)\n", - name, -res); kfree(b); - return -EINVAL; + errstr = "failed to enable media"; + goto rejected; } b->identity = bearer_id; @@ -333,15 +334,15 @@ restart: b->window = m->window; b->domain = disc_domain; b->net_plane = bearer_id + 'A'; - b->priority = priority; + b->priority = prio; test_and_set_bit_lock(0, &b->up); res = tipc_disc_create(net, b, &b->bcast_addr, &skb); if (res) { bearer_disable(net, b); - pr_warn("Bearer <%s> rejected, discovery object creation failed\n", - name); - return -EINVAL; + kfree(b); + errstr = "failed to create discoverer"; + goto rejected; } rcu_assign_pointer(tn->bearer_list[bearer_id], b); @@ -353,9 +354,12 @@ restart: return -ENOMEM; } - pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n", - name, - tipc_addr_string_fill(addr_string, disc_domain), priority); + tipc_addr_string_fill(addr_string, disc_domain); + pr_info("Enabled bearer <%s>, discovery scope %s, priority %u\n", + name, addr_string, prio); + return res; +rejected: + pr_warn("Bearer <%s> rejected, %s\n", name, errstr); return res; } -- cgit v1.2.3-70-g09d2 From b39e465e56ec38ca64b4c0affeb6411eb0ed7267 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 22 Mar 2018 20:42:46 +0100 Subject: tipc: some cleanups in the file discover.c To facilitate the coming changes in the neighbor discovery functionality we make some renaming and refactoring of that code. The functional changes in this commit are trivial, e.g., that we move the message sending call in tipc_disc_timeout() outside the spinlock protected region. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bearer.c | 8 +- net/tipc/bearer.h | 2 +- net/tipc/discover.c | 303 +++++++++++++++++++++++++--------------------------- net/tipc/discover.h | 8 +- 4 files changed, 155 insertions(+), 166 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index e18cb271b005..76340b9e4851 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -210,7 +210,7 @@ void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest) rcu_read_lock(); b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); if (b) - tipc_disc_add_dest(b->link_req); + tipc_disc_add_dest(b->disc); rcu_read_unlock(); } @@ -222,7 +222,7 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) rcu_read_lock(); b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); if (b) - tipc_disc_remove_dest(b->link_req); + tipc_disc_remove_dest(b->disc); rcu_read_unlock(); } @@ -389,8 +389,8 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b) tipc_node_delete_links(net, bearer_id); b->media->disable_media(b); RCU_INIT_POINTER(b->media_ptr, NULL); - if (b->link_req) - tipc_disc_delete(b->link_req); + if (b->disc) + tipc_disc_delete(b->disc); RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL); kfree_rcu(b, rcu); tipc_mon_delete(net, bearer_id); diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index a53613d95bc9..6efcee63a381 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -159,7 +159,7 @@ struct tipc_bearer { u32 tolerance; u32 domain; u32 identity; - struct tipc_link_req *link_req; + struct tipc_discoverer *disc; char net_plane; unsigned long up; }; diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 92e4828c6b09..09f75558d353 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -39,34 +39,34 @@ #include "discover.h" /* min delay during bearer start up */ -#define TIPC_LINK_REQ_INIT msecs_to_jiffies(125) +#define TIPC_DISC_INIT msecs_to_jiffies(125) /* max delay if bearer has no links */ -#define TIPC_LINK_REQ_FAST msecs_to_jiffies(1000) +#define TIPC_DISC_FAST msecs_to_jiffies(1000) /* max delay if bearer has links */ -#define TIPC_LINK_REQ_SLOW msecs_to_jiffies(60000) +#define TIPC_DISC_SLOW msecs_to_jiffies(60000) /* indicates no timer in use */ -#define TIPC_LINK_REQ_INACTIVE 0xffffffff +#define TIPC_DISC_INACTIVE 0xffffffff /** - * struct tipc_link_req - information about an ongoing link setup request + * struct tipc_discoverer - information about an ongoing link setup request * @bearer_id: identity of bearer issuing requests * @net: network namespace instance * @dest: destination address for request messages * @domain: network domain to which links can be established * @num_nodes: number of nodes currently discovered (i.e. with an active link) * @lock: spinlock for controlling access to requests - * @buf: request message to be (repeatedly) sent + * @skb: request message to be (repeatedly) sent * @timer: timer governing period between requests * @timer_intv: current interval between requests (in ms) */ -struct tipc_link_req { +struct tipc_discoverer { u32 bearer_id; struct tipc_media_addr dest; struct net *net; u32 domain; int num_nodes; spinlock_t lock; - struct sk_buff *buf; + struct sk_buff *skb; struct timer_list timer; unsigned long timer_intv; }; @@ -77,22 +77,35 @@ struct tipc_link_req { * @type: message type (request or response) * @b: ptr to bearer issuing message */ -static void tipc_disc_init_msg(struct net *net, struct sk_buff *buf, u32 type, - struct tipc_bearer *b) +static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, + u32 mtyp, struct tipc_bearer *b) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_msg *msg; + struct tipc_net *tn = tipc_net(net); u32 dest_domain = b->domain; + struct tipc_msg *hdr; - msg = buf_msg(buf); - tipc_msg_init(tn->own_addr, msg, LINK_CONFIG, type, + hdr = buf_msg(skb); + tipc_msg_init(tn->own_addr, hdr, LINK_CONFIG, mtyp, MAX_H_SIZE, dest_domain); - msg_set_non_seq(msg, 1); - msg_set_node_sig(msg, tn->random); - msg_set_node_capabilities(msg, TIPC_NODE_CAPABILITIES); - msg_set_dest_domain(msg, dest_domain); - msg_set_bc_netid(msg, tn->net_id); - b->media->addr2msg(msg_media_addr(msg), &b->addr); + msg_set_non_seq(hdr, 1); + msg_set_node_sig(hdr, tn->random); + msg_set_node_capabilities(hdr, TIPC_NODE_CAPABILITIES); + msg_set_dest_domain(hdr, dest_domain); + msg_set_bc_netid(hdr, tn->net_id); + b->media->addr2msg(msg_media_addr(hdr), &b->addr); +} + +static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, u32 src, + struct tipc_media_addr *maddr, + struct tipc_bearer *b) +{ + struct sk_buff *skb; + + skb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC); + if (!skb) + return; + tipc_disc_init_msg(net, skb, mtyp, b); + tipc_bearer_xmit_skb(net, b->identity, skb, maddr); } /** @@ -116,149 +129,123 @@ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr, /** * tipc_disc_rcv - handle incoming discovery message (request or response) - * @net: the applicable net namespace - * @buf: buffer containing message - * @bearer: bearer that message arrived on + * @net: applicable net namespace + * @skb: buffer containing message + * @b: bearer that message arrived on */ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, - struct tipc_bearer *bearer) + struct tipc_bearer *b) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_media_addr maddr; - struct sk_buff *rskb; + struct tipc_net *tn = tipc_net(net); struct tipc_msg *hdr = buf_msg(skb); - u32 ddom = msg_dest_domain(hdr); - u32 onode = msg_prevnode(hdr); + u16 caps = msg_node_capabilities(hdr); + u32 signature = msg_node_sig(hdr); + u32 dst = msg_dest_domain(hdr); u32 net_id = msg_bc_netid(hdr); + u32 self = tipc_own_addr(net); + struct tipc_media_addr maddr; + u32 src = msg_prevnode(hdr); u32 mtyp = msg_type(hdr); - u32 signature = msg_node_sig(hdr); - u16 caps = msg_node_capabilities(hdr); - bool respond = false; bool dupl_addr = false; + bool respond = false; int err; - err = bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr)); + err = b->media->msg2addr(b, &maddr, msg_media_addr(hdr)); kfree_skb(skb); - if (err) + if (err || maddr.broadcast) { + pr_warn_ratelimited("Rcv corrupt discovery message\n"); return; - - /* Ensure message from node is valid and communication is permitted */ - if (net_id != tn->net_id) + } + /* Ignore discovery messages from own node */ + if (!memcmp(&maddr, &b->addr, sizeof(maddr))) return; - if (maddr.broadcast) + if (net_id != tn->net_id) return; - if (!tipc_addr_domain_valid(ddom)) + if (!tipc_addr_domain_valid(dst)) return; - if (!tipc_addr_node_valid(onode)) + if (!tipc_addr_node_valid(src)) return; - - if (in_own_node(net, onode)) { - if (memcmp(&maddr, &bearer->addr, sizeof(maddr))) - disc_dupl_alert(bearer, tn->own_addr, &maddr); + if (in_own_node(net, src)) { + disc_dupl_alert(b, self, &maddr); return; } - if (!tipc_in_scope(ddom, tn->own_addr)) + if (!tipc_in_scope(dst, self)) return; - if (!tipc_in_scope(bearer->domain, onode)) + if (!tipc_in_scope(b->domain, src)) return; - - tipc_node_check_dest(net, onode, bearer, caps, signature, + tipc_node_check_dest(net, src, b, caps, signature, &maddr, &respond, &dupl_addr); if (dupl_addr) - disc_dupl_alert(bearer, onode, &maddr); - - /* Send response, if necessary */ - if (respond && (mtyp == DSC_REQ_MSG)) { - rskb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC); - if (!rskb) - return; - tipc_disc_init_msg(net, rskb, DSC_RESP_MSG, bearer); - tipc_bearer_xmit_skb(net, bearer->identity, rskb, &maddr); - } + disc_dupl_alert(b, src, &maddr); + if (!respond) + return; + if (mtyp != DSC_REQ_MSG) + return; + tipc_disc_msg_xmit(net, DSC_RESP_MSG, src, self, &maddr, b); } -/** - * disc_update - update frequency of periodic link setup requests - * @req: ptr to link request structure - * - * Reinitiates discovery process if discovery object has no associated nodes - * and is either not currently searching or is searching at a slow rate +/* tipc_disc_add_dest - increment set of discovered nodes */ -static void disc_update(struct tipc_link_req *req) +void tipc_disc_add_dest(struct tipc_discoverer *d) { - if (!req->num_nodes) { - if ((req->timer_intv == TIPC_LINK_REQ_INACTIVE) || - (req->timer_intv > TIPC_LINK_REQ_FAST)) { - req->timer_intv = TIPC_LINK_REQ_INIT; - mod_timer(&req->timer, jiffies + req->timer_intv); - } - } + spin_lock_bh(&d->lock); + d->num_nodes++; + spin_unlock_bh(&d->lock); } -/** - * tipc_disc_add_dest - increment set of discovered nodes - * @req: ptr to link request structure +/* tipc_disc_remove_dest - decrement set of discovered nodes */ -void tipc_disc_add_dest(struct tipc_link_req *req) +void tipc_disc_remove_dest(struct tipc_discoverer *d) { - spin_lock_bh(&req->lock); - req->num_nodes++; - spin_unlock_bh(&req->lock); -} + int intv, num; -/** - * tipc_disc_remove_dest - decrement set of discovered nodes - * @req: ptr to link request structure - */ -void tipc_disc_remove_dest(struct tipc_link_req *req) -{ - spin_lock_bh(&req->lock); - req->num_nodes--; - disc_update(req); - spin_unlock_bh(&req->lock); + spin_lock_bh(&d->lock); + d->num_nodes--; + num = d->num_nodes; + intv = d->timer_intv; + if (!num && (intv == TIPC_DISC_INACTIVE || intv > TIPC_DISC_FAST)) { + d->timer_intv = TIPC_DISC_INIT; + mod_timer(&d->timer, jiffies + d->timer_intv); + } + spin_unlock_bh(&d->lock); } -/** - * disc_timeout - send a periodic link setup request - * @data: ptr to link request structure - * +/* tipc_disc_timeout - send a periodic link setup request * Called whenever a link setup request timer associated with a bearer expires. + * - Keep doubling time between sent request until limit is reached; + * - Hold at fast polling rate if we don't have any associated nodes + * - Otherwise hold at slow polling rate */ -static void disc_timeout(struct timer_list *t) +static void tipc_disc_timeout(struct timer_list *t) { - struct tipc_link_req *req = from_timer(req, t, timer); - struct sk_buff *skb; - int max_delay; + struct tipc_discoverer *d = from_timer(d, t, timer); + struct tipc_media_addr maddr; + struct sk_buff *skb = NULL; + struct net *net; + u32 bearer_id; - spin_lock_bh(&req->lock); + spin_lock_bh(&d->lock); /* Stop searching if only desired node has been found */ - if (tipc_node(req->domain) && req->num_nodes) { - req->timer_intv = TIPC_LINK_REQ_INACTIVE; + if (tipc_node(d->domain) && d->num_nodes) { + d->timer_intv = TIPC_DISC_INACTIVE; goto exit; } - - /* - * Send discovery message, then update discovery timer - * - * Keep doubling time between requests until limit is reached; - * hold at fast polling rate if don't have any associated nodes, - * otherwise hold at slow polling rate - */ - skb = skb_clone(req->buf, GFP_ATOMIC); - if (skb) - tipc_bearer_xmit_skb(req->net, req->bearer_id, skb, &req->dest); - req->timer_intv *= 2; - if (req->num_nodes) - max_delay = TIPC_LINK_REQ_SLOW; - else - max_delay = TIPC_LINK_REQ_FAST; - if (req->timer_intv > max_delay) - req->timer_intv = max_delay; - - mod_timer(&req->timer, jiffies + req->timer_intv); + /* Adjust timeout interval according to discovery phase */ + d->timer_intv *= 2; + if (d->num_nodes && d->timer_intv > TIPC_DISC_SLOW) + d->timer_intv = TIPC_DISC_SLOW; + else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST) + d->timer_intv = TIPC_DISC_FAST; + mod_timer(&d->timer, jiffies + d->timer_intv); + memcpy(&maddr, &d->dest, sizeof(maddr)); + skb = skb_clone(d->skb, GFP_ATOMIC); + net = d->net; + bearer_id = d->bearer_id; exit: - spin_unlock_bh(&req->lock); + spin_unlock_bh(&d->lock); + if (skb) + tipc_bearer_xmit_skb(net, bearer_id, skb, &maddr); } /** @@ -273,41 +260,41 @@ exit: int tipc_disc_create(struct net *net, struct tipc_bearer *b, struct tipc_media_addr *dest, struct sk_buff **skb) { - struct tipc_link_req *req; + struct tipc_discoverer *d; - req = kmalloc(sizeof(*req), GFP_ATOMIC); - if (!req) + d = kmalloc(sizeof(*d), GFP_ATOMIC); + if (!d) return -ENOMEM; - req->buf = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC); - if (!req->buf) { - kfree(req); + d->skb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC); + if (!d->skb) { + kfree(d); return -ENOMEM; } - tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b); - memcpy(&req->dest, dest, sizeof(*dest)); - req->net = net; - req->bearer_id = b->identity; - req->domain = b->domain; - req->num_nodes = 0; - req->timer_intv = TIPC_LINK_REQ_INIT; - spin_lock_init(&req->lock); - timer_setup(&req->timer, disc_timeout, 0); - mod_timer(&req->timer, jiffies + req->timer_intv); - b->link_req = req; - *skb = skb_clone(req->buf, GFP_ATOMIC); + tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); + memcpy(&d->dest, dest, sizeof(*dest)); + d->net = net; + d->bearer_id = b->identity; + d->domain = b->domain; + d->num_nodes = 0; + d->timer_intv = TIPC_DISC_INIT; + spin_lock_init(&d->lock); + timer_setup(&d->timer, tipc_disc_timeout, 0); + mod_timer(&d->timer, jiffies + d->timer_intv); + b->disc = d; + *skb = skb_clone(d->skb, GFP_ATOMIC); return 0; } /** * tipc_disc_delete - destroy object sending periodic link setup requests - * @req: ptr to link request structure + * @d: ptr to link duest structure */ -void tipc_disc_delete(struct tipc_link_req *req) +void tipc_disc_delete(struct tipc_discoverer *d) { - del_timer_sync(&req->timer); - kfree_skb(req->buf); - kfree(req); + del_timer_sync(&d->timer); + kfree_skb(d->skb); + kfree(d); } /** @@ -318,19 +305,21 @@ void tipc_disc_delete(struct tipc_link_req *req) */ void tipc_disc_reset(struct net *net, struct tipc_bearer *b) { - struct tipc_link_req *req = b->link_req; + struct tipc_discoverer *d = b->disc; + struct tipc_media_addr maddr; struct sk_buff *skb; - spin_lock_bh(&req->lock); - tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b); - req->net = net; - req->bearer_id = b->identity; - req->domain = b->domain; - req->num_nodes = 0; - req->timer_intv = TIPC_LINK_REQ_INIT; - mod_timer(&req->timer, jiffies + req->timer_intv); - skb = skb_clone(req->buf, GFP_ATOMIC); + spin_lock_bh(&d->lock); + tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); + d->net = net; + d->bearer_id = b->identity; + d->domain = b->domain; + d->num_nodes = 0; + d->timer_intv = TIPC_DISC_INIT; + memcpy(&maddr, &d->dest, sizeof(maddr)); + mod_timer(&d->timer, jiffies + d->timer_intv); + skb = skb_clone(d->skb, GFP_ATOMIC); + spin_unlock_bh(&d->lock); if (skb) - tipc_bearer_xmit_skb(net, req->bearer_id, skb, &req->dest); - spin_unlock_bh(&req->lock); + tipc_bearer_xmit_skb(net, b->identity, skb, &maddr); } diff --git a/net/tipc/discover.h b/net/tipc/discover.h index b80a335389c0..521d96c41dfd 100644 --- a/net/tipc/discover.h +++ b/net/tipc/discover.h @@ -37,14 +37,14 @@ #ifndef _TIPC_DISCOVER_H #define _TIPC_DISCOVER_H -struct tipc_link_req; +struct tipc_discoverer; int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr, struct tipc_media_addr *dest, struct sk_buff **skb); -void tipc_disc_delete(struct tipc_link_req *req); +void tipc_disc_delete(struct tipc_discoverer *req); void tipc_disc_reset(struct net *net, struct tipc_bearer *b_ptr); -void tipc_disc_add_dest(struct tipc_link_req *req); -void tipc_disc_remove_dest(struct tipc_link_req *req); +void tipc_disc_add_dest(struct tipc_discoverer *req); +void tipc_disc_remove_dest(struct tipc_discoverer *req); void tipc_disc_rcv(struct net *net, struct sk_buff *buf, struct tipc_bearer *b_ptr); -- cgit v1.2.3-70-g09d2 From 2026364149db36c6a2c0c8cae8362fe9a7f954dd Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 22 Mar 2018 20:42:47 +0100 Subject: tipc: remove restrictions on node address values Nominally, TIPC organizes network nodes into a three-level network hierarchy consisting of the levels 'zone', 'cluster' and 'node'. This hierarchy is reflected in the node address format, - it is sub-divided into an 8-bit zone id, and 12 bit cluster id, and a 12-bit node id. However, the 'zone' and 'cluster' levels have in reality never been fully implemented,and never will be. The result of this has been that the first 20 bits the node identity structure have been wasted, and the usable node identity range within a cluster has been limited to 12 bits. This is starting to become a problem. In the following commits, we will need to be able to connect between nodes which are using the whole 32-bit value space of the node address. We therefore remove the restrictions on which values can be assigned to node identity, -it is from now on only a 32-bit integer with no assumed internal structure. Isolation between clusters is now achieved only by setting different values for the 'network id' field used during neighbor discovery, in practice leading to the latter becoming the new cluster identity. The rules for accepting discovery requests/responses from neighboring nodes now become: - If the user is using legacy address format on both peers, reception of discovery messages is subject to the legacy lookup domain check in addition to the cluster id check. - Otherwise, the discovery request/response is always accepted, provided both peers have the same network id. This secures backwards compatibility for users who have been using zone or cluster identities as cluster separators, instead of the intended 'network id'. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/addr.c | 50 +------------------------------------------------- net/tipc/addr.h | 11 ----------- net/tipc/bearer.c | 27 ++++----------------------- net/tipc/discover.c | 15 +++++++-------- net/tipc/link.c | 6 ++---- net/tipc/net.c | 4 ++-- net/tipc/node.c | 8 ++------ net/tipc/node.h | 5 +++-- 8 files changed, 21 insertions(+), 105 deletions(-) (limited to 'net') diff --git a/net/tipc/addr.c b/net/tipc/addr.c index 97cd857d7f43..dfc31a730ca5 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -38,21 +38,6 @@ #include "addr.h" #include "core.h" -/** - * in_own_cluster - test for cluster inclusion; <0.0.0> always matches - */ -int in_own_cluster(struct net *net, u32 addr) -{ - return in_own_cluster_exact(net, addr) || !addr; -} - -int in_own_cluster_exact(struct net *net, u32 addr) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - - return !((addr ^ tn->own_addr) >> 12); -} - /** * in_own_node - test for node inclusion; <0.0.0> always matches */ @@ -63,46 +48,13 @@ int in_own_node(struct net *net, u32 addr) return (addr == tn->own_addr) || !addr; } -/** - * tipc_addr_domain_valid - validates a network domain address - * - * Accepts , , , and <0.0.0>, - * where Z, C, and N are non-zero. - * - * Returns 1 if domain address is valid, otherwise 0 - */ -int tipc_addr_domain_valid(u32 addr) -{ - u32 n = tipc_node(addr); - u32 c = tipc_cluster(addr); - u32 z = tipc_zone(addr); - - if (n && (!z || !c)) - return 0; - if (c && !z) - return 0; - return 1; -} - -/** - * tipc_addr_node_valid - validates a proposed network address for this node - * - * Accepts , where Z, C, and N are non-zero. - * - * Returns 1 if address can be used, otherwise 0 - */ -int tipc_addr_node_valid(u32 addr) -{ - return tipc_addr_domain_valid(addr) && tipc_node(addr); -} - int tipc_in_scope(u32 domain, u32 addr) { if (!domain || (domain == addr)) return 1; if (domain == tipc_cluster_mask(addr)) /* domain */ return 1; - if (domain == tipc_zone_mask(addr)) /* domain */ + if (domain == (addr & TIPC_ZONE_MASK)) /* domain */ return 1; return 0; } diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 2ecf5a5d40dd..5ffde51b0e68 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -50,11 +50,6 @@ static inline u32 tipc_own_addr(struct net *net) return tn->own_addr; } -static inline u32 tipc_zone_mask(u32 addr) -{ - return addr & TIPC_ZONE_MASK; -} - static inline u32 tipc_cluster_mask(u32 addr) { return addr & TIPC_ZONE_CLUSTER_MASK; @@ -71,14 +66,8 @@ static inline int tipc_scope2node(struct net *net, int sc) } u32 tipc_own_addr(struct net *net); -int in_own_cluster(struct net *net, u32 addr); -int in_own_cluster_exact(struct net *net, u32 addr); int in_own_node(struct net *net, u32 addr); -u32 addr_domain(struct net *net, u32 sc); -int tipc_addr_domain_valid(u32); -int tipc_addr_node_valid(u32 addr); int tipc_in_scope(u32 domain, u32 addr); -int tipc_addr_scope(u32 domain); char *tipc_addr_string_fill(char *string, u32 addr); #endif diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 76340b9e4851..a71f31879cb3 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -240,7 +240,6 @@ static int tipc_enable_bearer(struct net *net, const char *name, struct tipc_bearer *b; struct tipc_media *m; struct sk_buff *skb; - char addr_string[16]; int bearer_id = 0; int res = -EINVAL; char *errstr = ""; @@ -256,21 +255,6 @@ static int tipc_enable_bearer(struct net *net, const char *name, goto rejected; } - if (tipc_addr_domain_valid(disc_domain) && disc_domain != self) { - if (tipc_in_scope(disc_domain, self)) { - /* Accept any node in own cluster */ - disc_domain = self & TIPC_ZONE_CLUSTER_MASK; - res = 0; - } else if (in_own_cluster_exact(net, disc_domain)) { - /* Accept specified node in own cluster */ - res = 0; - } - } - if (res) { - errstr = "illegal discovery domain"; - goto rejected; - } - if (prio > TIPC_MAX_LINK_PRI && prio != TIPC_MEDIA_LINK_PRI) { errstr = "illegal priority"; goto rejected; @@ -354,12 +338,11 @@ static int tipc_enable_bearer(struct net *net, const char *name, return -ENOMEM; } - tipc_addr_string_fill(addr_string, disc_domain); - pr_info("Enabled bearer <%s>, discovery scope %s, priority %u\n", - name, addr_string, prio); + pr_info("Enabled bearer <%s>, priority %u\n", name, prio); + return res; rejected: - pr_warn("Bearer <%s> rejected, %s\n", name, errstr); + pr_warn("Enabling of bearer <%s> rejected, %s\n", name, errstr); return res; } @@ -865,12 +848,10 @@ int __tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) char *bearer; struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; struct net *net = sock_net(skb->sk); - struct tipc_net *tn = net_generic(net, tipc_net_id); - u32 domain; + u32 domain = 0; u32 prio; prio = TIPC_MEDIA_LINK_PRI; - domain = tn->own_addr & TIPC_ZONE_CLUSTER_MASK; if (!info->attrs[TIPC_NLA_BEARER]) return -EINVAL; diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 09f75558d353..669af125b3de 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -161,18 +161,17 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, return; if (net_id != tn->net_id) return; - if (!tipc_addr_domain_valid(dst)) - return; - if (!tipc_addr_node_valid(src)) - return; if (in_own_node(net, src)) { disc_dupl_alert(b, self, &maddr); return; } - if (!tipc_in_scope(dst, self)) - return; - if (!tipc_in_scope(b->domain, src)) - return; + /* Domain filter only works if both peers use legacy address format */ + if (b->domain) { + if (!tipc_in_scope(dst, self)) + return; + if (!tipc_in_scope(b->domain, src)) + return; + } tipc_node_check_dest(net, src, b, caps, signature, &maddr, &respond, &dupl_addr); if (dupl_addr) diff --git a/net/tipc/link.c b/net/tipc/link.c index 3c230466804d..86fde005ea47 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -434,7 +434,7 @@ char *tipc_link_name(struct tipc_link *l) */ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 ownnode, u32 peer, + int window, u32 session, u32 self, u32 peer, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, @@ -451,9 +451,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, l->session = session; /* Note: peer i/f name is completed by reset/activate message */ - sprintf(l->name, "%u.%u.%u:%s-%u.%u.%u:unknown", - tipc_zone(ownnode), tipc_cluster(ownnode), tipc_node(ownnode), - if_name, tipc_zone(peer), tipc_cluster(peer), tipc_node(peer)); + sprintf(l->name, "%x:%s-%x:unknown", self, if_name, peer); strcpy(l->if_name, if_name); l->addr = peer; l->peer_caps = peer_caps; diff --git a/net/tipc/net.c b/net/tipc/net.c index 5c4c4405b78e..a074f285e6ea 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -121,7 +121,7 @@ int tipc_net_start(struct net *net, u32 addr) TIPC_CLUSTER_SCOPE, 0, tn->own_addr); pr_info("Started in network mode\n"); - pr_info("Own node address %s, network identity %u\n", + pr_info("Own node address %s, cluster identity %u\n", tipc_addr_string_fill(addr_string, tn->own_addr), tn->net_id); return 0; @@ -238,7 +238,7 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) return -EPERM; addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); - if (!tipc_addr_node_valid(addr)) + if (!addr) return -EINVAL; tipc_net_start(net, addr); diff --git a/net/tipc/node.c b/net/tipc/node.c index 389193d7cf67..8a4b04933ecc 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -233,9 +233,6 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) struct tipc_node *node; unsigned int thash = tipc_hashfn(addr); - if (unlikely(!in_own_cluster_exact(net, addr))) - return NULL; - rcu_read_lock(); hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { if (node->addr != addr) @@ -836,10 +833,9 @@ void tipc_node_check_dest(struct net *net, u32 onode, /* Now create new link if not already existing */ if (!l) { - if (n->link_cnt == 2) { - pr_warn("Cannot establish 3rd link to %x\n", n->addr); + if (n->link_cnt == 2) goto exit; - } + if_name = strchr(b->name, ':') + 1; if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, diff --git a/net/tipc/node.h b/net/tipc/node.h index 4ce5e3a185c0..5fb38cf0bb5c 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -49,13 +49,14 @@ enum { TIPC_BCAST_STATE_NACK = (1 << 2), TIPC_BLOCK_FLOWCTL = (1 << 3), TIPC_BCAST_RCAST = (1 << 4), - TIPC_MCAST_GROUPS = (1 << 5) + TIPC_NODE_ID32 = (1 << 5) }; #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ TIPC_BCAST_STATE_NACK | \ TIPC_BCAST_RCAST | \ - TIPC_BLOCK_FLOWCTL) + TIPC_BLOCK_FLOWCTL | \ + TIPC_NODE_ID32) #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); -- cgit v1.2.3-70-g09d2 From b89afb116ca2830cc982624f93e888860868a84b Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 22 Mar 2018 20:42:48 +0100 Subject: tipc: allow closest-first lookup algorithm when legacy address is configured The removal of an internal structure of the node address has an unwanted side effect. - Currently, if a user is sending an anycast message with destination domain 0, the tipc_namebl_translate() function will use the 'closest- first' algorithm to first look for a node local destination, and only when no such is found, will it resort to the cluster global 'round- robin' lookup algorithm. - Current users can get around this, and enforce unconditional use of global round-robin by indicating a destination as Z.0.0 or Z.C.0. - This option disappears when we make the node address flat, since the lookup algorithm has no way of recognizing this case. So, as long as there are node local destinations, the algorithm will always select one of those, and there is nothing the sender can do to change this. We solve this by eliminating the 'closest-first' option, which was never a good idea anyway, for non-legacy users, but only for those. To distinguish between legacy users and non-legacy users we introduce a new flag 'legacy_addr_format' in struct tipc_core, to be set when the user configures a legacy-style Z.C.N node address. Hence, when a legacy user indicates a zero lookup domain 'closest-first' is selected, and in all other cases we use 'round-robin'. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/addr.c | 12 +++++++----- net/tipc/addr.h | 2 +- net/tipc/core.h | 3 ++- net/tipc/discover.c | 13 ++++++------- net/tipc/name_table.c | 8 +++++--- net/tipc/net.c | 2 +- 6 files changed, 22 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/tipc/addr.c b/net/tipc/addr.c index dfc31a730ca5..19987994704f 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -48,15 +48,17 @@ int in_own_node(struct net *net, u32 addr) return (addr == tn->own_addr) || !addr; } -int tipc_in_scope(u32 domain, u32 addr) +bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) { if (!domain || (domain == addr)) - return 1; + return true; + if (!legacy_format) + return false; if (domain == tipc_cluster_mask(addr)) /* domain */ - return 1; + return true; if (domain == (addr & TIPC_ZONE_MASK)) /* domain */ - return 1; - return 0; + return true; + return false; } char *tipc_addr_string_fill(char *string, u32 addr) diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 5ffde51b0e68..97bdc0e1a369 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -67,7 +67,7 @@ static inline int tipc_scope2node(struct net *net, int sc) u32 tipc_own_addr(struct net *net); int in_own_node(struct net *net, u32 addr); -int tipc_in_scope(u32 domain, u32 addr); +bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr); char *tipc_addr_string_fill(char *string, u32 addr); #endif diff --git a/net/tipc/core.h b/net/tipc/core.h index 347f850dc872..bd2b112680a4 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -1,7 +1,7 @@ /* * net/tipc/core.h: Include file for TIPC global declarations * - * Copyright (c) 2005-2006, 2013 Ericsson AB + * Copyright (c) 2005-2006, 2013-2018 Ericsson AB * Copyright (c) 2005-2007, 2010-2013, Wind River Systems * All rights reserved. * @@ -81,6 +81,7 @@ struct tipc_net { u32 own_addr; int net_id; int random; + bool legacy_addr_format; /* Node table and node list */ spinlock_t node_list_lock; diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 669af125b3de..82556e19222d 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -139,6 +139,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, struct tipc_net *tn = tipc_net(net); struct tipc_msg *hdr = buf_msg(skb); u16 caps = msg_node_capabilities(hdr); + bool legacy = tn->legacy_addr_format; u32 signature = msg_node_sig(hdr); u32 dst = msg_dest_domain(hdr); u32 net_id = msg_bc_netid(hdr); @@ -165,13 +166,11 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, disc_dupl_alert(b, self, &maddr); return; } - /* Domain filter only works if both peers use legacy address format */ - if (b->domain) { - if (!tipc_in_scope(dst, self)) - return; - if (!tipc_in_scope(b->domain, src)) - return; - } + if (!tipc_in_scope(legacy, dst, self)) + return; + if (!tipc_in_scope(legacy, b->domain, src)) + return; + tipc_node_check_dest(net, src, b, caps, signature, &maddr, &respond, &dupl_addr); if (dupl_addr) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index bbbfc0702634..7478acb39096 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -499,7 +499,9 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *destnode) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); + bool legacy = tn->legacy_addr_format; + u32 self = tipc_own_addr(net); struct sub_seq *sseq; struct name_info *info; struct publication *publ; @@ -507,7 +509,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 port = 0; u32 node = 0; - if (!tipc_in_scope(*destnode, tn->own_addr)) + if (!tipc_in_scope(legacy, *destnode, self)) return 0; rcu_read_lock(); @@ -521,7 +523,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, info = sseq->info; /* Closest-First Algorithm */ - if (likely(!*destnode)) { + if (legacy && !*destnode) { if (!list_empty(&info->local_publ)) { publ = list_first_entry(&info->local_publ, struct publication, diff --git a/net/tipc/net.c b/net/tipc/net.c index a074f285e6ea..eb0d7a352e3f 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -240,7 +240,7 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); if (!addr) return -EINVAL; - + tn->legacy_addr_format = true; tipc_net_start(net, addr); } -- cgit v1.2.3-70-g09d2 From 23fd3eace088ab1872ee59c19191a119ec779ac9 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 22 Mar 2018 20:42:49 +0100 Subject: tipc: remove direct accesses to own_addr field in struct tipc_net As a preparation to changing the addressing structure of TIPC we replace all direct accesses to the tipc_net::own_addr field with the function dedicated for this, tipc_own_addr(). There are no changes to program logics in this commit. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/addr.c | 6 +++--- net/tipc/addr.h | 2 +- net/tipc/discover.c | 3 ++- net/tipc/link.c | 9 ++++----- net/tipc/name_distr.c | 11 ++++++----- net/tipc/name_table.c | 6 +++--- net/tipc/net.c | 31 +++++++++++++------------------ net/tipc/socket.c | 23 ++++++++++------------- 8 files changed, 42 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/tipc/addr.c b/net/tipc/addr.c index 19987994704f..6e06b4d981f1 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -43,9 +43,7 @@ */ int in_own_node(struct net *net, u32 addr) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - - return (addr == tn->own_addr) || !addr; + return addr == tipc_own_addr(net) || !addr; } bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) @@ -56,6 +54,8 @@ bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) return false; if (domain == tipc_cluster_mask(addr)) /* domain */ return true; + if (domain == (addr & TIPC_ZONE_CLUSTER_MASK)) /* domain */ + return true; if (domain == (addr & TIPC_ZONE_MASK)) /* domain */ return true; return false; diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 97bdc0e1a369..6b48f0dc0205 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -45,7 +45,7 @@ static inline u32 tipc_own_addr(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); return tn->own_addr; } diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 82556e19222d..94d524018ca5 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -81,11 +81,12 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, u32 mtyp, struct tipc_bearer *b) { struct tipc_net *tn = tipc_net(net); + u32 self = tipc_own_addr(net); u32 dest_domain = b->domain; struct tipc_msg *hdr; hdr = buf_msg(skb); - tipc_msg_init(tn->own_addr, hdr, LINK_CONFIG, mtyp, + tipc_msg_init(self, hdr, LINK_CONFIG, mtyp, MAX_H_SIZE, dest_domain); msg_set_non_seq(hdr, 1); msg_set_node_sig(hdr, tn->random); diff --git a/net/tipc/link.c b/net/tipc/link.c index 86fde005ea47..4aa56e3bf4fc 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1936,11 +1936,11 @@ msg_full: int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, struct tipc_link *link, int nlflags) { - int err; - void *hdr; + u32 self = tipc_own_addr(net); struct nlattr *attrs; struct nlattr *prop; - struct tipc_net *tn = net_generic(net, tipc_net_id); + void *hdr; + int err; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, nlflags, TIPC_NL_LINK_GET); @@ -1953,8 +1953,7 @@ int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, link->name)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_DEST, - tipc_cluster_mask(tn->own_addr))) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_DEST, tipc_cluster_mask(self))) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu)) goto attr_msg_full; diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 28d095a7d8bb..7e571f4f47bc 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -68,14 +68,14 @@ static void publ_to_item(struct distr_item *i, struct publication *p) static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, u32 dest) { - struct tipc_net *tn = net_generic(net, tipc_net_id); struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC); + u32 self = tipc_own_addr(net); struct tipc_msg *msg; if (buf != NULL) { msg = buf_msg(buf); - tipc_msg_init(tn->own_addr, msg, NAME_DISTRIBUTOR, type, - INT_H_SIZE, dest); + tipc_msg_init(self, msg, NAME_DISTRIBUTOR, + type, INT_H_SIZE, dest); msg_set_size(msg, INT_H_SIZE + size); } return buf; @@ -382,13 +382,14 @@ void tipc_named_reinit(struct net *net) struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct publication *publ; + u32 self = tipc_own_addr(net); spin_lock_bh(&tn->nametbl_lock); list_for_each_entry_rcu(publ, &nt->node_scope, binding_node) - publ->node = tn->own_addr; + publ->node = self; list_for_each_entry_rcu(publ, &nt->cluster_scope, binding_node) - publ->node = tn->own_addr; + publ->node = self; spin_unlock_bh(&tn->nametbl_lock); } diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 7478acb39096..4359605b1bec 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -540,7 +540,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, } /* Round-Robin Algorithm */ - else if (*destnode == tn->own_addr) { + else if (*destnode == tipc_own_addr(net)) { if (list_empty(&info->local_publ)) goto no_match; publ = list_first_entry(&info->local_publ, struct publication, @@ -713,7 +713,7 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, } publ = tipc_nametbl_insert_publ(net, type, lower, upper, scope, - tn->own_addr, port_ref, key); + tipc_own_addr(net), port_ref, key); if (likely(publ)) { tn->nametbl->local_publ_count++; buf = tipc_named_publish(net, publ); @@ -738,7 +738,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 port, struct tipc_net *tn = net_generic(net, tipc_net_id); spin_lock_bh(&tn->nametbl_lock); - publ = tipc_nametbl_remove_publ(net, type, lower, tn->own_addr, + publ = tipc_nametbl_remove_publ(net, type, lower, tipc_own_addr(net), port, key); if (likely(publ)) { tn->nametbl->local_publ_count--; diff --git a/net/tipc/net.c b/net/tipc/net.c index eb0d7a352e3f..7f140a5308ee 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -106,7 +106,7 @@ int tipc_net_start(struct net *net, u32 addr) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); char addr_string[16]; tn->own_addr = addr; @@ -117,25 +117,24 @@ int tipc_net_start(struct net *net, u32 addr) tipc_named_reinit(net); tipc_sk_reinit(net); - tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr, - TIPC_CLUSTER_SCOPE, 0, tn->own_addr); + tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, + TIPC_CLUSTER_SCOPE, 0, addr); pr_info("Started in network mode\n"); pr_info("Own node address %s, cluster identity %u\n", - tipc_addr_string_fill(addr_string, tn->own_addr), + tipc_addr_string_fill(addr_string, addr), tn->net_id); return 0; } void tipc_net_stop(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + u32 self = tipc_own_addr(net); - if (!tn->own_addr) + if (!self) return; - tipc_nametbl_withdraw(net, TIPC_CFG_SRV, tn->own_addr, 0, - tn->own_addr); + tipc_nametbl_withdraw(net, TIPC_CFG_SRV, self, 0, self); rtnl_lock(); tipc_bearer_stop(net); tipc_node_stop(net); @@ -202,9 +201,9 @@ out: int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) { - struct net *net = sock_net(skb->sk); - struct tipc_net *tn = net_generic(net, tipc_net_id); struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; + struct net *net = sock_net(skb->sk); + struct tipc_net *tn = tipc_net(net); int err; if (!info->attrs[TIPC_NLA_NET]) @@ -216,13 +215,13 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) if (err) return err; + /* Can't change net id once TIPC has joined a network */ + if (tipc_own_addr(net)) + return -EPERM; + if (attrs[TIPC_NLA_NET_ID]) { u32 val; - /* Can't change net id once TIPC has joined a network */ - if (tn->own_addr) - return -EPERM; - val = nla_get_u32(attrs[TIPC_NLA_NET_ID]); if (val < 1 || val > 9999) return -EINVAL; @@ -233,10 +232,6 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) if (attrs[TIPC_NLA_NET_ADDR]) { u32 addr; - /* Can't change net addr once TIPC has joined a network */ - if (tn->own_addr) - return -EPERM; - addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); if (!addr) return -EINVAL; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 732ec894f69f..275b666f6231 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -289,10 +289,9 @@ static bool tipc_sk_type_connectionless(struct sock *sk) static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg) { struct sock *sk = &tsk->sk; - struct tipc_net *tn = net_generic(sock_net(sk), tipc_net_id); + u32 self = tipc_own_addr(sock_net(sk)); u32 peer_port = tsk_peer_port(tsk); - u32 orig_node; - u32 peer_node; + u32 orig_node, peer_node; if (unlikely(!tipc_sk_connected(sk))) return false; @@ -306,10 +305,10 @@ static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg) if (likely(orig_node == peer_node)) return true; - if (!orig_node && (peer_node == tn->own_addr)) + if (!orig_node && peer_node == self) return true; - if (!peer_node && (orig_node == tn->own_addr)) + if (!peer_node && orig_node == self) return true; return false; @@ -461,8 +460,8 @@ static int tipc_sk_create(struct net *net, struct socket *sock, /* Ensure tsk is visible before we read own_addr. */ smp_mb(); - tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG, - NAMED_H_SIZE, 0); + tipc_msg_init(tipc_own_addr(net), msg, TIPC_LOW_IMPORTANCE, + TIPC_NAMED_MSG, NAMED_H_SIZE, 0); msg_set_origport(msg, tsk->portid); timer_setup(&sk->sk_timer, tipc_sk_timeout, 0); @@ -671,7 +670,6 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr; struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_net *tn = net_generic(sock_net(sock->sk), tipc_net_id); memset(addr, 0, sizeof(*addr)); if (peer) { @@ -682,7 +680,7 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr, addr->addr.id.node = tsk_peer_node(tsk); } else { addr->addr.id.ref = tsk->portid; - addr->addr.id.node = tn->own_addr; + addr->addr.id.node = tipc_own_addr(sock_net(sk)); } addr->addrtype = TIPC_ADDR_ID; @@ -2667,8 +2665,8 @@ void tipc_sk_reinit(struct net *net) while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) { spin_lock_bh(&tsk->sk.sk_lock.slock); msg = &tsk->phdr; - msg_set_prevnode(msg, tn->own_addr); - msg_set_orignode(msg, tn->own_addr); + msg_set_prevnode(msg, tipc_own_addr(net)); + msg_set_orignode(msg, tipc_own_addr(net)); spin_unlock_bh(&tsk->sk.sk_lock.slock); } @@ -3167,11 +3165,10 @@ static int __tipc_nl_add_sk_info(struct sk_buff *skb, struct tipc_sock *tsk) { struct net *net = sock_net(skb->sk); - struct tipc_net *tn = tipc_net(net); struct sock *sk = &tsk->sk; if (nla_put_u32(skb, TIPC_NLA_SOCK_REF, tsk->portid) || - nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tn->own_addr)) + nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tipc_own_addr(net))) return -EMSGSIZE; if (tipc_sk_connected(sk)) { -- cgit v1.2.3-70-g09d2 From d50ccc2d3909fc1b4d40e4af16b026f05dc68707 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 22 Mar 2018 20:42:50 +0100 Subject: tipc: add 128-bit node identifier We add a 128-bit node identity, as an alternative to the currently used 32-bit node address. For the sake of compatibility and to minimize message header changes we retain the existing 32-bit address field. When not set explicitly by the user, this field will be filled with a hash value generated from the much longer node identity, and be used as a shorthand value for the latter. We permit either the address or the identity to be set by configuration, but not both, so when the address value is set by a legacy user the corresponding 128-bit node identity is generated based on the that value. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 2 + net/tipc/addr.c | 81 ++++++++++++++++++++++++++++++++------- net/tipc/addr.h | 28 +++++++++++--- net/tipc/core.c | 4 +- net/tipc/core.h | 6 ++- net/tipc/discover.c | 4 +- net/tipc/link.c | 6 ++- net/tipc/name_distr.c | 6 +-- net/tipc/net.c | 51 ++++++++++++++++-------- net/tipc/net.h | 4 +- net/tipc/node.c | 8 +--- net/tipc/node.h | 4 +- 12 files changed, 148 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index d896ded51bcb..0affb682e5e3 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -169,6 +169,8 @@ enum { TIPC_NLA_NET_UNSPEC, TIPC_NLA_NET_ID, /* u32 */ TIPC_NLA_NET_ADDR, /* u32 */ + TIPC_NLA_NET_NODEID, /* u64 */ + TIPC_NLA_NET_NODEID_W1, /* u64 */ __TIPC_NLA_NET_MAX, TIPC_NLA_NET_MAX = __TIPC_NLA_NET_MAX - 1 diff --git a/net/tipc/addr.c b/net/tipc/addr.c index 6e06b4d981f1..4841e98591d0 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -1,7 +1,7 @@ /* * net/tipc/addr.c: TIPC address utility routines * - * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -34,18 +34,9 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include #include "addr.h" #include "core.h" -/** - * in_own_node - test for node inclusion; <0.0.0> always matches - */ -int in_own_node(struct net *net, u32 addr) -{ - return addr == tipc_own_addr(net) || !addr; -} - bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) { if (!domain || (domain == addr)) @@ -61,9 +52,71 @@ bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) return false; } -char *tipc_addr_string_fill(char *string, u32 addr) +void tipc_set_node_id(struct net *net, u8 *id) +{ + struct tipc_net *tn = tipc_net(net); + u32 *tmp = (u32 *)id; + + memcpy(tn->node_id, id, NODE_ID_LEN); + tipc_nodeid2string(tn->node_id_string, id); + tn->node_addr = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]; + pr_info("Own node identity %s, cluster identity %u\n", + tipc_own_id_string(net), tn->net_id); +} + +void tipc_set_node_addr(struct net *net, u32 addr) { - snprintf(string, 16, "<%u.%u.%u>", - tipc_zone(addr), tipc_cluster(addr), tipc_node(addr)); - return string; + struct tipc_net *tn = tipc_net(net); + u8 node_id[NODE_ID_LEN] = {0,}; + + tn->node_addr = addr; + if (!tipc_own_id(net)) { + sprintf(node_id, "%x", addr); + tipc_set_node_id(net, node_id); + } + pr_info("32-bit node address hash set to %x\n", addr); +} + +char *tipc_nodeid2string(char *str, u8 *id) +{ + int i; + u8 c; + + /* Already a string ? */ + for (i = 0; i < NODE_ID_LEN; i++) { + c = id[i]; + if (c >= '0' && c <= '9') + continue; + if (c >= 'A' && c <= 'Z') + continue; + if (c >= 'a' && c <= 'z') + continue; + if (c == '.') + continue; + if (c == ':') + continue; + if (c == '_') + continue; + if (c == '-') + continue; + if (c == '@') + continue; + if (c != 0) + break; + } + if (i == NODE_ID_LEN) { + memcpy(str, id, NODE_ID_LEN); + str[NODE_ID_LEN] = 0; + return str; + } + + /* Translate to hex string */ + for (i = 0; i < NODE_ID_LEN; i++) + sprintf(&str[2 * i], "%02x", id[i]); + + /* Strip off trailing zeroes */ + for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--) + str[i] = 0; + + return str; } diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 6b48f0dc0205..31bee0ea7b3e 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -1,7 +1,7 @@ /* * net/tipc/addr.h: Include file for TIPC address utility routines * - * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, Wind River Systems * All rights reserved. * @@ -44,10 +44,22 @@ #include "core.h" static inline u32 tipc_own_addr(struct net *net) +{ + return tipc_net(net)->node_addr; +} + +static inline u8 *tipc_own_id(struct net *net) { struct tipc_net *tn = tipc_net(net); - return tn->own_addr; + if (!strlen(tn->node_id_string)) + return NULL; + return tn->node_id; +} + +static inline char *tipc_own_id_string(struct net *net) +{ + return tipc_net(net)->node_id_string; } static inline u32 tipc_cluster_mask(u32 addr) @@ -65,9 +77,15 @@ static inline int tipc_scope2node(struct net *net, int sc) return sc != TIPC_NODE_SCOPE ? 0 : tipc_own_addr(net); } -u32 tipc_own_addr(struct net *net); -int in_own_node(struct net *net, u32 addr); +static inline int in_own_node(struct net *net, u32 addr) +{ + return addr == tipc_own_addr(net) || !addr; +} + bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr); -char *tipc_addr_string_fill(char *string, u32 addr); +void tipc_set_node_id(struct net *net, u8 *id); +void tipc_set_node_addr(struct net *net, u32 addr); +char *tipc_nodeid2string(char *str, u8 *id); +u32 tipc_node_id2hash(u8 *id128); #endif diff --git a/net/tipc/core.c b/net/tipc/core.c index 04fd91bb11d7..e92fed49e095 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -56,7 +56,9 @@ static int __net_init tipc_init_net(struct net *net) int err; tn->net_id = 4711; - tn->own_addr = 0; + tn->node_addr = 0; + memset(tn->node_id, 0, sizeof(tn->node_id)); + memset(tn->node_id_string, 0, sizeof(tn->node_id_string)); tn->mon_threshold = TIPC_DEF_MON_THRESHOLD; get_random_bytes(&tn->random, sizeof(int)); INIT_LIST_HEAD(&tn->node_list); diff --git a/net/tipc/core.h b/net/tipc/core.h index bd2b112680a4..eabad41cc832 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -72,13 +72,17 @@ struct tipc_monitor; #define NODE_HTABLE_SIZE 512 #define MAX_BEARERS 3 #define TIPC_DEF_MON_THRESHOLD 32 +#define NODE_ID_LEN 16 +#define NODE_ID_STR_LEN (NODE_ID_LEN * 2 + 1) extern unsigned int tipc_net_id __read_mostly; extern int sysctl_tipc_rmem[3] __read_mostly; extern int sysctl_tipc_named_timeout __read_mostly; struct tipc_net { - u32 own_addr; + u8 node_id[NODE_ID_LEN]; + u32 node_addr; + char node_id_string[NODE_ID_STR_LEN]; int net_id; int random; bool legacy_addr_format; diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 94d524018ca5..b4c4cd176b9b 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -118,13 +118,11 @@ static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, u32 src, static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr, struct tipc_media_addr *media_addr) { - char node_addr_str[16]; char media_addr_str[64]; - tipc_addr_string_fill(node_addr_str, node_addr); tipc_media_addr_printf(media_addr_str, sizeof(media_addr_str), media_addr); - pr_warn("Duplicate %s using %s seen on <%s>\n", node_addr_str, + pr_warn("Duplicate %x using %s seen on <%s>\n", node_addr, media_addr_str, b->name); } diff --git a/net/tipc/link.c b/net/tipc/link.c index 4aa56e3bf4fc..bcd76b1e440c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -442,6 +442,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, struct sk_buff_head *namedq, struct tipc_link **link) { + char *self_str = tipc_own_id_string(net); struct tipc_link *l; l = kzalloc(sizeof(*l), GFP_ATOMIC); @@ -451,7 +452,10 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, l->session = session; /* Note: peer i/f name is completed by reset/activate message */ - sprintf(l->name, "%x:%s-%x:unknown", self, if_name, peer); + if (strlen(self_str) > 16) + sprintf(l->name, "%x:%s-%x:unknown", self, if_name, peer); + else + sprintf(l->name, "%s:%s-%x:unknown", self_str, if_name, peer); strcpy(l->if_name, if_name); l->addr = peer; l->peer_caps = peer_caps; diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 7e571f4f47bc..8240a85b0d0c 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -318,7 +318,6 @@ void tipc_named_process_backlog(struct net *net) { struct distr_queue_item *e, *tmp; struct tipc_net *tn = net_generic(net, tipc_net_id); - char addr[16]; unsigned long now = get_jiffies_64(); list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) { @@ -326,12 +325,11 @@ void tipc_named_process_backlog(struct net *net) if (!tipc_update_nametbl(net, &e->i, e->node, e->dtype)) continue; } else { - tipc_addr_string_fill(addr, e->node); - pr_warn_ratelimited("Dropping name table update (%d) of {%u, %u, %u} from %s key=%u\n", + pr_warn_ratelimited("Dropping name table update (%d) of {%u, %u, %u} from %x key=%u\n", e->dtype, ntohl(e->i.type), ntohl(e->i.lower), ntohl(e->i.upper), - addr, ntohl(e->i.key)); + e->node, ntohl(e->i.key)); } list_del(&e->next); kfree(e); diff --git a/net/tipc/net.c b/net/tipc/net.c index 7f140a5308ee..e78674891166 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -104,27 +104,31 @@ * - A local spin_lock protecting the queue of subscriber events. */ -int tipc_net_start(struct net *net, u32 addr) +int tipc_net_init(struct net *net, u8 *node_id, u32 addr) { - struct tipc_net *tn = tipc_net(net); - char addr_string[16]; + if (tipc_own_id(net)) { + pr_info("Cannot configure node identity twice\n"); + return -1; + } + pr_info("Started in network mode\n"); - tn->own_addr = addr; + if (node_id) { + tipc_set_node_id(net, node_id); + tipc_net_finalize(net, tipc_own_addr(net)); + } + if (addr) + tipc_net_finalize(net, addr); + return 0; +} - /* Ensure that the new address is visible before we reinit. */ +void tipc_net_finalize(struct net *net, u32 addr) +{ + tipc_set_node_addr(net, addr); smp_mb(); - tipc_named_reinit(net); tipc_sk_reinit(net); - tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, TIPC_CLUSTER_SCOPE, 0, addr); - - pr_info("Started in network mode\n"); - pr_info("Own node address %s, cluster identity %u\n", - tipc_addr_string_fill(addr_string, addr), - tn->net_id); - return 0; } void tipc_net_stop(struct net *net) @@ -146,8 +150,10 @@ void tipc_net_stop(struct net *net) static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg) { struct tipc_net *tn = net_generic(net, tipc_net_id); - void *hdr; + u64 *w0 = (u64 *)&tn->node_id[0]; + u64 *w1 = (u64 *)&tn->node_id[8]; struct nlattr *attrs; + void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NET_GET); @@ -160,7 +166,10 @@ static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg) if (nla_put_u32(msg->skb, TIPC_NLA_NET_ID, tn->net_id)) goto attr_msg_full; - + if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID, *w0, 0)) + goto attr_msg_full; + if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID_W1, *w1, 0)) + goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); @@ -212,6 +221,7 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX, info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, info->extack); + if (err) return err; @@ -236,9 +246,18 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) if (!addr) return -EINVAL; tn->legacy_addr_format = true; - tipc_net_start(net, addr); + tipc_net_init(net, NULL, addr); } + if (attrs[TIPC_NLA_NET_NODEID]) { + u8 node_id[NODE_ID_LEN]; + u64 *w0 = (u64 *)&node_id[0]; + u64 *w1 = (u64 *)&node_id[8]; + + *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); + *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); + tipc_net_init(net, node_id, 0); + } return 0; } diff --git a/net/tipc/net.h b/net/tipc/net.h index c0306aa2374b..08efa6010022 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -41,10 +41,8 @@ extern const struct nla_policy tipc_nl_net_policy[]; -int tipc_net_start(struct net *net, u32 addr); - +void tipc_net_finalize(struct net *net, u32 addr); void tipc_net_stop(struct net *net); - int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); diff --git a/net/tipc/node.c b/net/tipc/node.c index 8a4b04933ecc..7b0c99347406 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -883,11 +883,9 @@ void tipc_node_delete_links(struct net *net, int bearer_id) static void tipc_node_reset_links(struct tipc_node *n) { - char addr_string[16]; int i; - pr_warn("Resetting all links to %s\n", - tipc_addr_string_fill(addr_string, n->addr)); + pr_warn("Resetting all links to %x\n", n->addr); for (i = 0; i < MAX_BEARERS; i++) { tipc_node_link_down(n, i, false); @@ -1074,15 +1072,13 @@ illegal_evt: static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq) { - char addr_string[16]; struct tipc_sock_conn *conn, *safe; struct tipc_link *l; struct list_head *conns = &n->conn_sks; struct sk_buff *skb; uint i; - pr_debug("Lost contact with %s\n", - tipc_addr_string_fill(addr_string, n->addr)); + pr_debug("Lost contact with %x\n", n->addr); /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); diff --git a/net/tipc/node.h b/net/tipc/node.h index 5fb38cf0bb5c..e06faf4fa55e 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -49,14 +49,14 @@ enum { TIPC_BCAST_STATE_NACK = (1 << 2), TIPC_BLOCK_FLOWCTL = (1 << 3), TIPC_BCAST_RCAST = (1 << 4), - TIPC_NODE_ID32 = (1 << 5) + TIPC_NODE_ID128 = (1 << 5) }; #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ TIPC_BCAST_STATE_NACK | \ TIPC_BCAST_RCAST | \ TIPC_BLOCK_FLOWCTL | \ - TIPC_NODE_ID32) + TIPC_NODE_ID128) #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); -- cgit v1.2.3-70-g09d2 From 25b0b9c4e835ffaa65b61c3efe2e28acf84d0259 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 22 Mar 2018 20:42:51 +0100 Subject: tipc: handle collisions of 32-bit node address hash values When a 32-bit node address is generated from a 128-bit identifier, there is a risk of collisions which must be discovered and handled. We do this as follows: - We don't apply the generated address immediately to the node, but do instead initiate a 1 sec trial period to allow other cluster members to discover and handle such collisions. - During the trial period the node periodically sends out a new type of message, DSC_TRIAL_MSG, using broadcast or emulated broadcast, to all the other nodes in the cluster. - When a node is receiving such a message, it must check that the presented 32-bit identifier either is unused, or was used by the very same peer in a previous session. In both cases it accepts the request by not responding to it. - If it finds that the same node has been up before using a different address, it responds with a DSC_TRIAL_FAIL_MSG containing that address. - If it finds that the address has already been taken by some other node, it generates a new, unused address and returns it to the requester. - During the trial period the requesting node must always be prepared to accept a failure message, i.e., a message where a peer suggests a different (or equal) address to the one tried. In those cases it must apply the suggested value as trial address and restart the trial period. This algorithm ensures that in the vast majority of cases a node will have the same address before and after a reboot. If a legacy user configures the address explicitly, there will be no trial period and messages, so this protocol addition is completely backwards compatible. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/addr.c | 3 +- net/tipc/bearer.c | 3 +- net/tipc/core.c | 2 + net/tipc/core.h | 2 + net/tipc/discover.c | 126 ++++++++++++++++++++++++++++++++++++++++++++-------- net/tipc/link.c | 26 +++++++---- net/tipc/link.h | 4 +- net/tipc/msg.h | 23 +++++++++- net/tipc/net.c | 4 +- net/tipc/node.c | 85 ++++++++++++++++++++++++++++++++--- net/tipc/node.h | 3 +- 11 files changed, 236 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/tipc/addr.c b/net/tipc/addr.c index 4841e98591d0..b88d48d00913 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -59,7 +59,7 @@ void tipc_set_node_id(struct net *net, u8 *id) memcpy(tn->node_id, id, NODE_ID_LEN); tipc_nodeid2string(tn->node_id_string, id); - tn->node_addr = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]; + tn->trial_addr = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]; pr_info("Own node identity %s, cluster identity %u\n", tipc_own_id_string(net), tn->net_id); } @@ -74,6 +74,7 @@ void tipc_set_node_addr(struct net *net, u32 addr) sprintf(node_id, "%x", addr); tipc_set_node_id(net, node_id); } + tn->trial_addr = addr; pr_info("32-bit node address hash set to %x\n", addr); } diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index a71f31879cb3..ae5b44ca1c1e 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -235,7 +235,6 @@ static int tipc_enable_bearer(struct net *net, const char *name, { struct tipc_net *tn = tipc_net(net); struct tipc_bearer_names b_names; - u32 self = tipc_own_addr(net); int with_this_prio = 1; struct tipc_bearer *b; struct tipc_media *m; @@ -244,7 +243,7 @@ static int tipc_enable_bearer(struct net *net, const char *name, int res = -EINVAL; char *errstr = ""; - if (!self) { + if (!tipc_own_id(net)) { errstr = "not supported in standalone mode"; res = -ENOPROTOOPT; goto rejected; diff --git a/net/tipc/core.c b/net/tipc/core.c index e92fed49e095..52dfc51ac4d5 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -57,6 +57,8 @@ static int __net_init tipc_init_net(struct net *net) tn->net_id = 4711; tn->node_addr = 0; + tn->trial_addr = 0; + tn->addr_trial_end = 0; memset(tn->node_id, 0, sizeof(tn->node_id)); memset(tn->node_id_string, 0, sizeof(tn->node_id_string)); tn->mon_threshold = TIPC_DEF_MON_THRESHOLD; diff --git a/net/tipc/core.h b/net/tipc/core.h index eabad41cc832..d0f64ca62d02 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -82,6 +82,8 @@ extern int sysctl_tipc_named_timeout __read_mostly; struct tipc_net { u8 node_id[NODE_ID_LEN]; u32 node_addr; + u32 trial_addr; + unsigned long addr_trial_end; char node_id_string[NODE_ID_STR_LEN]; int net_id; int random; diff --git a/net/tipc/discover.c b/net/tipc/discover.c index b4c4cd176b9b..e7655736abed 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -1,7 +1,7 @@ /* * net/tipc/discover.c * - * Copyright (c) 2003-2006, 2014-2015, Ericsson AB + * Copyright (c) 2003-2006, 2014-2018, Ericsson AB * Copyright (c) 2005-2006, 2010-2011, Wind River Systems * All rights reserved. * @@ -78,34 +78,40 @@ struct tipc_discoverer { * @b: ptr to bearer issuing message */ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, - u32 mtyp, struct tipc_bearer *b) + u32 mtyp, struct tipc_bearer *b) { struct tipc_net *tn = tipc_net(net); - u32 self = tipc_own_addr(net); u32 dest_domain = b->domain; struct tipc_msg *hdr; hdr = buf_msg(skb); - tipc_msg_init(self, hdr, LINK_CONFIG, mtyp, + tipc_msg_init(tn->trial_addr, hdr, LINK_CONFIG, mtyp, MAX_H_SIZE, dest_domain); + msg_set_size(hdr, MAX_H_SIZE + NODE_ID_LEN); msg_set_non_seq(hdr, 1); msg_set_node_sig(hdr, tn->random); msg_set_node_capabilities(hdr, TIPC_NODE_CAPABILITIES); msg_set_dest_domain(hdr, dest_domain); msg_set_bc_netid(hdr, tn->net_id); b->media->addr2msg(msg_media_addr(hdr), &b->addr); + msg_set_node_id(hdr, tipc_own_id(net)); } -static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, u32 src, +static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, + u32 src, u32 sugg_addr, struct tipc_media_addr *maddr, struct tipc_bearer *b) { + struct tipc_msg *hdr; struct sk_buff *skb; - skb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC); + skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); if (!skb) return; + hdr = buf_msg(skb); tipc_disc_init_msg(net, skb, mtyp, b); + msg_set_sugg_node_addr(hdr, sugg_addr); + msg_set_dest_domain(hdr, dst); tipc_bearer_xmit_skb(net, b->identity, skb, maddr); } @@ -126,6 +132,52 @@ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr, media_addr_str, b->name); } +/* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer + */ +bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, + struct tipc_media_addr *maddr, + struct tipc_bearer *b, + u32 dst, u32 src, + u32 sugg_addr, + u8 *peer_id, + int mtyp) +{ + struct net *net = d->net; + struct tipc_net *tn = tipc_net(net); + bool trial = time_before(jiffies, tn->addr_trial_end); + u32 self = tipc_own_addr(net); + + if (mtyp == DSC_TRIAL_FAIL_MSG) { + if (!trial) + return true; + + /* Ignore if somebody else already gave new suggestion */ + if (dst != tn->trial_addr) + return true; + + /* Otherwise update trial address and restart trial period */ + tn->trial_addr = sugg_addr; + msg_set_prevnode(buf_msg(d->skb), sugg_addr); + tn->addr_trial_end = jiffies + msecs_to_jiffies(1000); + return true; + } + + /* Apply trial address if we just left trial period */ + if (!trial && !self) { + tipc_net_finalize(net, tn->trial_addr); + msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); + } + + if (mtyp != DSC_TRIAL_MSG) + return false; + + sugg_addr = tipc_node_try_addr(net, peer_id, src); + if (sugg_addr) + tipc_disc_msg_xmit(net, DSC_TRIAL_FAIL_MSG, src, + self, sugg_addr, maddr, b); + return true; +} + /** * tipc_disc_rcv - handle incoming discovery message (request or response) * @net: applicable net namespace @@ -139,17 +191,27 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, struct tipc_msg *hdr = buf_msg(skb); u16 caps = msg_node_capabilities(hdr); bool legacy = tn->legacy_addr_format; + u32 sugg = msg_sugg_node_addr(hdr); u32 signature = msg_node_sig(hdr); + u8 peer_id[NODE_ID_LEN] = {0,}; u32 dst = msg_dest_domain(hdr); u32 net_id = msg_bc_netid(hdr); - u32 self = tipc_own_addr(net); struct tipc_media_addr maddr; u32 src = msg_prevnode(hdr); u32 mtyp = msg_type(hdr); bool dupl_addr = false; bool respond = false; + u32 self; int err; + skb_linearize(skb); + hdr = buf_msg(skb); + + if (caps & TIPC_NODE_ID128) + memcpy(peer_id, msg_node_id(hdr), NODE_ID_LEN); + else + sprintf(peer_id, "%x", src); + err = b->media->msg2addr(b, &maddr, msg_media_addr(hdr)); kfree_skb(skb); if (err || maddr.broadcast) { @@ -161,6 +223,12 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, return; if (net_id != tn->net_id) return; + if (tipc_disc_addr_trial_msg(b->disc, &maddr, b, dst, + src, sugg, peer_id, mtyp)) + return; + self = tipc_own_addr(net); + + /* Message from somebody using this node's address */ if (in_own_node(net, src)) { disc_dupl_alert(b, self, &maddr); return; @@ -169,8 +237,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, return; if (!tipc_in_scope(legacy, b->domain, src)) return; - - tipc_node_check_dest(net, src, b, caps, signature, + tipc_node_check_dest(net, src, peer_id, b, caps, signature, &maddr, &respond, &dupl_addr); if (dupl_addr) disc_dupl_alert(b, src, &maddr); @@ -178,7 +245,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, return; if (mtyp != DSC_REQ_MSG) return; - tipc_disc_msg_xmit(net, DSC_RESP_MSG, src, self, &maddr, b); + tipc_disc_msg_xmit(net, DSC_RESP_MSG, src, self, 0, &maddr, b); } /* tipc_disc_add_dest - increment set of discovered nodes @@ -216,9 +283,11 @@ void tipc_disc_remove_dest(struct tipc_discoverer *d) static void tipc_disc_timeout(struct timer_list *t) { struct tipc_discoverer *d = from_timer(d, t, timer); + struct tipc_net *tn = tipc_net(d->net); + u32 self = tipc_own_addr(d->net); struct tipc_media_addr maddr; struct sk_buff *skb = NULL; - struct net *net; + struct net *net = d->net; u32 bearer_id; spin_lock_bh(&d->lock); @@ -228,16 +297,29 @@ static void tipc_disc_timeout(struct timer_list *t) d->timer_intv = TIPC_DISC_INACTIVE; goto exit; } + + /* Did we just leave the address trial period ? */ + if (!self && !time_before(jiffies, tn->addr_trial_end)) { + self = tn->trial_addr; + tipc_net_finalize(net, self); + msg_set_prevnode(buf_msg(d->skb), self); + msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); + } + /* Adjust timeout interval according to discovery phase */ - d->timer_intv *= 2; - if (d->num_nodes && d->timer_intv > TIPC_DISC_SLOW) - d->timer_intv = TIPC_DISC_SLOW; - else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST) - d->timer_intv = TIPC_DISC_FAST; + if (time_before(jiffies, tn->addr_trial_end)) { + d->timer_intv = TIPC_DISC_INIT; + } else { + d->timer_intv *= 2; + if (d->num_nodes && d->timer_intv > TIPC_DISC_SLOW) + d->timer_intv = TIPC_DISC_SLOW; + else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST) + d->timer_intv = TIPC_DISC_FAST; + } + mod_timer(&d->timer, jiffies + d->timer_intv); memcpy(&maddr, &d->dest, sizeof(maddr)); skb = skb_clone(d->skb, GFP_ATOMIC); - net = d->net; bearer_id = d->bearer_id; exit: spin_unlock_bh(&d->lock); @@ -257,18 +339,24 @@ exit: int tipc_disc_create(struct net *net, struct tipc_bearer *b, struct tipc_media_addr *dest, struct sk_buff **skb) { + struct tipc_net *tn = tipc_net(net); struct tipc_discoverer *d; d = kmalloc(sizeof(*d), GFP_ATOMIC); if (!d) return -ENOMEM; - d->skb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC); + d->skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); if (!d->skb) { kfree(d); return -ENOMEM; } - tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); + + /* Do we need an address trial period first ? */ + if (!tipc_own_addr(net)) { + tn->addr_trial_end = jiffies + msecs_to_jiffies(1000); + msg_set_type(buf_msg(d->skb), DSC_TRIAL_MSG); + } memcpy(&d->dest, dest, sizeof(*dest)); d->net = net; d->bearer_id = b->identity; diff --git a/net/tipc/link.c b/net/tipc/link.c index bcd76b1e440c..1289b4ba404f 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -434,15 +434,16 @@ char *tipc_link_name(struct tipc_link *l) */ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 self, u32 peer, - u16 peer_caps, + int window, u32 session, u32 self, + u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link **link) { - char *self_str = tipc_own_id_string(net); + char peer_str[NODE_ID_STR_LEN] = {0,}; + char self_str[NODE_ID_STR_LEN] = {0,}; struct tipc_link *l; l = kzalloc(sizeof(*l), GFP_ATOMIC); @@ -451,11 +452,18 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, *link = l; l->session = session; - /* Note: peer i/f name is completed by reset/activate message */ - if (strlen(self_str) > 16) - sprintf(l->name, "%x:%s-%x:unknown", self, if_name, peer); - else - sprintf(l->name, "%s:%s-%x:unknown", self_str, if_name, peer); + /* Set link name for unicast links only */ + if (peer_id) { + tipc_nodeid2string(self_str, tipc_own_id(net)); + if (strlen(self_str) > 16) + sprintf(self_str, "%x", self); + tipc_nodeid2string(peer_str, peer_id); + if (strlen(peer_str) > 16) + sprintf(peer_str, "%x", peer); + } + /* Peer i/f name will be completed by reset/activate message */ + sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str); + strcpy(l->if_name, if_name); l->addr = peer; l->peer_caps = peer_caps; @@ -503,7 +511,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, struct tipc_link *l; if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, window, - 0, ownnode, peer, peer_caps, bc_sndlink, + 0, ownnode, peer, NULL, peer_caps, bc_sndlink, NULL, inputq, namedq, link)) return false; diff --git a/net/tipc/link.h b/net/tipc/link.h index d1bd1787a768..ec59348a81e8 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -73,8 +73,8 @@ enum { bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 ownnode, u32 peer, - u16 peer_caps, + int window, u32 session, u32 ownnode, + u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, struct sk_buff_head *inputq, diff --git a/net/tipc/msg.h b/net/tipc/msg.h index b4ba1b4f9ae7..a4e944d59394 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -550,6 +550,8 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) */ #define DSC_REQ_MSG 0 #define DSC_RESP_MSG 1 +#define DSC_TRIAL_MSG 2 +#define DSC_TRIAL_FAIL_MSG 3 /* * Group protocol message types @@ -627,7 +629,6 @@ static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n) msg_set_bits(m, 2, 0, 0xffff, n); } - /* * Word 4 */ @@ -925,6 +926,26 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } +static inline u32 msg_sugg_node_addr(struct tipc_msg *m) +{ + return msg_word(m, 14); +} + +static inline void msg_set_sugg_node_addr(struct tipc_msg *m, u32 n) +{ + msg_set_word(m, 14, n); +} + +static inline void msg_set_node_id(struct tipc_msg *hdr, u8 *id) +{ + memcpy(msg_data(hdr), id, 16); +} + +static inline u8 *msg_node_id(struct tipc_msg *hdr) +{ + return (u8 *)msg_data(hdr); +} + struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); bool tipc_msg_validate(struct sk_buff **_skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); diff --git a/net/tipc/net.c b/net/tipc/net.c index e78674891166..29538dc00857 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -112,10 +112,8 @@ int tipc_net_init(struct net *net, u8 *node_id, u32 addr) } pr_info("Started in network mode\n"); - if (node_id) { + if (node_id) tipc_set_node_id(net, node_id); - tipc_net_finalize(net, tipc_own_addr(net)); - } if (addr) tipc_net_finalize(net, addr); return 0; diff --git a/net/tipc/node.c b/net/tipc/node.c index 7b0c99347406..4a95c8c155c6 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -115,6 +115,7 @@ struct tipc_node { u16 capabilities; u32 signature; u32 link_id; + u8 peer_id[16]; struct list_head publ_list; struct list_head conn_sks; unsigned long keepalive_intv; @@ -156,6 +157,7 @@ static void tipc_node_delete(struct tipc_node *node); static void tipc_node_timeout(struct timer_list *t); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); +static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); static void tipc_node_put(struct tipc_node *node); static bool node_is_up(struct tipc_node *n); @@ -245,6 +247,30 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) return node; } +/* tipc_node_find_by_id - locate specified node object by its 128-bit id + * Note: this function is called only when a discovery request failed + * to find the node by its 32-bit id, and is not time critical + */ +static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_node *n; + bool found = false; + + rcu_read_lock(); + list_for_each_entry_rcu(n, &tn->node_list, list) { + read_lock_bh(&n->lock); + if (!memcmp(id, n->peer_id, 16) && + kref_get_unless_zero(&n->kref)) + found = true; + read_unlock_bh(&n->lock); + if (found) + break; + } + rcu_read_unlock(); + return found ? n : NULL; +} + static void tipc_node_read_lock(struct tipc_node *n) { read_lock_bh(&n->lock); @@ -307,7 +333,8 @@ static void tipc_node_write_unlock(struct tipc_node *n) } } -struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) +struct tipc_node *tipc_node_create(struct net *net, u32 addr, + u8 *peer_id, u16 capabilities) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n, *temp_node; @@ -326,6 +353,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) goto exit; } n->addr = addr; + memcpy(&n->peer_id, peer_id, 16); n->net = net; n->capabilities = capabilities; kref_init(&n->kref); @@ -344,8 +372,8 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; - if (!tipc_link_bc_create(net, tipc_own_addr(net), n->addr, - U16_MAX, + if (!tipc_link_bc_create(net, tipc_own_addr(net), + addr, U16_MAX, tipc_link_window(tipc_bc_sndlink(net)), n->capabilities, &n->bc_entry.inputq1, @@ -735,8 +763,51 @@ bool tipc_node_is_up(struct net *net, u32 addr) return retval; } -void tipc_node_check_dest(struct net *net, u32 onode, - struct tipc_bearer *b, +static u32 tipc_node_suggest_addr(struct net *net, u32 addr) +{ + struct tipc_node *n; + + addr ^= tipc_net(net)->random; + while ((n = tipc_node_find(net, addr))) { + tipc_node_put(n); + addr++; + } + return addr; +} + +/* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not + */ +u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_node *n; + + /* Suggest new address if some other peer is using this one */ + n = tipc_node_find(net, addr); + if (n) { + if (!memcmp(n->peer_id, id, NODE_ID_LEN)) + addr = 0; + tipc_node_put(n); + if (!addr) + return 0; + return tipc_node_suggest_addr(net, addr); + } + + /* Suggest previously used address if peer is known */ + n = tipc_node_find_by_id(net, id); + if (n) { + addr = n->addr; + tipc_node_put(n); + } + /* Even this node may be in trial phase */ + if (tn->trial_addr == addr) + return tipc_node_suggest_addr(net, addr); + + return addr; +} + +void tipc_node_check_dest(struct net *net, u32 addr, + u8 *peer_id, struct tipc_bearer *b, u16 capabilities, u32 signature, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) @@ -755,7 +826,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, *dupl_addr = false; *respond = false; - n = tipc_node_create(net, onode, capabilities); + n = tipc_node_create(net, addr, peer_id, capabilities); if (!n) return; @@ -840,7 +911,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, b->window, mod(tipc_net(net)->random), - tipc_own_addr(net), onode, + tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, &le->inputq, diff --git a/net/tipc/node.h b/net/tipc/node.h index e06faf4fa55e..f24b83500df1 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -60,7 +60,8 @@ enum { #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); -void tipc_node_check_dest(struct net *net, u32 onode, +u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr); +void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128, struct tipc_bearer *bearer, u16 capabilities, u32 signature, struct tipc_media_addr *maddr, -- cgit v1.2.3-70-g09d2 From 52dfae5c85a4c1078e9f1d5e8947d4a25f73dd81 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 22 Mar 2018 20:42:52 +0100 Subject: tipc: obtain node identity from interface by default Selecting and explicitly configuring a TIPC node identity may be unwanted in some cases. In this commit we introduce a default setting if the identity has not been set at the moment the first bearer is enabled. We do this by using a raw copy of a unique identifier from the used interface: MAC address in the case of an L2 bearer, IPv4/IPv6 address in the case of a UDP bearer. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bearer.c | 24 +++++++++++++++--------- net/tipc/net.h | 1 + net/tipc/udp_media.c | 13 +++++++++++++ 3 files changed, 29 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index ae5b44ca1c1e..f7d47c89d658 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -243,12 +243,6 @@ static int tipc_enable_bearer(struct net *net, const char *name, int res = -EINVAL; char *errstr = ""; - if (!tipc_own_id(net)) { - errstr = "not supported in standalone mode"; - res = -ENOPROTOOPT; - goto rejected; - } - if (!bearer_name_validate(name, &b_names)) { errstr = "illegal name"; goto rejected; @@ -381,11 +375,13 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b) int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, struct nlattr *attr[]) { + char *dev_name = strchr((const char *)b->name, ':') + 1; + int hwaddr_len = b->media->hwaddr_len; + u8 node_id[NODE_ID_LEN] = {0,}; struct net_device *dev; - char *driver_name = strchr((const char *)b->name, ':') + 1; /* Find device with specified name */ - dev = dev_get_by_name(net, driver_name); + dev = dev_get_by_name(net, dev_name); if (!dev) return -ENODEV; if (tipc_mtu_bad(dev, 0)) { @@ -393,6 +389,16 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, return -EINVAL; } + /* Autoconfigure own node identity if needed */ + if (!tipc_own_id(net) && hwaddr_len <= NODE_ID_LEN) { + memcpy(node_id, dev->dev_addr, hwaddr_len); + tipc_net_init(net, node_id, 0); + } + if (!tipc_own_id(net)) { + pr_warn("Failed to obtain node identity\n"); + return -EINVAL; + } + /* Associate TIPC bearer with L2 bearer */ rcu_assign_pointer(b->media_ptr, dev); b->pt.dev = dev; @@ -400,7 +406,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, b->pt.func = tipc_l2_rcv_msg; dev_add_pack(&b->pt); memset(&b->bcast_addr, 0, sizeof(b->bcast_addr)); - memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len); + memcpy(b->bcast_addr.value, dev->broadcast, hwaddr_len); b->bcast_addr.media_id = b->media->type_id; b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT; b->mtu = dev->mtu; diff --git a/net/tipc/net.h b/net/tipc/net.h index 08efa6010022..09ad02b50bb1 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -41,6 +41,7 @@ extern const struct nla_policy tipc_nl_net_policy[]; +int tipc_net_init(struct net *net, u8 *node_id, u32 addr); void tipc_net_finalize(struct net *net, u32 addr); void tipc_net_stop(struct net *net); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 3deabcab4882..2c13b18426d9 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -47,6 +47,8 @@ #include #include #include "core.h" +#include "addr.h" +#include "net.h" #include "bearer.h" #include "netlink.h" #include "msg.h" @@ -647,6 +649,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, struct udp_port_cfg udp_conf = {0}; struct udp_tunnel_sock_cfg tuncfg = {NULL}; struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; + u8 node_id[NODE_ID_LEN] = {0,}; ub = kzalloc(sizeof(*ub), GFP_ATOMIC); if (!ub) @@ -677,6 +680,16 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, if (err) goto err; + /* Autoconfigure own node identity if needed */ + if (!tipc_own_id(net)) { + memcpy(node_id, local.ipv6.in6_u.u6_addr8, 16); + tipc_net_init(net, node_id, 0); + } + if (!tipc_own_id(net)) { + pr_warn("Failed to set node id, please configure manually\n"); + return -EINVAL; + } + b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP; b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT; rcu_assign_pointer(b->media_ptr, ub); -- cgit v1.2.3-70-g09d2 From affaa0c724c14c914625647efe7b95dfbe8d08f2 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 23 Mar 2018 19:09:39 +0100 Subject: net/sched: remove tcf_idr_cleanup() tcf_idr_cleanup() is no more used, so remove it. Suggested-by: Cong Wang Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/net/act_api.h | 1 - net/sched/act_api.c | 8 -------- 2 files changed, 9 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index e0a9c2003b24..9e59ebfded62 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -149,7 +149,6 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats); -void tcf_idr_cleanup(struct tc_action *a, struct nlattr *est); void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a); int __tcf_idr_release(struct tc_action *a, bool bind, bool strict); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 57cf37145282..7bd1b964f021 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -296,14 +296,6 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, } EXPORT_SYMBOL(tcf_idr_check); -void tcf_idr_cleanup(struct tc_action *a, struct nlattr *est) -{ - if (est) - gen_kill_estimator(&a->tcfa_rate_est); - free_tcf(a); -} -EXPORT_SYMBOL(tcf_idr_cleanup); - int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats) -- cgit v1.2.3-70-g09d2 From 94cb5492409219ee3f9468616dd58af314029f76 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 23 Mar 2018 19:31:30 +0100 Subject: net/sched: act_vlan: declare push_vid with host byte order use u16 in place of __be16 to suppress the following sparse warnings: net/sched/act_vlan.c:150:26: warning: incorrect type in assignment (different base types) net/sched/act_vlan.c:150:26: expected restricted __be16 [usertype] push_vid net/sched/act_vlan.c:150:26: got unsigned short net/sched/act_vlan.c:151:21: warning: restricted __be16 degrades to integer net/sched/act_vlan.c:208:26: warning: incorrect type in assignment (different base types) net/sched/act_vlan.c:208:26: expected unsigned short [unsigned] [usertype] tcfv_push_vid net/sched/act_vlan.c:208:26: got restricted __be16 [usertype] push_vid Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_vlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 4595391c2129..41a66effeb5f 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -117,7 +117,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct tc_vlan *parm; struct tcf_vlan *v; int action; - __be16 push_vid = 0; + u16 push_vid = 0; __be16 push_proto = 0; u8 push_prio = 0; bool exists = false; -- cgit v1.2.3-70-g09d2 From 13acc94eff122b260a387d68668bf9d670738e6a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 21 Mar 2018 16:31:03 -0700 Subject: net: permit skb_segment on head_frag frag_list skb One of our in-house projects, bpf-based NAT, hits a kernel BUG_ON at function skb_segment(), line 3667. The bpf program attaches to clsact ingress, calls bpf_skb_change_proto to change protocol from ipv4 to ipv6 or from ipv6 to ipv4, and then calls bpf_redirect to send the changed packet out. 3472 struct sk_buff *skb_segment(struct sk_buff *head_skb, 3473 netdev_features_t features) 3474 { 3475 struct sk_buff *segs = NULL; 3476 struct sk_buff *tail = NULL; ... 3665 while (pos < offset + len) { 3666 if (i >= nfrags) { 3667 BUG_ON(skb_headlen(list_skb)); 3668 3669 i = 0; 3670 nfrags = skb_shinfo(list_skb)->nr_frags; 3671 frag = skb_shinfo(list_skb)->frags; 3672 frag_skb = list_skb; ... call stack: ... #1 [ffff883ffef03558] __crash_kexec at ffffffff8110c525 #2 [ffff883ffef03620] crash_kexec at ffffffff8110d5cc #3 [ffff883ffef03640] oops_end at ffffffff8101d7e7 #4 [ffff883ffef03668] die at ffffffff8101deb2 #5 [ffff883ffef03698] do_trap at ffffffff8101a700 #6 [ffff883ffef036e8] do_error_trap at ffffffff8101abfe #7 [ffff883ffef037a0] do_invalid_op at ffffffff8101acd0 #8 [ffff883ffef037b0] invalid_op at ffffffff81a00bab [exception RIP: skb_segment+3044] RIP: ffffffff817e4dd4 RSP: ffff883ffef03860 RFLAGS: 00010216 RAX: 0000000000002bf6 RBX: ffff883feb7aaa00 RCX: 0000000000000011 RDX: ffff883fb87910c0 RSI: 0000000000000011 RDI: ffff883feb7ab500 RBP: ffff883ffef03928 R8: 0000000000002ce2 R9: 00000000000027da R10: 000001ea00000000 R11: 0000000000002d82 R12: ffff883f90a1ee80 R13: ffff883fb8791120 R14: ffff883feb7abc00 R15: 0000000000002ce2 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #9 [ffff883ffef03930] tcp_gso_segment at ffffffff818713e7 Signed-off-by: David S. Miller --- net/core/skbuff.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 46cb22215ff4..b5c75d4fcf37 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3460,6 +3460,19 @@ void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) } EXPORT_SYMBOL_GPL(skb_pull_rcsum); +static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) +{ + skb_frag_t head_frag; + struct page *page; + + page = virt_to_head_page(frag_skb->head); + head_frag.page.p = page; + head_frag.page_offset = frag_skb->data - + (unsigned char *)page_address(page); + head_frag.size = skb_headlen(frag_skb); + return head_frag; +} + /** * skb_segment - Perform protocol segmentation on skb. * @head_skb: buffer to segment @@ -3664,15 +3677,19 @@ normal: while (pos < offset + len) { if (i >= nfrags) { - BUG_ON(skb_headlen(list_skb)); - i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; frag_skb = list_skb; + if (!skb_headlen(list_skb)) { + BUG_ON(!nfrags); + } else { + BUG_ON(!list_skb->head_frag); - BUG_ON(!nfrags); - + /* to make room for head_frag. */ + i--; + frag--; + } if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) @@ -3689,7 +3706,7 @@ normal: goto err; } - *nskb_frag = *frag; + *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; __skb_frag_ref(nskb_frag); size = skb_frag_size(nskb_frag); -- cgit v1.2.3-70-g09d2 From da18ab32d78b6414267d3e5c8c9b5ab34a6d3321 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Sat, 24 Mar 2018 03:47:42 +0800 Subject: tipc: tipc_disc_addr_trial_msg() can be static Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values") Signed-off-by: Fengguang Wu Acked-by: Jon Maloy jon.maloy@ericsson.com Signed-off-by: David S. Miller --- net/tipc/discover.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/tipc/discover.c b/net/tipc/discover.c index e7655736abed..9f666e0650e2 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -134,13 +134,13 @@ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr, /* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer */ -bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, - struct tipc_media_addr *maddr, - struct tipc_bearer *b, - u32 dst, u32 src, - u32 sugg_addr, - u8 *peer_id, - int mtyp) +static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, + struct tipc_media_addr *maddr, + struct tipc_bearer *b, + u32 dst, u32 src, + u32 sugg_addr, + u8 *peer_id, + int mtyp) { struct net *net = d->net; struct tipc_net *tn = tipc_net(net); -- cgit v1.2.3-70-g09d2 From ede2762d93ff16e0974f7446516b46b1022db213 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 23 Mar 2018 19:47:19 +0300 Subject: net: Make NETDEV_XXX commands enum { } This patch is preparation to drop NETDEV_UNREGISTER_FINAL. Since the cmd is used in usnic_ib_netdev_event_to_string() to get cmd name, after plain removing NETDEV_UNREGISTER_FINAL from everywhere, we'd have holes in event2str[] in this function. Instead of that, let's make NETDEV_XXX commands names available for everyone, and to define netdev_cmd_to_name() in the way we won't have to shaffle names after their numbers are changed. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- drivers/infiniband/hw/usnic/usnic_ib_main.c | 15 +------ include/linux/netdevice.h | 69 +++++++++++++++-------------- net/core/dev.c | 20 +++++++++ 3 files changed, 57 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index f45e99a938e0..5bf3b20eba25 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -97,20 +97,7 @@ void usnic_ib_log_vf(struct usnic_ib_vf *vf) /* Start of netdev section */ static inline const char *usnic_ib_netdev_event_to_string(unsigned long event) { - const char *event2str[] = {"NETDEV_NONE", "NETDEV_UP", "NETDEV_DOWN", - "NETDEV_REBOOT", "NETDEV_CHANGE", - "NETDEV_REGISTER", "NETDEV_UNREGISTER", "NETDEV_CHANGEMTU", - "NETDEV_CHANGEADDR", "NETDEV_GOING_DOWN", "NETDEV_FEAT_CHANGE", - "NETDEV_BONDING_FAILOVER", "NETDEV_PRE_UP", - "NETDEV_PRE_TYPE_CHANGE", "NETDEV_POST_TYPE_CHANGE", - "NETDEV_POST_INT", "NETDEV_UNREGISTER_FINAL", "NETDEV_RELEASE", - "NETDEV_NOTIFY_PEERS", "NETDEV_JOIN" - }; - - if (event >= ARRAY_SIZE(event2str)) - return "UNKNOWN_NETDEV_EVENT"; - else - return event2str[event]; + return netdev_cmd_to_name(event); } static void usnic_ib_qp_grp_modify_active_to_err(struct usnic_ib_dev *us_ibdev) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 913b1cc882cf..dd5a04c971d5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2312,43 +2312,46 @@ struct netdev_lag_lower_state_info { #include -/* netdevice notifier chain. Please remember to update the rtnetlink - * notification exclusion list in rtnetlink_event() when adding new - * types. +/* netdevice notifier chain. Please remember to update netdev_cmd_to_name() + * and the rtnetlink notification exclusion list in rtnetlink_event() when + * adding new types. */ -#define NETDEV_UP 0x0001 /* For now you can't veto a device up/down */ -#define NETDEV_DOWN 0x0002 -#define NETDEV_REBOOT 0x0003 /* Tell a protocol stack a network interface +enum netdev_cmd { + NETDEV_UP = 1, /* For now you can't veto a device up/down */ + NETDEV_DOWN, + NETDEV_REBOOT, /* Tell a protocol stack a network interface detected a hardware crash and restarted - we can use this eg to kick tcp sessions once done */ -#define NETDEV_CHANGE 0x0004 /* Notify device state change */ -#define NETDEV_REGISTER 0x0005 -#define NETDEV_UNREGISTER 0x0006 -#define NETDEV_CHANGEMTU 0x0007 /* notify after mtu change happened */ -#define NETDEV_CHANGEADDR 0x0008 -#define NETDEV_GOING_DOWN 0x0009 -#define NETDEV_CHANGENAME 0x000A -#define NETDEV_FEAT_CHANGE 0x000B -#define NETDEV_BONDING_FAILOVER 0x000C -#define NETDEV_PRE_UP 0x000D -#define NETDEV_PRE_TYPE_CHANGE 0x000E -#define NETDEV_POST_TYPE_CHANGE 0x000F -#define NETDEV_POST_INIT 0x0010 -#define NETDEV_UNREGISTER_FINAL 0x0011 -#define NETDEV_RELEASE 0x0012 -#define NETDEV_NOTIFY_PEERS 0x0013 -#define NETDEV_JOIN 0x0014 -#define NETDEV_CHANGEUPPER 0x0015 -#define NETDEV_RESEND_IGMP 0x0016 -#define NETDEV_PRECHANGEMTU 0x0017 /* notify before mtu change happened */ -#define NETDEV_CHANGEINFODATA 0x0018 -#define NETDEV_BONDING_INFO 0x0019 -#define NETDEV_PRECHANGEUPPER 0x001A -#define NETDEV_CHANGELOWERSTATE 0x001B -#define NETDEV_UDP_TUNNEL_PUSH_INFO 0x001C -#define NETDEV_UDP_TUNNEL_DROP_INFO 0x001D -#define NETDEV_CHANGE_TX_QUEUE_LEN 0x001E + NETDEV_CHANGE, /* Notify device state change */ + NETDEV_REGISTER, + NETDEV_UNREGISTER, + NETDEV_CHANGEMTU, /* notify after mtu change happened */ + NETDEV_CHANGEADDR, + NETDEV_GOING_DOWN, + NETDEV_CHANGENAME, + NETDEV_FEAT_CHANGE, + NETDEV_BONDING_FAILOVER, + NETDEV_PRE_UP, + NETDEV_PRE_TYPE_CHANGE, + NETDEV_POST_TYPE_CHANGE, + NETDEV_POST_INIT, + NETDEV_UNREGISTER_FINAL, + NETDEV_RELEASE, + NETDEV_NOTIFY_PEERS, + NETDEV_JOIN, + NETDEV_CHANGEUPPER, + NETDEV_RESEND_IGMP, + NETDEV_PRECHANGEMTU, /* notify before mtu change happened */ + NETDEV_CHANGEINFODATA, + NETDEV_BONDING_INFO, + NETDEV_PRECHANGEUPPER, + NETDEV_CHANGELOWERSTATE, + NETDEV_UDP_TUNNEL_PUSH_INFO, + NETDEV_UDP_TUNNEL_DROP_INFO, + NETDEV_CHANGE_TX_QUEUE_LEN, +}; +const char *netdev_cmd_to_name(enum netdev_cmd cmd); int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); diff --git a/net/core/dev.c b/net/core/dev.c index f9c28f44286c..055e7ae12759 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1571,6 +1571,26 @@ static void dev_disable_gro_hw(struct net_device *dev) netdev_WARN(dev, "failed to disable GRO_HW!\n"); } +const char *netdev_cmd_to_name(enum netdev_cmd cmd) +{ +#define N(val) \ + case NETDEV_##val: \ + return "NETDEV_" __stringify(val); + switch (cmd) { + N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) + N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) + N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) + N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) + N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) + N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) + N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) + N(UNREGISTER_FINAL) + }; +#undef N + return "UNKNOWN_NETDEV_EVENT"; +} +EXPORT_SYMBOL_GPL(netdev_cmd_to_name); + static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { -- cgit v1.2.3-70-g09d2 From 070f2d7e264acd6316fc24092b7f51a18c75ac9c Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 23 Mar 2018 19:47:39 +0300 Subject: net: Drop NETDEV_UNREGISTER_FINAL Last user is gone after bdf5bd7f2132 "rds: tcp: remove register_netdevice_notifier infrastructure.", so we can remove this netdevice command. This allows to delete rtnl_lock() in netdev_run_todo(), which is hot path for net namespace unregistration. dev_change_net_namespace() and netdev_wait_allrefs() have rcu_barrier() before NETDEV_UNREGISTER_FINAL call, and the source commits say they were introduced to delemit the call with NETDEV_UNREGISTER, but this patch leaves them on the places, since they require additional analysis, whether we need in them for something else. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- drivers/infiniband/hw/qedr/main.c | 4 ++-- include/linux/netdevice.h | 1 - include/rdma/ib_verbs.h | 4 ++-- net/core/dev.c | 7 ------- 4 files changed, 4 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index db4bf97c0e15..eb32abb0099a 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -90,8 +90,8 @@ static struct net_device *qedr_get_netdev(struct ib_device *dev, u8 port_num) dev_hold(qdev->ndev); /* The HW vendor's device driver must guarantee - * that this function returns NULL before the net device reaches - * NETDEV_UNREGISTER_FINAL state. + * that this function returns NULL before the net device has finished + * NETDEV_UNREGISTER state. */ return qdev->ndev; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd5a04c971d5..2a2d9cf50aa2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2336,7 +2336,6 @@ enum netdev_cmd { NETDEV_PRE_TYPE_CHANGE, NETDEV_POST_TYPE_CHANGE, NETDEV_POST_INIT, - NETDEV_UNREGISTER_FINAL, NETDEV_RELEASE, NETDEV_NOTIFY_PEERS, NETDEV_JOIN, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ff3ed435701f..6eb174753acf 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2122,8 +2122,8 @@ struct ib_device { * net device of device @device at port @port_num or NULL if such * a net device doesn't exist. The vendor driver should call dev_hold * on this net device. The HW vendor's device driver must guarantee - * that this function returns NULL before the net device reaches - * NETDEV_UNREGISTER_FINAL state. + * that this function returns NULL before the net device has finished + * NETDEV_UNREGISTER state. */ struct net_device *(*get_netdev)(struct ib_device *device, u8 port_num); diff --git a/net/core/dev.c b/net/core/dev.c index 055e7ae12759..97a96df4b6da 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1584,7 +1584,6 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) - N(UNREGISTER_FINAL) }; #undef N return "UNKNOWN_NETDEV_EVENT"; @@ -8097,7 +8096,6 @@ static void netdev_wait_allrefs(struct net_device *dev) rcu_barrier(); rtnl_lock(); - call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { /* We must not have linkwatch events @@ -8169,10 +8167,6 @@ void netdev_run_todo(void) = list_first_entry(&list, struct net_device, todo_list); list_del(&dev->todo_list); - rtnl_lock(); - call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - __rtnl_unlock(); - if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { pr_err("network todo '%s' but state %d\n", dev->name, dev->reg_state); @@ -8614,7 +8608,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); - call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); new_nsid = peernet2id_alloc(dev_net(dev), net); /* If there is an ifindex conflict assign a new one */ -- cgit v1.2.3-70-g09d2 From d6444062f8f07c346a21bd815af4a3dc8b231574 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 23 Mar 2018 15:54:38 -0700 Subject: net: Use octal not symbolic permissions Prefer the direct use of octal for permissions. Done with checkpatch -f --types=SYMBOLIC_PERMS --fix-inplace and some typing. Miscellanea: o Whitespace neatening around these conversions. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/8021q/vlanproc.c | 6 ++--- net/appletalk/atalk_proc.c | 8 +++--- net/atm/atm_sysfs.c | 12 ++++----- net/atm/clip.c | 2 +- net/atm/lec.c | 2 +- net/atm/proc.c | 2 +- net/ax25/af_ax25.c | 6 ++--- net/bluetooth/rfcomm/tty.c | 4 +-- net/bridge/br_sysfs_br.c | 2 +- net/bridge/br_sysfs_if.c | 36 ++++++++++++------------- net/can/af_can.c | 2 +- net/can/gw.c | 2 +- net/ceph/ceph_common.c | 2 +- net/core/net-procfs.c | 6 ++--- net/core/net-sysfs.c | 12 ++++----- net/core/sock.c | 2 +- net/decnet/af_decnet.c | 2 +- net/decnet/dn_dev.c | 2 +- net/decnet/dn_neigh.c | 2 +- net/decnet/dn_route.c | 2 +- net/dns_resolver/dns_key.c | 2 +- net/ipv4/arp.c | 2 +- net/ipv4/fib_trie.c | 6 ++--- net/ipv4/igmp.c | 4 +-- net/ipv4/ipconfig.c | 2 +- net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +- net/ipv4/ping.c | 2 +- net/ipv4/proc.c | 6 ++--- net/ipv4/raw.c | 2 +- net/ipv4/route.c | 4 +-- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv6/addrconf.c | 2 +- net/ipv6/anycast.c | 2 +- net/ipv6/ip6_flowlabel.c | 2 +- net/ipv6/mcast.c | 4 +-- net/ipv6/proc.c | 6 ++--- net/ipv6/raw.c | 2 +- net/ipv6/route.c | 2 +- net/kcm/kcmproc.c | 4 +-- net/l2tp/l2tp_ppp.c | 2 +- net/llc/llc_proc.c | 4 +-- net/mac80211/rc80211_minstrel.c | 2 +- net/mac80211/rc80211_minstrel_debugfs.c | 8 +++--- net/mac80211/rc80211_minstrel_ht_debugfs.c | 8 +++--- net/netfilter/nf_conntrack_netbios_ns.c | 2 +- net/netfilter/nf_conntrack_snmp.c | 2 +- net/netfilter/nf_conntrack_standalone.c | 2 +- net/netfilter/nf_log.c | 2 +- net/netfilter/nf_synproxy_core.c | 2 +- net/netfilter/xt_IDLETIMER.c | 2 +- net/netfilter/xt_recent.c | 4 +-- net/netrom/af_netrom.c | 6 ++--- net/rose/af_rose.c | 8 +++--- net/rxrpc/af_rxrpc.c | 2 +- net/sctp/proc.c | 16 ++++++------ net/sunrpc/auth_gss/svcauth_gss.c | 2 +- net/sunrpc/cache.c | 10 +++---- net/sunrpc/debugfs.c | 6 ++--- net/sunrpc/rpc_pipe.c | 42 +++++++++++++++--------------- net/wireless/wext-proc.c | 2 +- net/x25/x25_proc.c | 12 ++++----- net/xfrm/xfrm_proc.c | 2 +- 63 files changed, 161 insertions(+), 161 deletions(-) (limited to 'net') diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index a662ccc166df..a627a5db2125 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -148,8 +148,8 @@ int __net_init vlan_proc_init(struct net *net) if (!vn->proc_vlan_dir) goto err; - vn->proc_vlan_conf = proc_create(name_conf, S_IFREG|S_IRUSR|S_IWUSR, - vn->proc_vlan_dir, &vlan_fops); + vn->proc_vlan_conf = proc_create(name_conf, S_IFREG | 0600, + vn->proc_vlan_dir, &vlan_fops); if (!vn->proc_vlan_conf) goto err; return 0; @@ -172,7 +172,7 @@ int vlan_proc_add_dev(struct net_device *vlandev) if (!strcmp(vlandev->name, name_conf)) return -EINVAL; vlan->dent = - proc_create_data(vlandev->name, S_IFREG|S_IRUSR|S_IWUSR, + proc_create_data(vlandev->name, S_IFREG | 0600, vn->proc_vlan_dir, &vlandev_fops, vlandev); if (!vlan->dent) return -ENOBUFS; diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c index a3bf9d519193..7214aea14cb3 100644 --- a/net/appletalk/atalk_proc.c +++ b/net/appletalk/atalk_proc.c @@ -257,22 +257,22 @@ int __init atalk_proc_init(void) if (!atalk_proc_dir) goto out; - p = proc_create("interface", S_IRUGO, atalk_proc_dir, + p = proc_create("interface", 0444, atalk_proc_dir, &atalk_seq_interface_fops); if (!p) goto out_interface; - p = proc_create("route", S_IRUGO, atalk_proc_dir, + p = proc_create("route", 0444, atalk_proc_dir, &atalk_seq_route_fops); if (!p) goto out_route; - p = proc_create("socket", S_IRUGO, atalk_proc_dir, + p = proc_create("socket", 0444, atalk_proc_dir, &atalk_seq_socket_fops); if (!p) goto out_socket; - p = proc_create("arp", S_IRUGO, atalk_proc_dir, &atalk_seq_arp_fops); + p = proc_create("arp", 0444, atalk_proc_dir, &atalk_seq_arp_fops); if (!p) goto out_arp; diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c index 5d2fed9f5710..39b94ca5f65d 100644 --- a/net/atm/atm_sysfs.c +++ b/net/atm/atm_sysfs.c @@ -96,12 +96,12 @@ static ssize_t show_link_rate(struct device *cdev, return scnprintf(buf, PAGE_SIZE, "%d\n", link_rate); } -static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); -static DEVICE_ATTR(atmaddress, S_IRUGO, show_atmaddress, NULL); -static DEVICE_ATTR(atmindex, S_IRUGO, show_atmindex, NULL); -static DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL); -static DEVICE_ATTR(type, S_IRUGO, show_type, NULL); -static DEVICE_ATTR(link_rate, S_IRUGO, show_link_rate, NULL); +static DEVICE_ATTR(address, 0444, show_address, NULL); +static DEVICE_ATTR(atmaddress, 0444, show_atmaddress, NULL); +static DEVICE_ATTR(atmindex, 0444, show_atmindex, NULL); +static DEVICE_ATTR(carrier, 0444, show_carrier, NULL); +static DEVICE_ATTR(type, 0444, show_type, NULL); +static DEVICE_ATTR(link_rate, 0444, show_link_rate, NULL); static struct device_attribute *atm_attrs[] = { &dev_attr_atmaddress, diff --git a/net/atm/clip.c b/net/atm/clip.c index d4f6029d5109..f07dbc632222 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -893,7 +893,7 @@ static int __init atm_clip_init(void) { struct proc_dir_entry *p; - p = proc_create("arp", S_IRUGO, atm_proc_root, &arp_seq_fops); + p = proc_create("arp", 0444, atm_proc_root, &arp_seq_fops); if (!p) { pr_err("Unable to initialize /proc/net/atm/arp\n"); atm_clip_exit_noproc(); diff --git a/net/atm/lec.c b/net/atm/lec.c index 09a1f056712a..01d5d20a6eb1 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -1042,7 +1042,7 @@ static int __init lane_module_init(void) #ifdef CONFIG_PROC_FS struct proc_dir_entry *p; - p = proc_create("lec", S_IRUGO, atm_proc_root, &lec_seq_fops); + p = proc_create("lec", 0444, atm_proc_root, &lec_seq_fops); if (!p) { pr_err("Unable to initialize /proc/net/atm/lec\n"); return -ENOMEM; diff --git a/net/atm/proc.c b/net/atm/proc.c index edc48edc95c1..55410c00c7e2 100644 --- a/net/atm/proc.c +++ b/net/atm/proc.c @@ -474,7 +474,7 @@ int __init atm_proc_init(void) for (e = atm_proc_ents; e->name; e++) { struct proc_dir_entry *dirent; - dirent = proc_create(e->name, S_IRUGO, + dirent = proc_create(e->name, 0444, atm_proc_root, e->proc_fops); if (!dirent) goto err_out_remove; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index c8319ed48485..2b41366fcad2 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1989,10 +1989,10 @@ static int __init ax25_init(void) dev_add_pack(&ax25_packet_type); register_netdevice_notifier(&ax25_dev_notifier); - proc_create("ax25_route", S_IRUGO, init_net.proc_net, + proc_create("ax25_route", 0444, init_net.proc_net, &ax25_route_fops); - proc_create("ax25", S_IRUGO, init_net.proc_net, &ax25_info_fops); - proc_create("ax25_calls", S_IRUGO, init_net.proc_net, &ax25_uid_fops); + proc_create("ax25", 0444, init_net.proc_net, &ax25_info_fops); + proc_create("ax25_calls", 0444, init_net.proc_net, &ax25_uid_fops); out: return rc; } diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 5f3074cb6b4d..5e44d842cc5d 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -210,8 +210,8 @@ static ssize_t show_channel(struct device *tty_dev, struct device_attribute *att return sprintf(buf, "%d\n", dev->channel); } -static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); -static DEVICE_ATTR(channel, S_IRUGO, show_channel, NULL); +static DEVICE_ATTR(address, 0444, show_address, NULL); +static DEVICE_ATTR(channel, 0444, show_channel, NULL); static struct rfcomm_dev *__rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc) diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index b1be0dcfba6b..0318a69888d4 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -893,7 +893,7 @@ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, static struct bin_attribute bridge_forward = { .attr = { .name = SYSFS_BRIDGE_FDB, - .mode = S_IRUGO, }, + .mode = 0444, }, .read = brforward_read, }; diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 126a8ea73c96..fd31ad83ec7b 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -44,7 +44,7 @@ static int store_##_name(struct net_bridge_port *p, unsigned long v) \ { \ return store_flag(p, v, _mask); \ } \ -static BRPORT_ATTR(_name, S_IRUGO | S_IWUSR, \ +static BRPORT_ATTR(_name, 0644, \ show_##_name, store_##_name) static int store_flag(struct net_bridge_port *p, unsigned long v, @@ -71,7 +71,7 @@ static ssize_t show_path_cost(struct net_bridge_port *p, char *buf) return sprintf(buf, "%d\n", p->path_cost); } -static BRPORT_ATTR(path_cost, S_IRUGO | S_IWUSR, +static BRPORT_ATTR(path_cost, 0644, show_path_cost, br_stp_set_path_cost); static ssize_t show_priority(struct net_bridge_port *p, char *buf) @@ -79,91 +79,91 @@ static ssize_t show_priority(struct net_bridge_port *p, char *buf) return sprintf(buf, "%d\n", p->priority); } -static BRPORT_ATTR(priority, S_IRUGO | S_IWUSR, +static BRPORT_ATTR(priority, 0644, show_priority, br_stp_set_port_priority); static ssize_t show_designated_root(struct net_bridge_port *p, char *buf) { return br_show_bridge_id(buf, &p->designated_root); } -static BRPORT_ATTR(designated_root, S_IRUGO, show_designated_root, NULL); +static BRPORT_ATTR(designated_root, 0444, show_designated_root, NULL); static ssize_t show_designated_bridge(struct net_bridge_port *p, char *buf) { return br_show_bridge_id(buf, &p->designated_bridge); } -static BRPORT_ATTR(designated_bridge, S_IRUGO, show_designated_bridge, NULL); +static BRPORT_ATTR(designated_bridge, 0444, show_designated_bridge, NULL); static ssize_t show_designated_port(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->designated_port); } -static BRPORT_ATTR(designated_port, S_IRUGO, show_designated_port, NULL); +static BRPORT_ATTR(designated_port, 0444, show_designated_port, NULL); static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->designated_cost); } -static BRPORT_ATTR(designated_cost, S_IRUGO, show_designated_cost, NULL); +static BRPORT_ATTR(designated_cost, 0444, show_designated_cost, NULL); static ssize_t show_port_id(struct net_bridge_port *p, char *buf) { return sprintf(buf, "0x%x\n", p->port_id); } -static BRPORT_ATTR(port_id, S_IRUGO, show_port_id, NULL); +static BRPORT_ATTR(port_id, 0444, show_port_id, NULL); static ssize_t show_port_no(struct net_bridge_port *p, char *buf) { return sprintf(buf, "0x%x\n", p->port_no); } -static BRPORT_ATTR(port_no, S_IRUGO, show_port_no, NULL); +static BRPORT_ATTR(port_no, 0444, show_port_no, NULL); static ssize_t show_change_ack(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->topology_change_ack); } -static BRPORT_ATTR(change_ack, S_IRUGO, show_change_ack, NULL); +static BRPORT_ATTR(change_ack, 0444, show_change_ack, NULL); static ssize_t show_config_pending(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->config_pending); } -static BRPORT_ATTR(config_pending, S_IRUGO, show_config_pending, NULL); +static BRPORT_ATTR(config_pending, 0444, show_config_pending, NULL); static ssize_t show_port_state(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%d\n", p->state); } -static BRPORT_ATTR(state, S_IRUGO, show_port_state, NULL); +static BRPORT_ATTR(state, 0444, show_port_state, NULL); static ssize_t show_message_age_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer)); } -static BRPORT_ATTR(message_age_timer, S_IRUGO, show_message_age_timer, NULL); +static BRPORT_ATTR(message_age_timer, 0444, show_message_age_timer, NULL); static ssize_t show_forward_delay_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer)); } -static BRPORT_ATTR(forward_delay_timer, S_IRUGO, show_forward_delay_timer, NULL); +static BRPORT_ATTR(forward_delay_timer, 0444, show_forward_delay_timer, NULL); static ssize_t show_hold_timer(struct net_bridge_port *p, char *buf) { return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer)); } -static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL); +static BRPORT_ATTR(hold_timer, 0444, show_hold_timer, NULL); static int store_flush(struct net_bridge_port *p, unsigned long v) { br_fdb_delete_by_port(p->br, p, 0, 0); // Don't delete local entry return 0; } -static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush); +static BRPORT_ATTR(flush, 0200, NULL, store_flush); static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf) { @@ -179,7 +179,7 @@ static int store_group_fwd_mask(struct net_bridge_port *p, return 0; } -static BRPORT_ATTR(group_fwd_mask, S_IRUGO | S_IWUSR, show_group_fwd_mask, +static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask, store_group_fwd_mask); BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE); @@ -204,7 +204,7 @@ static int store_multicast_router(struct net_bridge_port *p, { return br_multicast_set_port_router(p, v); } -static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router, +static BRPORT_ATTR(multicast_router, 0644, show_multicast_router, store_multicast_router); BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE); diff --git a/net/can/af_can.c b/net/can/af_can.c index e899970398a1..2f0d0a72e4b5 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -72,7 +72,7 @@ MODULE_AUTHOR("Urs Thuermann , " MODULE_ALIAS_NETPROTO(PF_CAN); static int stats_timer __read_mostly = 1; -module_param(stats_timer, int, S_IRUGO); +module_param(stats_timer, int, 0444); MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)"); static struct kmem_cache *rcv_cache __read_mostly; diff --git a/net/can/gw.c b/net/can/gw.c index 08e97668d5cf..8d71e199d5b3 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -72,7 +72,7 @@ MODULE_ALIAS(CAN_GW_NAME); #define CGW_DEFAULT_HOPS 1 static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS; -module_param(max_hops, uint, S_IRUGO); +module_param(max_hops, uint, 0444); MODULE_PARM_DESC(max_hops, "maximum " CAN_GW_NAME " routing hops for CAN frames " "(valid values: " __stringify(CGW_MIN_HOPS) "-" diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4d4c82229e9e..4adf07826f4a 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -54,7 +54,7 @@ static const struct kernel_param_ops param_ops_supported_features = { .get = param_get_supported_features, }; module_param_cb(supported_features, ¶m_ops_supported_features, NULL, - S_IRUGO); + 0444); const char *ceph_msg_type_name(int type) { diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 65b51e778782..dd6ae431d038 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -315,12 +315,12 @@ static int __net_init dev_proc_net_init(struct net *net) { int rc = -ENOMEM; - if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops)) + if (!proc_create("dev", 0444, net->proc_net, &dev_seq_fops)) goto out; - if (!proc_create("softnet_stat", S_IRUGO, net->proc_net, + if (!proc_create("softnet_stat", 0444, net->proc_net, &softnet_seq_fops)) goto out_dev; - if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops)) + if (!proc_create("ptype", 0444, net->proc_net, &ptype_seq_fops)) goto out_softnet; if (wext_proc_init(net)) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 60a5ad2c33ee..c476f0794132 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -431,7 +431,7 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr, return netdev_store(dev, attr, buf, len, change_group); } NETDEVICE_SHOW(group, fmt_dec); -static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store); +static DEVICE_ATTR(netdev_group, 0644, group_show, group_store); static int change_proto_down(struct net_device *dev, unsigned long proto_down) { @@ -854,10 +854,10 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, } static struct rx_queue_attribute rps_cpus_attribute __ro_after_init - = __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); + = __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map); static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init - = __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR, + = __ATTR(rps_flow_cnt, 0644, show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); #endif /* CONFIG_RPS */ @@ -1154,7 +1154,7 @@ static ssize_t bql_set_hold_time(struct netdev_queue *queue, } static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init - = __ATTR(hold_time, S_IRUGO | S_IWUSR, + = __ATTR(hold_time, 0644, bql_show_hold_time, bql_set_hold_time); static ssize_t bql_show_inflight(struct netdev_queue *queue, @@ -1166,7 +1166,7 @@ static ssize_t bql_show_inflight(struct netdev_queue *queue, } static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = - __ATTR(inflight, S_IRUGO, bql_show_inflight, NULL); + __ATTR(inflight, 0444, bql_show_inflight, NULL); #define BQL_ATTR(NAME, FIELD) \ static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ @@ -1182,7 +1182,7 @@ static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ } \ \ static struct netdev_queue_attribute bql_ ## NAME ## _attribute __ro_after_init \ - = __ATTR(NAME, S_IRUGO | S_IWUSR, \ + = __ATTR(NAME, 0644, \ bql_show_ ## NAME, bql_set_ ## NAME) BQL_ATTR(limit, limit); diff --git a/net/core/sock.c b/net/core/sock.c index e689496dfd8a..8cee2920a47f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3455,7 +3455,7 @@ static const struct file_operations proto_seq_fops = { static __net_init int proto_init_net(struct net *net) { - if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops)) + if (!proc_create("protocols", 0444, net->proc_net, &proto_seq_fops)) return -ENOMEM; return 0; diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 2ee8306c23e3..32751602767f 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -2383,7 +2383,7 @@ static int __init decnet_init(void) dev_add_pack(&dn_dix_packet_type); register_netdevice_notifier(&dn_dev_notifier); - proc_create("decnet", S_IRUGO, init_net.proc_net, &dn_socket_seq_fops); + proc_create("decnet", 0444, init_net.proc_net, &dn_socket_seq_fops); dn_register_sysctl(); out: return rc; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index c9f5e1ebb9c8..c03b046478c3 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -1424,7 +1424,7 @@ void __init dn_dev_init(void) rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETADDR, NULL, dn_nl_dump_ifaddr, 0); - proc_create("decnet_dev", S_IRUGO, init_net.proc_net, &dn_dev_seq_fops); + proc_create("decnet_dev", 0444, init_net.proc_net, &dn_dev_seq_fops); #ifdef CONFIG_SYSCTL { diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 6e37d9e6345e..13156165afa3 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -608,7 +608,7 @@ static const struct file_operations dn_neigh_seq_fops = { void __init dn_neigh_init(void) { neigh_table_init(NEIGH_DN_TABLE, &dn_neigh_table); - proc_create("decnet_neigh", S_IRUGO, init_net.proc_net, + proc_create("decnet_neigh", 0444, init_net.proc_net, &dn_neigh_seq_fops); } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index ef20b8e31669..eca0cc6b761f 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1918,7 +1918,7 @@ void __init dn_route_init(void) dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1); - proc_create("decnet_cache", S_IRUGO, init_net.proc_net, + proc_create("decnet_cache", 0444, init_net.proc_net, &dn_rt_cache_seq_fops); #ifdef CONFIG_DECNET_ROUTER diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index e1d4d898a007..8396705deffc 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -38,7 +38,7 @@ MODULE_AUTHOR("Wang Lei"); MODULE_LICENSE("GPL"); unsigned int dns_resolver_debug; -module_param_named(debug, dns_resolver_debug, uint, S_IWUSR | S_IRUGO); +module_param_named(debug, dns_resolver_debug, uint, 0644); MODULE_PARM_DESC(debug, "DNS Resolver debugging mask"); const struct cred *dns_resolver_cache; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 7dc9de8444a9..4c6ba0fb2630 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1434,7 +1434,7 @@ static const struct file_operations arp_seq_fops = { static int __net_init arp_net_init(struct net *net) { - if (!proc_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops)) + if (!proc_create("arp", 0444, net->proc_net, &arp_seq_fops)) return -ENOMEM; return 0; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 62243a8abf92..fac0b73e24d1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2722,14 +2722,14 @@ static const struct file_operations fib_route_fops = { int __net_init fib_proc_init(struct net *net) { - if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops)) + if (!proc_create("fib_trie", 0444, net->proc_net, &fib_trie_fops)) goto out1; - if (!proc_create("fib_triestat", S_IRUGO, net->proc_net, + if (!proc_create("fib_triestat", 0444, net->proc_net, &fib_triestat_fops)) goto out2; - if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops)) + if (!proc_create("route", 0444, net->proc_net, &fib_route_fops)) goto out3; return 0; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index c2743763777e..f17cd83ba164 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2993,10 +2993,10 @@ static int __net_init igmp_net_init(struct net *net) struct proc_dir_entry *pde; int err; - pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops); + pde = proc_create("igmp", 0444, net->proc_net, &igmp_mc_seq_fops); if (!pde) goto out_igmp; - pde = proc_create("mcfilter", S_IRUGO, net->proc_net, + pde = proc_create("mcfilter", 0444, net->proc_net, &igmp_mcf_seq_fops); if (!pde) goto out_mcfilter; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index f75802ad960f..43f620feb1c4 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1369,7 +1369,7 @@ static int __init ip_auto_config(void) unsigned int i; #ifdef CONFIG_PROC_FS - proc_create("pnp", S_IRUGO, init_net.proc_net, &pnp_seq_fops); + proc_create("pnp", 0444, init_net.proc_net, &pnp_seq_fops); #endif /* CONFIG_PROC_FS */ if (!ic_enable) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 0fc88fa7a4dc..31b4cca588d0 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -250,7 +250,7 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, /* create proc dir entry */ sprintf(buffer, "%pI4", &ip); - c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR, + c->pde = proc_create_data(buffer, 0600, cn->procdir, &clusterip_proc_fops, c); if (!c->pde) { diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 0164def9c808..1f24bc8273a0 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1177,7 +1177,7 @@ static struct ping_seq_afinfo ping_v4_seq_afinfo = { int ping_proc_register(struct net *net, struct ping_seq_afinfo *afinfo) { struct proc_dir_entry *p; - p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, + p = proc_create_data(afinfo->name, 0444, net->proc_net, afinfo->seq_fops, afinfo); if (!p) return -ENOMEM; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index d97e83b2dd33..80de2e659dcb 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -521,12 +521,12 @@ static const struct file_operations netstat_seq_fops = { static __net_init int ip_proc_init_net(struct net *net) { - if (!proc_create("sockstat", S_IRUGO, net->proc_net, + if (!proc_create("sockstat", 0444, net->proc_net, &sockstat_seq_fops)) goto out_sockstat; - if (!proc_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops)) + if (!proc_create("netstat", 0444, net->proc_net, &netstat_seq_fops)) goto out_netstat; - if (!proc_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops)) + if (!proc_create("snmp", 0444, net->proc_net, &snmp_seq_fops)) goto out_snmp; return 0; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 720bef7da2f6..0ee2501a9027 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -1140,7 +1140,7 @@ static const struct file_operations raw_seq_fops = { static __net_init int raw_init_net(struct net *net) { - if (!proc_create("raw", S_IRUGO, net->proc_net, &raw_seq_fops)) + if (!proc_create("raw", 0444, net->proc_net, &raw_seq_fops)) return -ENOMEM; return 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4ac5728689f5..ce9bd5380d21 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -379,12 +379,12 @@ static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; - pde = proc_create("rt_cache", S_IRUGO, net->proc_net, + pde = proc_create("rt_cache", 0444, net->proc_net, &rt_cache_seq_fops); if (!pde) goto err1; - pde = proc_create("rt_cache", S_IRUGO, + pde = proc_create("rt_cache", 0444, net->proc_net_stat, &rt_cpu_seq_fops); if (!pde) goto err2; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2c6aec2643e8..fec8b1fd7b63 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2215,7 +2215,7 @@ int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo) afinfo->seq_ops.next = tcp_seq_next; afinfo->seq_ops.stop = tcp_seq_stop; - p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, + p = proc_create_data(afinfo->name, 0444, net->proc_net, afinfo->seq_fops, afinfo); if (!p) rc = -ENOMEM; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c6dc019bc64b..fb8f3a36bd14 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2673,7 +2673,7 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo) afinfo->seq_ops.next = udp_seq_next; afinfo->seq_ops.stop = udp_seq_stop; - p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net, + p = proc_create_data(afinfo->name, 0444, net->proc_net, afinfo->seq_fops, afinfo); if (!p) rc = -ENOMEM; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 6fd4bbdc444f..9ca90cd9fba4 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4281,7 +4281,7 @@ static const struct file_operations if6_fops = { static int __net_init if6_proc_net_init(struct net *net) { - if (!proc_create("if_inet6", S_IRUGO, net->proc_net, &if6_fops)) + if (!proc_create("if_inet6", 0444, net->proc_net, &if6_fops)) return -ENOMEM; return 0; } diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index d580d4d456a5..bbcabbba9bd8 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -544,7 +544,7 @@ static const struct file_operations ac6_seq_fops = { int __net_init ac6_proc_init(struct net *net) { - if (!proc_create("anycast6", S_IRUGO, net->proc_net, &ac6_seq_fops)) + if (!proc_create("anycast6", 0444, net->proc_net, &ac6_seq_fops)) return -ENOMEM; return 0; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 6ddf52282894..f75b06ba8325 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -844,7 +844,7 @@ static const struct file_operations ip6fl_seq_fops = { static int __net_init ip6_flowlabel_proc_init(struct net *net) { - if (!proc_create("ip6_flowlabel", S_IRUGO, net->proc_net, + if (!proc_create("ip6_flowlabel", 0444, net->proc_net, &ip6fl_seq_fops)) return -ENOMEM; return 0; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index d1a0cefac273..1e4c2b6ebd78 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2921,9 +2921,9 @@ static int __net_init igmp6_proc_init(struct net *net) int err; err = -ENOMEM; - if (!proc_create("igmp6", S_IRUGO, net->proc_net, &igmp6_mc_seq_fops)) + if (!proc_create("igmp6", 0444, net->proc_net, &igmp6_mc_seq_fops)) goto out; - if (!proc_create("mcfilter6", S_IRUGO, net->proc_net, + if (!proc_create("mcfilter6", 0444, net->proc_net, &igmp6_mcf_seq_fops)) goto out_proc_net_igmp6; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 1678cf037688..f9891fa672f3 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -290,7 +290,7 @@ int snmp6_register_dev(struct inet6_dev *idev) if (!net->mib.proc_net_devsnmp6) return -ENOENT; - p = proc_create_data(idev->dev->name, S_IRUGO, + p = proc_create_data(idev->dev->name, 0444, net->mib.proc_net_devsnmp6, &snmp6_dev_seq_fops, idev); if (!p) @@ -314,11 +314,11 @@ int snmp6_unregister_dev(struct inet6_dev *idev) static int __net_init ipv6_proc_init_net(struct net *net) { - if (!proc_create("sockstat6", S_IRUGO, net->proc_net, + if (!proc_create("sockstat6", 0444, net->proc_net, &sockstat6_seq_fops)) return -ENOMEM; - if (!proc_create("snmp6", S_IRUGO, net->proc_net, &snmp6_seq_fops)) + if (!proc_create("snmp6", 0444, net->proc_net, &snmp6_seq_fops)) goto proc_snmp6_fail; net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 10a4ac4933b7..b5e5de732494 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1318,7 +1318,7 @@ static const struct file_operations raw6_seq_fops = { static int __net_init raw6_init_net(struct net *net) { - if (!proc_create("raw6", S_IRUGO, net->proc_net, &raw6_seq_fops)) + if (!proc_create("raw6", 0444, net->proc_net, &raw6_seq_fops)) return -ENOMEM; return 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a2ed9fdd58d4..1d0eaa69874d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5067,7 +5067,7 @@ static int __net_init ip6_route_net_init_late(struct net *net) { #ifdef CONFIG_PROC_FS proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops); - proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops); + proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops); #endif return 0; } diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c index 2c1c8b3e4452..4c2e9907f254 100644 --- a/net/kcm/kcmproc.c +++ b/net/kcm/kcmproc.c @@ -269,7 +269,7 @@ static int kcm_proc_register(struct net *net, struct kcm_seq_muxinfo *muxinfo) struct proc_dir_entry *p; int rc = 0; - p = proc_create_data(muxinfo->name, S_IRUGO, net->proc_net, + p = proc_create_data(muxinfo->name, 0444, net->proc_net, muxinfo->seq_fops, muxinfo); if (!p) rc = -ENOMEM; @@ -406,7 +406,7 @@ static int kcm_proc_init_net(struct net *net) { int err; - if (!proc_create("kcm_stats", S_IRUGO, net->proc_net, + if (!proc_create("kcm_stats", 0444, net->proc_net, &kcm_stats_seq_fops)) { err = -ENOMEM; goto out_kcm_stats; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 977bca659787..f24504efe729 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1742,7 +1742,7 @@ static __net_init int pppol2tp_init_net(struct net *net) struct proc_dir_entry *pde; int err = 0; - pde = proc_create("pppol2tp", S_IRUGO, net->proc_net, + pde = proc_create("pppol2tp", 0444, net->proc_net, &pppol2tp_proc_fops); if (!pde) { err = -ENOMEM; diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c index 66821e8a2b7a..62ea0aed94b4 100644 --- a/net/llc/llc_proc.c +++ b/net/llc/llc_proc.c @@ -249,11 +249,11 @@ int __init llc_proc_init(void) if (!llc_proc_dir) goto out; - p = proc_create("socket", S_IRUGO, llc_proc_dir, &llc_seq_socket_fops); + p = proc_create("socket", 0444, llc_proc_dir, &llc_seq_socket_fops); if (!p) goto out_socket; - p = proc_create("core", S_IRUGO, llc_proc_dir, &llc_seq_core_fops); + p = proc_create("core", 0444, llc_proc_dir, &llc_seq_core_fops); if (!p) goto out_core; diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 9766c1cc4b0a..8221bc5582ab 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -690,7 +690,7 @@ minstrel_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir) #ifdef CONFIG_MAC80211_DEBUGFS mp->fixed_rate_idx = (u32) -1; mp->dbg_fixed_rate = debugfs_create_u32("fixed_rate_idx", - S_IRUGO | S_IWUGO, debugfsdir, &mp->fixed_rate_idx); + 0666, debugfsdir, &mp->fixed_rate_idx); #endif minstrel_init_cck_rates(mp); diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c index 36fc971deb86..9ad7d63d3e5b 100644 --- a/net/mac80211/rc80211_minstrel_debugfs.c +++ b/net/mac80211/rc80211_minstrel_debugfs.c @@ -214,11 +214,11 @@ minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir) { struct minstrel_sta_info *mi = priv_sta; - mi->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, mi, - &minstrel_stat_fops); + mi->dbg_stats = debugfs_create_file("rc_stats", 0444, dir, mi, + &minstrel_stat_fops); - mi->dbg_stats_csv = debugfs_create_file("rc_stats_csv", S_IRUGO, dir, - mi, &minstrel_stat_csv_fops); + mi->dbg_stats_csv = debugfs_create_file("rc_stats_csv", 0444, dir, mi, + &minstrel_stat_csv_fops); } void diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c index 7d969e300fb3..bfcc03152dc6 100644 --- a/net/mac80211/rc80211_minstrel_ht_debugfs.c +++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c @@ -303,10 +303,10 @@ minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir) { struct minstrel_ht_sta_priv *msp = priv_sta; - msp->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, msp, - &minstrel_ht_stat_fops); - msp->dbg_stats_csv = debugfs_create_file("rc_stats_csv", S_IRUGO, - dir, msp, &minstrel_ht_stat_csv_fops); + msp->dbg_stats = debugfs_create_file("rc_stats", 0444, dir, msp, + &minstrel_ht_stat_fops); + msp->dbg_stats_csv = debugfs_create_file("rc_stats_csv", 0444, dir, msp, + &minstrel_ht_stat_csv_fops); } void diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index 496ce173f0c1..cc11bf890eb9 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -33,7 +33,7 @@ MODULE_ALIAS("ip_conntrack_netbios_ns"); MODULE_ALIAS_NFCT_HELPER("netbios_ns"); static unsigned int timeout __read_mostly = 3; -module_param(timeout, uint, S_IRUSR); +module_param(timeout, uint, 0400); MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); static struct nf_conntrack_expect_policy exp_policy = { diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c index 87b95a2c270c..1b18f43ad226 100644 --- a/net/netfilter/nf_conntrack_snmp.c +++ b/net/netfilter/nf_conntrack_snmp.c @@ -26,7 +26,7 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS_NFCT_HELPER("snmp"); static unsigned int timeout __read_mostly = 30; -module_param(timeout, uint, S_IRUSR); +module_param(timeout, uint, 0400); MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); int (*nf_nat_snmp_hook)(struct sk_buff *skb, diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 3cdce391362e..98844c87d01e 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -495,7 +495,7 @@ static int nf_conntrack_standalone_init_proc(struct net *net) if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(pde, root_uid, root_gid); - pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat, + pde = proc_create("nf_conntrack", 0444, net->proc_net_stat, &ct_cpu_seq_fops); if (!pde) goto out_stat_nf_conntrack; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 1ba3da51050d..a964e4d356cc 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -549,7 +549,7 @@ static int __net_init nf_log_net_init(struct net *net) int ret = -ENOMEM; #ifdef CONFIG_PROC_FS - if (!proc_create("nf_log", S_IRUGO, + if (!proc_create("nf_log", 0444, net->nf.proc_netfilter, &nflog_file_ops)) return ret; #endif diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 64b875e452ca..8f16fd27132d 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -325,7 +325,7 @@ static const struct file_operations synproxy_cpu_seq_fops = { static int __net_init synproxy_proc_init(struct net *net) { - if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat, + if (!proc_create("synproxy", 0444, net->proc_net_stat, &synproxy_cpu_seq_fops)) return -ENOMEM; return 0; diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 1ac6600bfafd..5ee859193783 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -132,7 +132,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) ret = -ENOMEM; goto out_free_timer; } - info->timer->attr.attr.mode = S_IRUGO; + info->timer->attr.attr.mode = 0444; info->timer->attr.show = idletimer_tg_show; ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 486dd24da78b..434e35ce940b 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -51,8 +51,8 @@ static unsigned int ip_list_gid __read_mostly; module_param(ip_list_tot, uint, 0400); module_param(ip_list_hash_size, uint, 0400); module_param(ip_list_perms, uint, 0400); -module_param(ip_list_uid, uint, S_IRUGO | S_IWUSR); -module_param(ip_list_gid, uint, S_IRUGO | S_IWUSR); +module_param(ip_list_uid, uint, 0644); +module_param(ip_list_gid, uint, 0644); MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/xt_recent/* files"); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 35bb6807927f..4221d98a314b 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1450,9 +1450,9 @@ static int __init nr_proto_init(void) nr_loopback_init(); - proc_create("nr", S_IRUGO, init_net.proc_net, &nr_info_fops); - proc_create("nr_neigh", S_IRUGO, init_net.proc_net, &nr_neigh_fops); - proc_create("nr_nodes", S_IRUGO, init_net.proc_net, &nr_nodes_fops); + proc_create("nr", 0444, init_net.proc_net, &nr_info_fops); + proc_create("nr_neigh", 0444, init_net.proc_net, &nr_neigh_fops); + proc_create("nr_nodes", 0444, init_net.proc_net, &nr_nodes_fops); out: return rc; fail: diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 5170373b797c..9ff5e0a76593 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1567,12 +1567,12 @@ static int __init rose_proto_init(void) rose_add_loopback_neigh(); - proc_create("rose", S_IRUGO, init_net.proc_net, &rose_info_fops); - proc_create("rose_neigh", S_IRUGO, init_net.proc_net, + proc_create("rose", 0444, init_net.proc_net, &rose_info_fops); + proc_create("rose_neigh", 0444, init_net.proc_net, &rose_neigh_fops); - proc_create("rose_nodes", S_IRUGO, init_net.proc_net, + proc_create("rose_nodes", 0444, init_net.proc_net, &rose_nodes_fops); - proc_create("rose_routes", S_IRUGO, init_net.proc_net, + proc_create("rose_routes", 0444, init_net.proc_net, &rose_routes_fops); out: return rc; diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 0c9c18aa7c77..9e1c2c6b6a67 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -32,7 +32,7 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_RXRPC); unsigned int rxrpc_debug; // = RXRPC_DEBUG_KPROTO; -module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO); +module_param_named(debug, rxrpc_debug, uint, 0644); MODULE_PARM_DESC(debug, "RxRPC debugging mask"); static struct proto rxrpc_proto; diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 17d0155d9de3..1d9ccc6dab2b 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -450,17 +450,17 @@ int __net_init sctp_proc_init(struct net *net) net->sctp.proc_net_sctp = proc_net_mkdir(net, "sctp", net->proc_net); if (!net->sctp.proc_net_sctp) return -ENOMEM; - if (!proc_create("snmp", S_IRUGO, net->sctp.proc_net_sctp, - &sctp_snmp_seq_fops)) + if (!proc_create("snmp", 0444, net->sctp.proc_net_sctp, + &sctp_snmp_seq_fops)) goto cleanup; - if (!proc_create("eps", S_IRUGO, net->sctp.proc_net_sctp, - &sctp_eps_seq_fops)) + if (!proc_create("eps", 0444, net->sctp.proc_net_sctp, + &sctp_eps_seq_fops)) goto cleanup; - if (!proc_create("assocs", S_IRUGO, net->sctp.proc_net_sctp, - &sctp_assocs_seq_fops)) + if (!proc_create("assocs", 0444, net->sctp.proc_net_sctp, + &sctp_assocs_seq_fops)) goto cleanup; - if (!proc_create("remaddr", S_IRUGO, net->sctp.proc_net_sctp, - &sctp_remaddr_seq_fops)) + if (!proc_create("remaddr", 0444, net->sctp.proc_net_sctp, + &sctp_remaddr_seq_fops)) goto cleanup; return 0; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 26531193fce4..5089dbb96d58 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1375,7 +1375,7 @@ static int create_use_gss_proxy_proc_entry(struct net *net) struct proc_dir_entry **p = &sn->use_gssp_proc; sn->use_gss_proxy = -1; - *p = proc_create_data("use-gss-proxy", S_IFREG|S_IRUSR|S_IWUSR, + *p = proc_create_data("use-gss-proxy", S_IFREG | 0600, sn->proc_net_rpc, &use_gss_proxy_ops, net); if (!*p) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 8a7e1c774f9c..c536cc24b3d1 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1621,20 +1621,20 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) if (cd->procfs == NULL) goto out_nomem; - p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR, + p = proc_create_data("flush", S_IFREG | 0600, cd->procfs, &cache_flush_operations_procfs, cd); if (p == NULL) goto out_nomem; if (cd->cache_request || cd->cache_parse) { - p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR, - cd->procfs, &cache_file_operations_procfs, cd); + p = proc_create_data("channel", S_IFREG | 0600, cd->procfs, + &cache_file_operations_procfs, cd); if (p == NULL) goto out_nomem; } if (cd->cache_show) { - p = proc_create_data("content", S_IFREG|S_IRUSR, - cd->procfs, &content_file_operations_procfs, cd); + p = proc_create_data("content", S_IFREG | 0400, cd->procfs, + &content_file_operations_procfs, cd); if (p == NULL) goto out_nomem; } diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index e980d2a493de..45a033329cd4 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -139,7 +139,7 @@ rpc_clnt_debugfs_register(struct rpc_clnt *clnt) return; /* make tasks file */ - if (!debugfs_create_file("tasks", S_IFREG | S_IRUSR, clnt->cl_debugfs, + if (!debugfs_create_file("tasks", S_IFREG | 0400, clnt->cl_debugfs, clnt, &tasks_fops)) goto out_err; @@ -241,7 +241,7 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt) return; /* make tasks file */ - if (!debugfs_create_file("info", S_IFREG | S_IRUSR, xprt->debugfs, + if (!debugfs_create_file("info", S_IFREG | 0400, xprt->debugfs, xprt, &xprt_info_fops)) { debugfs_remove_recursive(xprt->debugfs); xprt->debugfs = NULL; @@ -317,7 +317,7 @@ inject_fault_dir(struct dentry *topdir) if (!faultdir) return NULL; - if (!debugfs_create_file("disconnect", S_IFREG | S_IRUSR, faultdir, + if (!debugfs_create_file("disconnect", S_IFREG | 0400, faultdir, NULL, &fault_disconnect_fops)) return NULL; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index fc97fc3ed637..0f08934b2cea 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -820,13 +820,13 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, { struct dentry *dentry; struct inode *dir = d_inode(parent); - umode_t umode = S_IFIFO | S_IRUSR | S_IWUSR; + umode_t umode = S_IFIFO | 0600; int err; if (pipe->ops->upcall == NULL) - umode &= ~S_IRUGO; + umode &= ~0444; if (pipe->ops->downcall == NULL) - umode &= ~S_IWUGO; + umode &= ~0222; inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); @@ -1035,7 +1035,7 @@ static const struct rpc_filelist authfiles[] = { [RPCAUTH_info] = { .name = "info", .i_fop = &rpc_info_operations, - .mode = S_IFREG | S_IRUSR, + .mode = S_IFREG | 0400, }, }; @@ -1068,8 +1068,8 @@ struct dentry *rpc_create_client_dir(struct dentry *dentry, { struct dentry *ret; - ret = rpc_mkdir_populate(dentry, name, S_IRUGO | S_IXUGO, NULL, - rpc_clntdir_populate, rpc_client); + ret = rpc_mkdir_populate(dentry, name, 0555, NULL, + rpc_clntdir_populate, rpc_client); if (!IS_ERR(ret)) { rpc_client->cl_pipedir_objects.pdh_dentry = ret; rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects); @@ -1096,17 +1096,17 @@ static const struct rpc_filelist cache_pipefs_files[3] = { [0] = { .name = "channel", .i_fop = &cache_file_operations_pipefs, - .mode = S_IFREG|S_IRUSR|S_IWUSR, + .mode = S_IFREG | 0600, }, [1] = { .name = "content", .i_fop = &content_file_operations_pipefs, - .mode = S_IFREG|S_IRUSR, + .mode = S_IFREG | 0400, }, [2] = { .name = "flush", .i_fop = &cache_flush_operations_pipefs, - .mode = S_IFREG|S_IRUSR|S_IWUSR, + .mode = S_IFREG | 0600, }, }; @@ -1164,39 +1164,39 @@ enum { static const struct rpc_filelist files[] = { [RPCAUTH_lockd] = { .name = "lockd", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_mount] = { .name = "mount", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_nfs] = { .name = "nfs", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_portmap] = { .name = "portmap", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_statd] = { .name = "statd", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_nfsd4_cb] = { .name = "nfsd4_cb", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_cache] = { .name = "cache", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_nfsd] = { .name = "nfsd", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, [RPCAUTH_gssd] = { .name = "gssd", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, }; @@ -1261,7 +1261,7 @@ EXPORT_SYMBOL_GPL(rpc_put_sb_net); static const struct rpc_filelist gssd_dummy_clnt_dir[] = { [0] = { .name = "clntXX", - .mode = S_IFDIR | S_IRUGO | S_IXUGO, + .mode = S_IFDIR | 0555, }, }; @@ -1310,7 +1310,7 @@ static const struct rpc_filelist gssd_dummy_info_file[] = { [0] = { .name = "info", .i_fop = &rpc_dummy_info_operations, - .mode = S_IFREG | S_IRUSR, + .mode = S_IFREG | 0400, }, }; @@ -1397,7 +1397,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) sb->s_d_op = &simple_dentry_operations; sb->s_time_gran = 1; - inode = rpc_get_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO); + inode = rpc_get_inode(sb, S_IFDIR | 0555); sb->s_root = root = d_make_root(inode); if (!root) return -ENOMEM; diff --git a/net/wireless/wext-proc.c b/net/wireless/wext-proc.c index 5511f989ef47..b4c464594a5e 100644 --- a/net/wireless/wext-proc.c +++ b/net/wireless/wext-proc.c @@ -142,7 +142,7 @@ static const struct file_operations wireless_seq_fops = { int __net_init wext_proc_init(struct net *net) { /* Create /proc/net/wireless entry */ - if (!proc_create("wireless", S_IRUGO, net->proc_net, + if (!proc_create("wireless", 0444, net->proc_net, &wireless_seq_fops)) return -ENOMEM; diff --git a/net/x25/x25_proc.c b/net/x25/x25_proc.c index 0917f047f2cf..64b415e93f6a 100644 --- a/net/x25/x25_proc.c +++ b/net/x25/x25_proc.c @@ -212,16 +212,16 @@ int __init x25_proc_init(void) if (!proc_mkdir("x25", init_net.proc_net)) return -ENOMEM; - if (!proc_create("x25/route", S_IRUGO, init_net.proc_net, - &x25_seq_route_fops)) + if (!proc_create("x25/route", 0444, init_net.proc_net, + &x25_seq_route_fops)) goto out; - if (!proc_create("x25/socket", S_IRUGO, init_net.proc_net, - &x25_seq_socket_fops)) + if (!proc_create("x25/socket", 0444, init_net.proc_net, + &x25_seq_socket_fops)) goto out; - if (!proc_create("x25/forward", S_IRUGO, init_net.proc_net, - &x25_seq_forward_fops)) + if (!proc_create("x25/forward", 0444, init_net.proc_net, + &x25_seq_forward_fops)) goto out; return 0; diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index 6d5f85f4e672..ed06903cd84d 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -79,7 +79,7 @@ static const struct file_operations xfrm_statistics_seq_fops = { int __net_init xfrm_proc_init(struct net *net) { - if (!proc_create("xfrm_stat", S_IRUGO, net->proc_net, + if (!proc_create("xfrm_stat", 0444, net->proc_net, &xfrm_statistics_seq_fops)) return -ENOMEM; return 0; -- cgit v1.2.3-70-g09d2 From 855aeba34047f53e3665b0c3bcac80fe87ee2f7b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Mar 2018 12:28:47 +0300 Subject: net: Convert rpcsec_gss_net_ops These pernet_operations initialize and destroy sunrpc_net_id refered per-net items. Only used global list is cache_list, and accesses already serialized. sunrpc_destroy_cache_detail() check for list_empty() without cache_list_lock, but when it's called from unregister_pernet_subsys(), there can't be callers in parallel, so we won't miss list_empty() in this case. Signed-off-by: Kirill Tkhai Acked-by: Anna Schumaker Signed-off-by: David S. Miller --- net/sunrpc/auth_gss/auth_gss.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 9463af4b32e8..44f939cb6bc8 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -2063,6 +2063,7 @@ static __net_exit void rpcsec_gss_exit_net(struct net *net) static struct pernet_operations rpcsec_gss_net_ops = { .init = rpcsec_gss_init_net, .exit = rpcsec_gss_exit_net, + .async = true, }; /* -- cgit v1.2.3-70-g09d2 From 5e804a6077dccf154047a1ffe5b5232dce579659 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 26 Mar 2018 12:28:55 +0300 Subject: net: Convert sunrpc_net_ops These pernet_operations look similar to rpcsec_gss_net_ops, they just create and destroy another caches. So, they also can be async. Signed-off-by: Kirill Tkhai Acked-by: Anna Schumaker Signed-off-by: David S. Miller --- net/sunrpc/sunrpc_syms.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 56f9eff74150..68287e921847 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -79,6 +79,7 @@ static struct pernet_operations sunrpc_net_ops = { .exit = sunrpc_exit_net, .id = &sunrpc_net_id, .size = sizeof(struct sunrpc_net), + .async = true, }; static int __init -- cgit v1.2.3-70-g09d2 From bc67a0daf8f3bc6fa8fcb68090f3c444de7f951c Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 26 Mar 2018 15:01:31 +0300 Subject: ipmr: Make vif fib notifiers common The fib-notifiers are tightly coupled with the vif_device which is already common. Move the notifier struct definition and helpers to the common file; Currently they're only used by ipmr. Signed-off-by: Yuval Mintz Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/mroute.h | 8 ------- include/linux/mroute_base.h | 53 +++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ipmr.c | 31 +++++--------------------- 3 files changed, 58 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 7ed82e4f11b3..3f70a04a5879 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -55,14 +55,6 @@ static inline bool ipmr_rule_default(const struct fib_rule *rule) } #endif -struct vif_entry_notifier_info { - struct fib_notifier_info info; - struct net_device *dev; - vifi_t vif_index; - unsigned short vif_flags; - u32 tb_id; -}; - #define VIFF_STATIC 0x8000 struct mfc_cache_cmp_arg { diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index c2560cb50f1d..23326f5402f3 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -6,6 +6,7 @@ #include #include #include +#include /** * struct vif_device - interface representor for multicast routing @@ -36,6 +37,58 @@ struct vif_device { __be32 local, remote; }; +struct vif_entry_notifier_info { + struct fib_notifier_info info; + struct net_device *dev; + unsigned short vif_index; + unsigned short vif_flags; + u32 tb_id; +}; + +static inline int mr_call_vif_notifier(struct notifier_block *nb, + struct net *net, + unsigned short family, + enum fib_event_type event_type, + struct vif_device *vif, + unsigned short vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = family, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static inline int mr_call_vif_notifiers(struct net *net, + unsigned short family, + enum fib_event_type event_type, + struct vif_device *vif, + unsigned short vif_index, u32 tb_id, + unsigned int *ipmr_seq) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = family, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + ASSERT_RTNL(); + (*ipmr_seq)++; + return call_fib_notifiers(net, event_type, &info.info); +} + #ifndef MAXVIFS /* This one is nasty; value is defined in uapi using different symbols for * mroute and morute6 but both map into same 32. diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index f6be5db16da2..bb1a0655f8e4 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -650,18 +650,8 @@ static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, struct vif_device *vif, vifi_t vif_index, u32 tb_id) { - struct vif_entry_notifier_info info = { - .info = { - .family = RTNL_FAMILY_IPMR, - .net = net, - }, - .dev = vif->dev, - .vif_index = vif_index, - .vif_flags = vif->flags, - .tb_id = tb_id, - }; - - return call_fib_notifier(nb, net, event_type, &info.info); + return mr_call_vif_notifier(nb, net, RTNL_FAMILY_IPMR, event_type, + vif, vif_index, tb_id); } static int call_ipmr_vif_entry_notifiers(struct net *net, @@ -669,20 +659,9 @@ static int call_ipmr_vif_entry_notifiers(struct net *net, struct vif_device *vif, vifi_t vif_index, u32 tb_id) { - struct vif_entry_notifier_info info = { - .info = { - .family = RTNL_FAMILY_IPMR, - .net = net, - }, - .dev = vif->dev, - .vif_index = vif_index, - .vif_flags = vif->flags, - .tb_id = tb_id, - }; - - ASSERT_RTNL(); - net->ipv4.ipmr_seq++; - return call_fib_notifiers(net, event_type, &info.info); + return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type, + vif, vif_index, tb_id, + &net->ipv4.ipmr_seq); } static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, -- cgit v1.2.3-70-g09d2 From 54c4cad97b8fd414909b78d4274a6797baa52b3b Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 26 Mar 2018 15:01:32 +0300 Subject: ipmr: Make MFC fib notifiers common Like vif notifications, move the notifier struct for MFC as well as its helpers into a common file; Currently they're only used by ipmr. Signed-off-by: Yuval Mintz Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 13 ++++--- include/linux/mroute.h | 6 --- include/linux/mroute_base.h | 44 ++++++++++++++++++++++ net/ipv4/ipmr.c | 26 ++----------- 4 files changed, 56 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 921bd1075edf..8d067de924cf 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -5391,7 +5391,9 @@ static int mlxsw_sp_router_fibmr_add(struct mlxsw_sp *mlxsw_sp, if (IS_ERR(vr)) return PTR_ERR(vr); - return mlxsw_sp_mr_route4_add(vr->mr4_table, men_info->mfc, replace); + return mlxsw_sp_mr_route4_add(vr->mr4_table, + (struct mfc_cache *) men_info->mfc, + replace); } static void mlxsw_sp_router_fibmr_del(struct mlxsw_sp *mlxsw_sp, @@ -5406,7 +5408,8 @@ static void mlxsw_sp_router_fibmr_del(struct mlxsw_sp *mlxsw_sp, if (WARN_ON(!vr)) return; - mlxsw_sp_mr_route4_del(vr->mr4_table, men_info->mfc); + mlxsw_sp_mr_route4_del(vr->mr4_table, + (struct mfc_cache *) men_info->mfc); mlxsw_sp_vr_put(mlxsw_sp, vr); } @@ -5682,11 +5685,11 @@ static void mlxsw_sp_router_fibmr_event_work(struct work_struct *work) replace); if (err) mlxsw_sp_router_fib_abort(mlxsw_sp); - ipmr_cache_put(fib_work->men_info.mfc); + ipmr_cache_put((struct mfc_cache *) fib_work->men_info.mfc); break; case FIB_EVENT_ENTRY_DEL: mlxsw_sp_router_fibmr_del(mlxsw_sp, &fib_work->men_info); - ipmr_cache_put(fib_work->men_info.mfc); + ipmr_cache_put((struct mfc_cache *) fib_work->men_info.mfc); break; case FIB_EVENT_VIF_ADD: err = mlxsw_sp_router_fibmr_vif_add(mlxsw_sp, @@ -5766,7 +5769,7 @@ mlxsw_sp_router_fibmr_event(struct mlxsw_sp_fib_event_work *fib_work, case FIB_EVENT_ENTRY_ADD: /* fall through */ case FIB_EVENT_ENTRY_DEL: memcpy(&fib_work->men_info, info, sizeof(fib_work->men_info)); - ipmr_cache_hold(fib_work->men_info.mfc); + ipmr_cache_hold((struct mfc_cache *) fib_work->men_info.mfc); break; case FIB_EVENT_VIF_ADD: /* fall through */ case FIB_EVENT_VIF_DEL: diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 3f70a04a5879..c855d80b51f7 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -80,12 +80,6 @@ struct mfc_cache { }; }; -struct mfc_entry_notifier_info { - struct fib_notifier_info info; - struct mfc_cache *mfc; - u32 tb_id; -}; - struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 23326f5402f3..2c594686c05e 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -152,6 +152,50 @@ struct mr_mfc { struct rcu_head rcu; }; +struct mfc_entry_notifier_info { + struct fib_notifier_info info; + struct mr_mfc *mfc; + u32 tb_id; +}; + +static inline int mr_call_mfc_notifier(struct notifier_block *nb, + struct net *net, + unsigned short family, + enum fib_event_type event_type, + struct mr_mfc *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = family, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static inline int mr_call_mfc_notifiers(struct net *net, + unsigned short family, + enum fib_event_type event_type, + struct mr_mfc *mfc, u32 tb_id, + unsigned int *ipmr_seq) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = family, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + ASSERT_RTNL(); + (*ipmr_seq)++; + return call_fib_notifiers(net, event_type, &info.info); +} + struct mr_table; /** diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index bb1a0655f8e4..470956d2d8ad 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -669,34 +669,16 @@ static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct mfc_cache *mfc, u32 tb_id) { - struct mfc_entry_notifier_info info = { - .info = { - .family = RTNL_FAMILY_IPMR, - .net = net, - }, - .mfc = mfc, - .tb_id = tb_id - }; - - return call_fib_notifier(nb, net, event_type, &info.info); + return mr_call_mfc_notifier(nb, net, RTNL_FAMILY_IPMR, + event_type, &mfc->_c, tb_id); } static int call_ipmr_mfc_entry_notifiers(struct net *net, enum fib_event_type event_type, struct mfc_cache *mfc, u32 tb_id) { - struct mfc_entry_notifier_info info = { - .info = { - .family = RTNL_FAMILY_IPMR, - .net = net, - }, - .mfc = mfc, - .tb_id = tb_id - }; - - ASSERT_RTNL(); - net->ipv4.ipmr_seq++; - return call_fib_notifiers(net, event_type, &info.info); + return mr_call_mfc_notifiers(net, RTNL_FAMILY_IPMR, event_type, + &mfc->_c, tb_id, &net->ipv4.ipmr_seq); } /** -- cgit v1.2.3-70-g09d2 From cdc9f9443b5c3a61c7cec807965054ee1fd29acf Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 26 Mar 2018 15:01:33 +0300 Subject: ipmr: Make ipmr_dump() common Since all the primitive elements used for the notification done by ipmr are now common [mr_table, mr_mfc, vif_device] we can refactor the logic for dumping them to a common file. Signed-off-by: Yuval Mintz Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 18 +++++++++++++++ net/ipv4/ipmr.c | 53 ++------------------------------------------- net/ipv4/ipmr_base.c | 42 +++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 2c594686c05e..289eb5aa7b5d 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -277,6 +277,13 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock); + +int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, + int (*rules_dump)(struct net *net, + struct notifier_block *nb), + struct mr_table *(*mr_iter)(struct net *net, + struct mr_table *mrt), + rwlock_t *mrt_lock); #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, @@ -333,6 +340,17 @@ mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, { return -EINVAL; } + +static inline int mr_dump(struct net *net, struct notifier_block *nb, + unsigned short family, + int (*rules_dump)(struct net *net, + struct notifier_block *nb), + struct mr_table *(*mr_iter)(struct net *net, + struct mr_table *mrt), + rwlock_t *mrt_lock) +{ + return -EINVAL; +} #endif static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 470956d2d8ad..24c340021aba 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -644,16 +644,6 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) } #endif -static int call_ipmr_vif_entry_notifier(struct notifier_block *nb, - struct net *net, - enum fib_event_type event_type, - struct vif_device *vif, - vifi_t vif_index, u32 tb_id) -{ - return mr_call_vif_notifier(nb, net, RTNL_FAMILY_IPMR, event_type, - vif, vif_index, tb_id); -} - static int call_ipmr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, @@ -664,15 +654,6 @@ static int call_ipmr_vif_entry_notifiers(struct net *net, &net->ipv4.ipmr_seq); } -static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb, - struct net *net, - enum fib_event_type event_type, - struct mfc_cache *mfc, u32 tb_id) -{ - return mr_call_mfc_notifier(nb, net, RTNL_FAMILY_IPMR, - event_type, &mfc->_c, tb_id); -} - static int call_ipmr_mfc_entry_notifiers(struct net *net, enum fib_event_type event_type, struct mfc_cache *mfc, u32 tb_id) @@ -2950,38 +2931,8 @@ static unsigned int ipmr_seq_read(struct net *net) static int ipmr_dump(struct net *net, struct notifier_block *nb) { - struct mr_table *mrt; - int err; - - err = ipmr_rules_dump(net, nb); - if (err) - return err; - - ipmr_for_each_table(mrt, net) { - struct vif_device *v = &mrt->vif_table[0]; - struct mr_mfc *mfc; - int vifi; - - /* Notifiy on table VIF entries */ - read_lock(&mrt_lock); - for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { - if (!v->dev) - continue; - - call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD, - v, vifi, mrt->id); - } - read_unlock(&mrt_lock); - - /* Notify on table MFC entries */ - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) - call_ipmr_mfc_entry_notifier(nb, net, - FIB_EVENT_ENTRY_ADD, - (struct mfc_cache *)mfc, - mrt->id); - } - - return 0; + return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump, + ipmr_mr_table_iter, &mrt_lock); } static const struct fib_notifier_ops ipmr_notifier_ops_template = { diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 8ba55bfda817..4fe97723b53f 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -321,3 +321,45 @@ done: return skb->len; } EXPORT_SYMBOL(mr_rtm_dumproute); + +int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, + int (*rules_dump)(struct net *net, + struct notifier_block *nb), + struct mr_table *(*mr_iter)(struct net *net, + struct mr_table *mrt), + rwlock_t *mrt_lock) +{ + struct mr_table *mrt; + int err; + + err = rules_dump(net, nb); + if (err) + return err; + + for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) { + struct vif_device *v = &mrt->vif_table[0]; + struct mr_mfc *mfc; + int vifi; + + /* Notifiy on table VIF entries */ + read_lock(mrt_lock); + for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) { + if (!v->dev) + continue; + + mr_call_vif_notifier(nb, net, family, + FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); + } + read_unlock(mrt_lock); + + /* Notify on table MFC entries */ + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) + mr_call_mfc_notifier(nb, net, family, + FIB_EVENT_ENTRY_ADD, + mfc, mrt->id); + } + + return 0; +} +EXPORT_SYMBOL(mr_dump); -- cgit v1.2.3-70-g09d2 From 088aa3eec2ce340b5d0f0f54430f5706223d5e45 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 26 Mar 2018 15:01:34 +0300 Subject: ip6mr: Support fib notifications In similar fashion to ipmr, support fib notifications for ip6mr mfc and vif related events. This would later allow drivers to react to said notifications and offload the IPv6 mroutes. Signed-off-by: Yuval Mintz Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 2 + net/ipv6/ip6mr.c | 112 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 106 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 5b51110435fc..c29f09cfc9d7 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -96,6 +96,8 @@ struct netns_ipv6 { atomic_t fib6_sernum; struct seg6_pernet_data *seg6_data; struct fib_notifier_ops *notifier_ops; + struct fib_notifier_ops *ip6mr_notifier_ops; + unsigned int ipmr_seq; /* protected by rtnl_mutex */ struct { struct hlist_head head; spinlock_t lock; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 7345bd6c4b7d..0be2f333e168 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -258,6 +258,16 @@ static void __net_exit ip6mr_rules_exit(struct net *net) fib_rules_unregister(net->ipv6.mr6_rules_ops); rtnl_unlock(); } + +static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR); +} + +static unsigned int ip6mr_rules_seq_read(struct net *net) +{ + return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR); +} #else #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) @@ -295,6 +305,16 @@ static void __net_exit ip6mr_rules_exit(struct net *net) net->ipv6.mrt6 = NULL; rtnl_unlock(); } + +static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb) +{ + return 0; +} + +static unsigned int ip6mr_rules_seq_read(struct net *net) +{ + return 0; +} #endif static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg, @@ -653,10 +673,25 @@ failure: } #endif -/* - * Delete a VIF entry - */ +static int call_ip6mr_vif_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct vif_device *vif, + mifi_t vif_index, u32 tb_id) +{ + return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type, + vif, vif_index, tb_id, + &net->ipv6.ipmr_seq); +} +static int call_ip6mr_mfc_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct mfc6_cache *mfc, u32 tb_id) +{ + return mr_call_mfc_notifiers(net, RTNL_FAMILY_IP6MR, event_type, + &mfc->_c, tb_id, &net->ipv6.ipmr_seq); +} + +/* Delete a VIF entry */ static int mif6_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { @@ -669,6 +704,11 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify, v = &mrt->vif_table[vifi]; + if (VIF_EXISTS(mrt, vifi)) + call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net), + FIB_EVENT_VIF_DEL, v, vifi, + mrt->id); + write_lock_bh(&mrt_lock); dev = v->dev; v->dev = NULL; @@ -887,6 +927,8 @@ static int mif6_add(struct net *net, struct mr_table *mrt, if (vifi + 1 > mrt->maxvif) mrt->maxvif = vifi + 1; write_unlock_bh(&mrt_lock); + call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, + v, vifi, mrt->id); return 0; } @@ -1175,6 +1217,8 @@ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc, rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params); list_del_rcu(&c->_c.list); + call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), + FIB_EVENT_ENTRY_DEL, c, mrt->id); mr6_netlink_event(mrt, c, RTM_DELROUTE); ip6mr_cache_free(c); return 0; @@ -1203,21 +1247,63 @@ static int ip6mr_device_event(struct notifier_block *this, return NOTIFY_DONE; } +static unsigned int ip6mr_seq_read(struct net *net) +{ + ASSERT_RTNL(); + + return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net); +} + +static int ip6mr_dump(struct net *net, struct notifier_block *nb) +{ + return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, + ip6mr_mr_table_iter, &mrt_lock); +} + static struct notifier_block ip6_mr_notifier = { .notifier_call = ip6mr_device_event }; -/* - * Setup for IP multicast routing - */ +static const struct fib_notifier_ops ip6mr_notifier_ops_template = { + .family = RTNL_FAMILY_IP6MR, + .fib_seq_read = ip6mr_seq_read, + .fib_dump = ip6mr_dump, + .owner = THIS_MODULE, +}; + +static int __net_init ip6mr_notifier_init(struct net *net) +{ + struct fib_notifier_ops *ops; + + net->ipv6.ipmr_seq = 0; + ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); + + net->ipv6.ip6mr_notifier_ops = ops; + + return 0; +} + +static void __net_exit ip6mr_notifier_exit(struct net *net) +{ + fib_notifier_ops_unregister(net->ipv6.ip6mr_notifier_ops); + net->ipv6.ip6mr_notifier_ops = NULL; +} + +/* Setup for IP multicast routing */ static int __net_init ip6mr_net_init(struct net *net) { int err; + err = ip6mr_notifier_init(net); + if (err) + return err; + err = ip6mr_rules_init(net); if (err < 0) - goto fail; + goto ip6mr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; @@ -1235,7 +1321,8 @@ proc_cache_fail: proc_vif_fail: ip6mr_rules_exit(net); #endif -fail: +ip6mr_rules_fail: + ip6mr_notifier_exit(net); return err; } @@ -1246,6 +1333,7 @@ static void __net_exit ip6mr_net_exit(struct net *net) remove_proc_entry("ip6_mr_vif", net->proc_net); #endif ip6mr_rules_exit(net); + ip6mr_notifier_exit(net); } static struct pernet_operations ip6mr_net_ops = { @@ -1337,6 +1425,8 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; write_unlock_bh(&mrt_lock); + call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, + c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1388,6 +1478,8 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, ip6mr_cache_resolve(net, mrt, uc, c); ip6mr_cache_free(uc); } + call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, + c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } @@ -1424,6 +1516,10 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); + call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), + FIB_EVENT_ENTRY_DEL, + (struct mfc6_cache *)c, + mrt->id); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); -- cgit v1.2.3-70-g09d2 From d3c07e5b9939a055fa017f200e535ae947eb22ab Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 26 Mar 2018 15:01:35 +0300 Subject: ip6mr: Add API for default_rule fib Add the ability to discern whether a given FIB rule notification relates to the default rule inserted when registering ip6mr or a different one. Would later be used by drivers wishing to offload ipv6 multicast routes but unable to offload rules other than the default one. Signed-off-by: Yuval Mintz Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/mroute6.h | 10 ++++++++++ net/ipv6/ip6mr.c | 7 +++++++ 2 files changed, 17 insertions(+) (limited to 'net') diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index 1ac38e6819f5..c4a45859f586 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef CONFIG_IPV6_MROUTE static inline int ip6_mroute_opt(int opt) @@ -63,6 +64,15 @@ static inline void ip6_mr_cleanup(void) } #endif +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES +bool ip6mr_rule_default(const struct fib_rule *rule); +#else +static inline bool ip6mr_rule_default(const struct fib_rule *rule) +{ + return true; +} +#endif + #define VIFF_STATIC 0x8000 struct mfc6_cache_cmp_arg { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 0be2f333e168..a187c523a95f 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -268,6 +268,13 @@ static unsigned int ip6mr_rules_seq_read(struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR); } + +bool ip6mr_rule_default(const struct fib_rule *rule) +{ + return fib_rule_matchall(rule) && rule->action == FR_ACT_TO_TBL && + rule->table == RT6_TABLE_DFLT && !rule->l3mdev; +} +EXPORT_SYMBOL(ip6mr_rule_default); #else #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) -- cgit v1.2.3-70-g09d2 From 8c13af2a219c6498071b30ea558438c74267ae4d Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 26 Mar 2018 15:01:36 +0300 Subject: ip6mr: Add refcounting to mfc Since ipmr and ip6mr are using the same mr_mfc struct at their core, we can now refactor the ipmr_cache_{hold,put} logic and apply refcounting to both ipmr and ip6mr. Signed-off-by: Yuval Mintz Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c | 6 +++--- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 6 +++--- include/linux/mroute.h | 19 ------------------- include/linux/mroute_base.h | 13 +++++++++++++ net/ipv4/ipmr.c | 8 ++++---- net/ipv6/ip6mr.c | 6 ++++-- 6 files changed, 27 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c index 978a3c70653a..51b104ae2eec 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c @@ -360,7 +360,7 @@ mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table *mr_table, /* Find min_mtu and link iVIF and eVIFs */ mr_route->min_mtu = ETH_MAX_MTU; - ipmr_cache_hold(mfc); + mr_cache_hold(&mfc->_c); mr_route->mfc4 = mfc; mr_route->mr_table = mr_table; for (i = 0; i < MAXVIFS; i++) { @@ -380,7 +380,7 @@ mlxsw_sp_mr_route4_create(struct mlxsw_sp_mr_table *mr_table, mr_route->route_action = mlxsw_sp_mr_route_action(mr_route); return mr_route; err: - ipmr_cache_put(mfc); + mr_cache_put(&mfc->_c); list_for_each_entry_safe(rve, tmp, &mr_route->evif_list, route_node) mlxsw_sp_mr_route_evif_unlink(rve); kfree(mr_route); @@ -393,7 +393,7 @@ static void mlxsw_sp_mr_route4_destroy(struct mlxsw_sp_mr_table *mr_table, struct mlxsw_sp_mr_route_vif_entry *rve, *tmp; mlxsw_sp_mr_route_ivif_unlink(mr_route); - ipmr_cache_put(mr_route->mfc4); + mr_cache_put((struct mr_mfc *)mr_route->mfc4); list_for_each_entry_safe(rve, tmp, &mr_route->evif_list, route_node) mlxsw_sp_mr_route_evif_unlink(rve); kfree(mr_route); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 8d067de924cf..be241c708bb9 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -5685,11 +5685,11 @@ static void mlxsw_sp_router_fibmr_event_work(struct work_struct *work) replace); if (err) mlxsw_sp_router_fib_abort(mlxsw_sp); - ipmr_cache_put((struct mfc_cache *) fib_work->men_info.mfc); + mr_cache_put(fib_work->men_info.mfc); break; case FIB_EVENT_ENTRY_DEL: mlxsw_sp_router_fibmr_del(mlxsw_sp, &fib_work->men_info); - ipmr_cache_put((struct mfc_cache *) fib_work->men_info.mfc); + mr_cache_put(fib_work->men_info.mfc); break; case FIB_EVENT_VIF_ADD: err = mlxsw_sp_router_fibmr_vif_add(mlxsw_sp, @@ -5769,7 +5769,7 @@ mlxsw_sp_router_fibmr_event(struct mlxsw_sp_fib_event_work *fib_work, case FIB_EVENT_ENTRY_ADD: /* fall through */ case FIB_EVENT_ENTRY_DEL: memcpy(&fib_work->men_info, info, sizeof(fib_work->men_info)); - ipmr_cache_hold((struct mfc_cache *) fib_work->men_info.mfc); + mr_cache_hold(fib_work->men_info.mfc); break; case FIB_EVENT_VIF_ADD: /* fall through */ case FIB_EVENT_VIF_DEL: diff --git a/include/linux/mroute.h b/include/linux/mroute.h index c855d80b51f7..9a36fad9e068 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -84,23 +84,4 @@ struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid); - -#ifdef CONFIG_IP_MROUTE -void ipmr_cache_free(struct mfc_cache *mfc_cache); -#else -static inline void ipmr_cache_free(struct mfc_cache *mfc_cache) -{ -} -#endif - -static inline void ipmr_cache_put(struct mfc_cache *c) -{ - if (refcount_dec_and_test(&c->_c.mfc_un.res.refcount)) - ipmr_cache_free(c); -} -static inline void ipmr_cache_hold(struct mfc_cache *c) -{ - refcount_inc(&c->_c.mfc_un.res.refcount); -} - #endif diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 289eb5aa7b5d..d617fe45543e 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -125,6 +125,7 @@ enum { * @refcount: reference count for this entry * @list: global entry list * @rcu: used for entry destruction + * @free: Operation used for freeing an entry under RCU */ struct mr_mfc { struct rhlist_head mnode; @@ -150,8 +151,20 @@ struct mr_mfc { } mfc_un; struct list_head list; struct rcu_head rcu; + void (*free)(struct rcu_head *head); }; +static inline void mr_cache_put(struct mr_mfc *c) +{ + if (refcount_dec_and_test(&c->mfc_un.res.refcount)) + call_rcu(&c->rcu, c->free); +} + +static inline void mr_cache_hold(struct mr_mfc *c) +{ + refcount_inc(&c->mfc_un.res.refcount); +} + struct mfc_entry_notifier_info { struct fib_notifier_info info; struct mr_mfc *mfc; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 24c340021aba..e79211a8537c 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -732,11 +732,10 @@ static void ipmr_cache_free_rcu(struct rcu_head *head) kmem_cache_free(mrt_cachep, (struct mfc_cache *)c); } -void ipmr_cache_free(struct mfc_cache *c) +static void ipmr_cache_free(struct mfc_cache *c) { call_rcu(&c->_c.rcu, ipmr_cache_free_rcu); } -EXPORT_SYMBOL(ipmr_cache_free); /* Destroy an unresolved cache entry, killing queued skbs * and reporting error to netlink readers. @@ -987,6 +986,7 @@ static struct mfc_cache *ipmr_cache_alloc(void) if (c) { c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->_c.mfc_un.res.minvif = MAXVIFS; + c->_c.free = ipmr_cache_free_rcu; refcount_set(&c->_c.mfc_un.res.refcount, 1); } return c; @@ -1206,7 +1206,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) list_del_rcu(&c->_c.list); call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id); mroute_netlink_event(mrt, c, RTM_DELROUTE); - ipmr_cache_put(c); + mr_cache_put(&c->_c); return 0; } @@ -1318,7 +1318,7 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache, mrt->id); mroute_netlink_event(mrt, cache, RTM_DELROUTE); - ipmr_cache_put(cache); + mr_cache_put(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index a187c523a95f..1c8fa29d155a 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -989,6 +989,8 @@ static struct mfc6_cache *ip6mr_cache_alloc(void) return NULL; c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->_c.mfc_un.res.minvif = MAXMIFS; + c->_c.free = ip6mr_cache_free_rcu; + refcount_set(&c->_c.mfc_un.res.refcount, 1); return c; } @@ -1227,7 +1229,7 @@ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc, call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, c, mrt->id); mr6_netlink_event(mrt, c, RTM_DELROUTE); - ip6mr_cache_free(c); + mr_cache_put(&c->_c); return 0; } @@ -1516,7 +1518,7 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); list_del_rcu(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); - ip6mr_cache_free((struct mfc6_cache *)c); + mr_cache_put(c); } if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { -- cgit v1.2.3-70-g09d2 From e1577c1c881b09e9f15a743a4a1907815b74d0f7 Mon Sep 17 00:00:00 2001 From: Inbar Karmy Date: Mon, 20 Nov 2017 16:14:30 +0200 Subject: ethtool: Add support for configuring PFC stall prevention in ethtool In the event where the device unexpectedly becomes unresponsive for a long period of time, flow control mechanism may propagate pause frames which will cause congestion spreading to the entire network. To prevent this scenario, when the device is stalled for a period longer than a pre-configured timeout, flow control mechanisms are automatically disabled. This patch adds support for the ETHTOOL_PFC_STALL_PREVENTION as a tunable. This API provides support for configuring flow control storm prevention timeout (msec). Signed-off-by: Inbar Karmy Cc: Michal Kubecek Cc: Andrew Lunn Signed-off-by: Saeed Mahameed --- include/uapi/linux/ethtool.h | 4 ++++ net/core/ethtool.c | 6 ++++++ 2 files changed, 10 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 20da156aaf64..4ca65b56084f 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -217,10 +217,14 @@ struct ethtool_value { __u32 data; }; +#define PFC_STORM_PREVENTION_AUTO 0xffff +#define PFC_STORM_PREVENTION_DISABLE 0 + enum tunable_id { ETHTOOL_ID_UNSPEC, ETHTOOL_RX_COPYBREAK, ETHTOOL_TX_COPYBREAK, + ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */ /* * Add your fresh new tubale attribute above and remember to update * tunable_strings[] in net/core/ethtool.c diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 157cd9efa4be..bb6e498c6e3d 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -121,6 +121,7 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { [ETHTOOL_ID_UNSPEC] = "Unspec", [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", + [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout", }; static const char @@ -2311,6 +2312,11 @@ static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) tuna->type_id != ETHTOOL_TUNABLE_U32) return -EINVAL; break; + case ETHTOOL_PFC_PREVENTION_TOUT: + if (tuna->len != sizeof(u16) || + tuna->type_id != ETHTOOL_TUNABLE_U16) + return -EINVAL; + break; default: return -EINVAL; } -- cgit v1.2.3-70-g09d2 From 5306653850b444452937834adc5a5ac63bae275e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 26 Mar 2018 16:55:00 +0800 Subject: sctp: remove unnecessary asoc in sctp_has_association After Commit dae399d7fdee ("sctp: hold transport instead of assoc when lookup assoc in rx path"), it put transport instead of asoc in sctp_has_association. Variable 'asoc' is not used any more. So this patch is to remove it, while at it, it also changes the return type of sctp_has_association to bool, and does the same for it's caller sctp_endpoint_is_peeled_off. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 8 ++++---- net/sctp/endpointola.c | 8 ++++---- net/sctp/input.c | 13 ++++++------- 3 files changed, 14 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 012fb3e2f4cf..c63249ea34c3 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1341,12 +1341,12 @@ struct sctp_association *sctp_endpoint_lookup_assoc( const struct sctp_endpoint *ep, const union sctp_addr *paddr, struct sctp_transport **); -int sctp_endpoint_is_peeled_off(struct sctp_endpoint *, - const union sctp_addr *); +bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, + const union sctp_addr *paddr); struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *, struct net *, const union sctp_addr *); -int sctp_has_association(struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr); +bool sctp_has_association(struct net *net, const union sctp_addr *laddr, + const union sctp_addr *paddr); int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 8b3146816519..e2f5a3ee41a7 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -349,8 +349,8 @@ out: /* Look for any peeled off association from the endpoint that matches the * given peer address. */ -int sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, - const union sctp_addr *paddr) +bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, + const union sctp_addr *paddr) { struct sctp_sockaddr_entry *addr; struct sctp_bind_addr *bp; @@ -362,10 +362,10 @@ int sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, */ list_for_each_entry(addr, &bp->address_list, list) { if (sctp_has_association(net, &addr->a, paddr)) - return 1; + return true; } - return 0; + return false; } /* Do delayed input processing. This is scheduled by sctp_rcv(). diff --git a/net/sctp/input.c b/net/sctp/input.c index b381d78548ac..ba8a6e6c36fa 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -1010,19 +1010,18 @@ struct sctp_association *sctp_lookup_association(struct net *net, } /* Is there an association matching the given local and peer addresses? */ -int sctp_has_association(struct net *net, - const union sctp_addr *laddr, - const union sctp_addr *paddr) +bool sctp_has_association(struct net *net, + const union sctp_addr *laddr, + const union sctp_addr *paddr) { - struct sctp_association *asoc; struct sctp_transport *transport; - if ((asoc = sctp_lookup_association(net, laddr, paddr, &transport))) { + if (sctp_lookup_association(net, laddr, paddr, &transport)) { sctp_transport_put(transport); - return 1; + return true; } - return 0; + return false; } /* -- cgit v1.2.3-70-g09d2 From 8daf1a2d7e40685054ebae680733d822ced6df62 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 26 Mar 2018 12:27:12 +0100 Subject: net/ncsi: check for null return from call to nla_nest_start The call to nla_nest_start calls nla_put which can lead to a NULL return so it's possible for attr to become NULL and we can potentially get a NULL pointer dereference on attr. Fix this by checking for a NULL return. Detected by CoverityScan, CID#1466125 ("Dereference null return") Fixes: 955dc68cb9b2 ("net/ncsi: Add generic netlink family") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/ncsi/ncsi-netlink.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index 05fcfb4fbe1d..8d7e849d4825 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -190,6 +190,10 @@ static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info) package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); attr = nla_nest_start(skb, NCSI_ATTR_PACKAGE_LIST); + if (!attr) { + kfree_skb(skb); + return -EMSGSIZE; + } rc = ncsi_write_package_info(skb, ndp, package_id); if (rc) { -- cgit v1.2.3-70-g09d2 From c76f2481b60a62814f4cb1678e48efa3385895aa Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 26 Mar 2018 14:32:44 +0000 Subject: tipc: fix error handling in tipc_udp_enable() Release alloced resource before return from the error handling case in tipc_udp_enable(), otherwise will cause memory leak. Fixes: 52dfae5c85a4 ("tipc: obtain node identity from interface by default") Signed-off-by: Wei Yongjun Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/udp_media.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 2c13b18426d9..e7d91f5d5cae 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -687,7 +687,8 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, } if (!tipc_own_id(net)) { pr_warn("Failed to set node id, please configure manually\n"); - return -EINVAL; + err = -EINVAL; + goto err; } b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP; -- cgit v1.2.3-70-g09d2 From e1a22d13eb1f302afd692583777e27828d375a39 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 26 Mar 2018 14:33:13 +0000 Subject: tipc: tipc_node_create() can be static Fixes the following sparse warning: net/tipc/node.c:336:18: warning: symbol 'tipc_node_create' was not declared. Should it be static? Signed-off-by: Wei Yongjun Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/node.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 4a95c8c155c6..4fb4327311bb 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -333,8 +333,8 @@ static void tipc_node_write_unlock(struct tipc_node *n) } } -struct tipc_node *tipc_node_create(struct net *net, u32 addr, - u8 *peer_id, u16 capabilities) +static struct tipc_node *tipc_node_create(struct net *net, u32 addr, + u8 *peer_id, u16 capabilities) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n, *temp_node; -- cgit v1.2.3-70-g09d2 From e32ac25018558f7bcab387708187ab5aa2733cf8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 26 Mar 2018 08:35:01 -0700 Subject: ipv6: addrconf: Use normal debugging style Remove local ADBG macro and use netdev_dbg/pr_debug Miscellanea: o Remove unnecessary debug message after allocation failure as there already is a dump_stack() on the failure paths o Leave the allocation failure message on snmp6_alloc_dev as there is one code path that does not do a dump_stack() Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9ca90cd9fba4..189eac80f4ef 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -94,15 +94,6 @@ #include #include -/* Set to 3 to get tracing... */ -#define ACONF_DEBUG 2 - -#if ACONF_DEBUG >= 3 -#define ADBG(fmt, ...) printk(fmt, ##__VA_ARGS__) -#else -#define ADBG(fmt, ...) do { if (0) printk(fmt, ##__VA_ARGS__); } while (0) -#endif - #define INFINITY_LIFE_TIME 0xFFFFFFFF #define IPV6_MAX_STRLEN \ @@ -409,9 +400,8 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) dev_hold(dev); if (snmp6_alloc_dev(ndev) < 0) { - ADBG(KERN_WARNING - "%s: cannot allocate memory for statistics; dev=%s.\n", - __func__, dev->name); + netdev_dbg(dev, "%s: cannot allocate memory for statistics\n", + __func__); neigh_parms_release(&nd_tbl, ndev->nd_parms); dev_put(dev); kfree(ndev); @@ -419,9 +409,8 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) } if (snmp6_register_dev(ndev) < 0) { - ADBG(KERN_WARNING - "%s: cannot create /proc/net/dev_snmp6/%s\n", - __func__, dev->name); + netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n", + __func__, dev->name); goto err_release; } @@ -984,7 +973,7 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) /* Ignore adding duplicate addresses on an interface */ if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev, hash)) { - ADBG("ipv6_add_addr: already assigned\n"); + netdev_dbg(dev, "ipv6_add_addr: already assigned\n"); err = -EEXIST; } else { hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); @@ -1044,7 +1033,6 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, ifa = kzalloc(sizeof(*ifa), gfp_flags); if (!ifa) { - ADBG("ipv6_add_addr: malloc failed\n"); err = -ENOBUFS; goto out; } @@ -2618,7 +2606,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) pinfo = (struct prefix_info *) opt; if (len < sizeof(struct prefix_info)) { - ADBG("addrconf: prefix option too short\n"); + netdev_dbg(dev, "addrconf: prefix option too short\n"); return; } @@ -4446,8 +4434,8 @@ restart: if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX; - ADBG(KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n", - now, next, next_sec, next_sched); + pr_debug("now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n", + now, next, next_sec, next_sched); mod_delayed_work(addrconf_wq, &addr_chk_work, next_sched - now); rcu_read_unlock_bh(); } -- cgit v1.2.3-70-g09d2 From 094374e5e173c6639eccf6a2af5e1357a0869848 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 27 Mar 2018 18:02:01 +0300 Subject: net: Reflect all pernet_operations are converted All pernet_operations are reviewed and converted, hooray! Reflect this in core code: setup_net() and cleanup_net() will take down_read() always. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/core/net_namespace.c | 43 ++++++------------------------------------- 1 file changed, 6 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 95ba2c53bd9a..0f614523a13f 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -40,9 +40,8 @@ struct net init_net = { EXPORT_SYMBOL(init_net); static bool init_net_initialized; -static unsigned nr_sync_pernet_ops; /* - * net_sem: protects: pernet_list, net_generic_ids, nr_sync_pernet_ops, + * net_sem: protects: pernet_list, net_generic_ids, * init_net_initialized and first_device pointer. */ DECLARE_RWSEM(net_sem); @@ -406,7 +405,6 @@ struct net *copy_net_ns(unsigned long flags, { struct ucounts *ucounts; struct net *net; - unsigned write; int rv; if (!(flags & CLONE_NEWNET)) @@ -424,25 +422,14 @@ struct net *copy_net_ns(unsigned long flags, refcount_set(&net->passive, 1); net->ucounts = ucounts; get_user_ns(user_ns); -again: - write = READ_ONCE(nr_sync_pernet_ops); - if (write) - rv = down_write_killable(&net_sem); - else - rv = down_read_killable(&net_sem); + + rv = down_read_killable(&net_sem); if (rv < 0) goto put_userns; - if (!write && unlikely(READ_ONCE(nr_sync_pernet_ops))) { - up_read(&net_sem); - goto again; - } rv = setup_net(net, user_ns); - if (write) - up_write(&net_sem); - else - up_read(&net_sem); + up_read(&net_sem); if (rv < 0) { put_userns: @@ -490,21 +477,11 @@ static void cleanup_net(struct work_struct *work) struct net *net, *tmp, *last; struct llist_node *net_kill_list; LIST_HEAD(net_exit_list); - unsigned write; /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); -again: - write = READ_ONCE(nr_sync_pernet_ops); - if (write) - down_write(&net_sem); - else - down_read(&net_sem); - if (!write && unlikely(READ_ONCE(nr_sync_pernet_ops))) { - up_read(&net_sem); - goto again; - } + down_read(&net_sem); /* Don't let anyone else find us. */ rtnl_lock(); @@ -543,10 +520,7 @@ again: list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list); - if (write) - up_write(&net_sem); - else - up_read(&net_sem); + up_read(&net_sem); /* Ensure there are no outstanding rcu callbacks using this * network namespace. @@ -1006,9 +980,6 @@ again: rcu_barrier(); if (ops->id) ida_remove(&net_generic_ids, *ops->id); - } else if (!ops->async) { - pr_info_once("Pernet operations %ps are sync.\n", ops); - nr_sync_pernet_ops++; } return error; @@ -1016,8 +987,6 @@ again: static void unregister_pernet_operations(struct pernet_operations *ops) { - if (!ops->async) - BUG_ON(nr_sync_pernet_ops-- == 0); __unregister_pernet_operations(ops); rcu_barrier(); if (ops->id) -- cgit v1.2.3-70-g09d2 From 2f635ceeb22ba13c307236d69795fbb29cfa3e7c Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 27 Mar 2018 18:02:13 +0300 Subject: net: Drop pernet_operations::async Synchronous pernet_operations are not allowed anymore. All are asynchronous. So, drop the structure member. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- drivers/infiniband/core/cma.c | 1 - drivers/net/bonding/bond_main.c | 1 - drivers/net/geneve.c | 1 - drivers/net/gtp.c | 1 - drivers/net/ipvlan/ipvlan_main.c | 1 - drivers/net/loopback.c | 1 - drivers/net/ppp/ppp_generic.c | 1 - drivers/net/ppp/pppoe.c | 1 - drivers/net/vrf.c | 1 - drivers/net/vxlan.c | 1 - drivers/net/wireless/mac80211_hwsim.c | 1 - fs/lockd/svc.c | 1 - fs/nfs/blocklayout/rpc_pipefs.c | 1 - fs/nfs/dns_resolve.c | 1 - fs/nfs/inode.c | 1 - fs/nfs_common/grace.c | 1 - fs/nfsd/nfsctl.c | 1 - fs/proc/proc_net.c | 1 - include/net/net_namespace.h | 6 ------ kernel/audit.c | 1 - lib/kobject_uevent.c | 1 - net/8021q/vlan.c | 1 - net/bridge/br.c | 1 - net/bridge/br_netfilter_hooks.c | 1 - net/bridge/netfilter/ebtable_broute.c | 1 - net/bridge/netfilter/ebtable_filter.c | 1 - net/bridge/netfilter/ebtable_nat.c | 1 - net/bridge/netfilter/nf_log_bridge.c | 1 - net/caif/caif_dev.c | 1 - net/can/af_can.c | 1 - net/can/bcm.c | 1 - net/can/gw.c | 1 - net/core/dev.c | 2 -- net/core/fib_notifier.c | 1 - net/core/fib_rules.c | 1 - net/core/net-procfs.c | 2 -- net/core/net_namespace.c | 2 -- net/core/pktgen.c | 1 - net/core/rtnetlink.c | 1 - net/core/sock.c | 2 -- net/core/sock_diag.c | 1 - net/core/sysctl_net_core.c | 1 - net/dccp/ipv4.c | 1 - net/dccp/ipv6.c | 1 - net/ieee802154/6lowpan/reassembly.c | 1 - net/ieee802154/core.c | 1 - net/ipv4/af_inet.c | 2 -- net/ipv4/arp.c | 1 - net/ipv4/devinet.c | 1 - net/ipv4/fib_frontend.c | 1 - net/ipv4/fou.c | 1 - net/ipv4/icmp.c | 1 - net/ipv4/igmp.c | 1 - net/ipv4/ip_fragment.c | 1 - net/ipv4/ip_gre.c | 3 --- net/ipv4/ip_vti.c | 1 - net/ipv4/ipip.c | 1 - net/ipv4/ipmr.c | 1 - net/ipv4/netfilter/arp_tables.c | 1 - net/ipv4/netfilter/arptable_filter.c | 1 - net/ipv4/netfilter/ip_tables.c | 1 - net/ipv4/netfilter/ipt_CLUSTERIP.c | 1 - net/ipv4/netfilter/iptable_filter.c | 1 - net/ipv4/netfilter/iptable_mangle.c | 1 - net/ipv4/netfilter/iptable_nat.c | 1 - net/ipv4/netfilter/iptable_raw.c | 1 - net/ipv4/netfilter/iptable_security.c | 1 - net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 1 - net/ipv4/netfilter/nf_defrag_ipv4.c | 1 - net/ipv4/netfilter/nf_log_arp.c | 1 - net/ipv4/netfilter/nf_log_ipv4.c | 1 - net/ipv4/ping.c | 1 - net/ipv4/proc.c | 1 - net/ipv4/raw.c | 1 - net/ipv4/route.c | 4 ---- net/ipv4/sysctl_net_ipv4.c | 1 - net/ipv4/tcp_ipv4.c | 2 -- net/ipv4/tcp_metrics.c | 1 - net/ipv4/udp.c | 2 -- net/ipv4/udplite.c | 1 - net/ipv4/xfrm4_policy.c | 1 - net/ipv6/addrconf.c | 2 -- net/ipv6/addrlabel.c | 1 - net/ipv6/af_inet6.c | 1 - net/ipv6/fib6_rules.c | 1 - net/ipv6/icmp.c | 1 - net/ipv6/ila/ila_xlat.c | 1 - net/ipv6/ip6_fib.c | 1 - net/ipv6/ip6_flowlabel.c | 1 - net/ipv6/ip6_gre.c | 1 - net/ipv6/ip6_tunnel.c | 1 - net/ipv6/ip6_vti.c | 1 - net/ipv6/ip6mr.c | 1 - net/ipv6/mcast.c | 1 - net/ipv6/ndisc.c | 1 - net/ipv6/netfilter/ip6_tables.c | 1 - net/ipv6/netfilter/ip6table_filter.c | 1 - net/ipv6/netfilter/ip6table_mangle.c | 1 - net/ipv6/netfilter/ip6table_nat.c | 1 - net/ipv6/netfilter/ip6table_raw.c | 1 - net/ipv6/netfilter/ip6table_security.c | 1 - net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 1 - net/ipv6/netfilter/nf_conntrack_reasm.c | 1 - net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 1 - net/ipv6/netfilter/nf_log_ipv6.c | 1 - net/ipv6/ping.c | 1 - net/ipv6/proc.c | 1 - net/ipv6/raw.c | 1 - net/ipv6/reassembly.c | 1 - net/ipv6/route.c | 3 --- net/ipv6/seg6.c | 1 - net/ipv6/sit.c | 1 - net/ipv6/sysctl_net_ipv6.c | 1 - net/ipv6/tcp_ipv6.c | 1 - net/ipv6/udplite.c | 1 - net/ipv6/xfrm6_policy.c | 1 - net/ipv6/xfrm6_tunnel.c | 1 - net/kcm/kcmproc.c | 1 - net/kcm/kcmsock.c | 1 - net/key/af_key.c | 1 - net/l2tp/l2tp_core.c | 1 - net/l2tp/l2tp_ppp.c | 1 - net/mpls/af_mpls.c | 1 - net/netfilter/core.c | 1 - net/netfilter/ipset/ip_set_core.c | 1 - net/netfilter/ipvs/ip_vs_core.c | 2 -- net/netfilter/ipvs/ip_vs_ftp.c | 1 - net/netfilter/ipvs/ip_vs_lblc.c | 1 - net/netfilter/ipvs/ip_vs_lblcr.c | 1 - net/netfilter/nf_conntrack_netlink.c | 1 - net/netfilter/nf_conntrack_proto_gre.c | 1 - net/netfilter/nf_conntrack_standalone.c | 1 - net/netfilter/nf_log.c | 1 - net/netfilter/nf_log_netdev.c | 1 - net/netfilter/nf_synproxy_core.c | 1 - net/netfilter/nf_tables_api.c | 1 - net/netfilter/nfnetlink.c | 1 - net/netfilter/nfnetlink_acct.c | 1 - net/netfilter/nfnetlink_cttimeout.c | 1 - net/netfilter/nfnetlink_log.c | 1 - net/netfilter/nfnetlink_queue.c | 1 - net/netfilter/x_tables.c | 1 - net/netfilter/xt_hashlimit.c | 1 - net/netfilter/xt_recent.c | 1 - net/netlink/af_netlink.c | 2 -- net/netlink/genetlink.c | 1 - net/openvswitch/datapath.c | 1 - net/packet/af_packet.c | 1 - net/phonet/pn_dev.c | 1 - net/rds/tcp.c | 1 - net/rxrpc/net_ns.c | 1 - net/sched/act_api.c | 1 - net/sched/act_bpf.c | 1 - net/sched/act_connmark.c | 1 - net/sched/act_csum.c | 1 - net/sched/act_gact.c | 1 - net/sched/act_ife.c | 1 - net/sched/act_ipt.c | 2 -- net/sched/act_mirred.c | 1 - net/sched/act_nat.c | 1 - net/sched/act_pedit.c | 1 - net/sched/act_police.c | 1 - net/sched/act_sample.c | 1 - net/sched/act_simple.c | 1 - net/sched/act_skbedit.c | 1 - net/sched/act_skbmod.c | 1 - net/sched/act_tunnel_key.c | 1 - net/sched/act_vlan.c | 1 - net/sched/cls_api.c | 1 - net/sched/sch_api.c | 1 - net/sctp/protocol.c | 2 -- net/sunrpc/auth_gss/auth_gss.c | 1 - net/sunrpc/sunrpc_syms.c | 1 - net/sysctl_net.c | 1 - net/tipc/core.c | 1 - net/unix/af_unix.c | 1 - net/wireless/core.c | 1 - net/wireless/wext-core.c | 1 - net/xfrm/xfrm_policy.c | 1 - net/xfrm/xfrm_user.c | 1 - security/selinux/hooks.c | 1 - security/smack/smack_netfilter.c | 1 - 182 files changed, 206 deletions(-) (limited to 'net') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 66f203730e80..6ab1059fed66 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4554,7 +4554,6 @@ static struct pernet_operations cma_pernet_operations = { .exit = cma_exit_net, .id = &cma_pernet_id, .size = sizeof(struct cma_pernet), - .async = true, }; static int __init cma_init(void) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 4c19d23dd282..c669554d70bb 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4791,7 +4791,6 @@ static struct pernet_operations bond_net_ops = { .exit = bond_net_exit, .id = &bond_net_id, .size = sizeof(struct bond_net), - .async = true, }; static int __init bonding_init(void) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 516dd59249d7..b919e89a9b93 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1694,7 +1694,6 @@ static struct pernet_operations geneve_net_ops = { .exit_batch = geneve_exit_batch_net, .id = &geneve_net_id, .size = sizeof(struct geneve_net), - .async = true, }; static int __init geneve_init_module(void) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 127edd23018f..f38e32a7ec9c 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1325,7 +1325,6 @@ static struct pernet_operations gtp_net_ops = { .exit = gtp_net_exit, .id = >p_net_id, .size = sizeof(struct gtp_net), - .async = true, }; static int __init gtp_init(void) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 743d37fb034a..450eec264a5e 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -1040,7 +1040,6 @@ static struct pernet_operations ipvlan_net_ops = { .id = &ipvlan_netid, .size = sizeof(struct ipvlan_netns), .exit = ipvlan_ns_exit, - .async = true, }; static int __init ipvlan_init_module(void) diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index b97a907ea5aa..30612497643c 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -230,5 +230,4 @@ out: /* Registered in net/core/dev.c */ struct pernet_operations __net_initdata loopback_net_ops = { .init = loopback_net_init, - .async = true, }; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 22fcff3c7a9a..dc7c7ec43202 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -970,7 +970,6 @@ static struct pernet_operations ppp_net_ops = { .exit = ppp_exit_net, .id = &ppp_net_id, .size = sizeof(struct ppp_net), - .async = true, }; static int ppp_unit_register(struct ppp *ppp, int unit, bool ifname_is_set) diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index f9552a400271..1483bc7b01e1 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -1161,7 +1161,6 @@ static struct pernet_operations pppoe_net_ops = { .exit = pppoe_exit_net, .id = &pppoe_net_id, .size = sizeof(struct pppoe_net), - .async = true, }; static int __init pppoe_init(void) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index c6be49d3a9eb..102582459bef 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1435,7 +1435,6 @@ static struct pernet_operations vrf_net_ops __net_initdata = { .init = vrf_netns_init, .id = &vrf_net_id, .size = sizeof(bool), - .async = true, }; static int __init vrf_init_module(void) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index aa5f034d6ad1..fab7a4db249e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -3752,7 +3752,6 @@ static struct pernet_operations vxlan_net_ops = { .exit_batch = vxlan_exit_batch_net, .id = &vxlan_net_id, .size = sizeof(struct vxlan_net), - .async = true, }; static int __init vxlan_init_module(void) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 100cf42db65d..a37f4b1d9d30 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3542,7 +3542,6 @@ static struct pernet_operations hwsim_net_ops = { .exit = hwsim_exit_net, .id = &hwsim_net_id, .size = sizeof(struct hwsim_net), - .async = true, }; static void hwsim_exit_netlink(void) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2dee4e03ff1c..9c36d614bf89 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -709,7 +709,6 @@ static struct pernet_operations lockd_net_ops = { .exit = lockd_exit_net, .id = &lockd_net_id, .size = sizeof(struct lockd_net), - .async = true, }; diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index ef9fa111b009..9fb067a6f7e0 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -261,7 +261,6 @@ static void nfs4blocklayout_net_exit(struct net *net) static struct pernet_operations nfs4blocklayout_net_ops = { .init = nfs4blocklayout_net_init, .exit = nfs4blocklayout_net_exit, - .async = true, }; int __init bl_init_pipefs(void) diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index e90bd69ab653..060c658eab66 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -410,7 +410,6 @@ static void nfs4_dns_net_exit(struct net *net) static struct pernet_operations nfs4_dns_resolver_ops = { .init = nfs4_dns_net_init, .exit = nfs4_dns_net_exit, - .async = true, }; static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6c3083c992e5..7d893543cf3b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2122,7 +2122,6 @@ static struct pernet_operations nfs_net_ops = { .exit = nfs_net_exit, .id = &nfs_net_id, .size = sizeof(struct nfs_net), - .async = true, }; /* diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 8c743a405df6..5be08f02a76b 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -118,7 +118,6 @@ static struct pernet_operations grace_net_ops = { .exit = grace_exit_net, .id = &grace_net_id, .size = sizeof(struct list_head), - .async = true, }; static int __init diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1e3824e6cce0..d107b4426f7e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1263,7 +1263,6 @@ static struct pernet_operations nfsd_net_ops = { .exit = nfsd_exit_net, .id = &nfsd_net_id, .size = sizeof(struct nfsd_net), - .async = true, }; static int __init init_nfsd(void) diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index da6f8733c9c5..68c06ae7888c 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -237,7 +237,6 @@ static __net_exit void proc_net_ns_exit(struct net *net) static struct pernet_operations __net_initdata proc_net_ns_ops = { .init = proc_net_ns_init, .exit = proc_net_ns_exit, - .async = true, }; int __init proc_net_init(void) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 09e30bdc7876..37bcf8382b61 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -333,12 +333,6 @@ struct pernet_operations { void (*exit_batch)(struct list_head *net_exit_list); unsigned int *id; size_t size; - /* - * Indicates above methods are allowed to be executed in parallel - * with methods of any other pernet_operations, i.e. they are not - * need write locked net_sem. - */ - bool async; }; /* diff --git a/kernel/audit.c b/kernel/audit.c index 5e49b614d0e6..227db99b0f19 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1526,7 +1526,6 @@ static struct pernet_operations audit_net_ops __net_initdata = { .exit = audit_net_exit, .id = &audit_net_id, .size = sizeof(struct audit_net), - .async = true, }; /* Initialize audit support at boot time. */ diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index fa10ad8e9b17..15ea216a67ce 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -724,7 +724,6 @@ static void uevent_net_exit(struct net *net) static struct pernet_operations uevent_net_ops = { .init = uevent_net_init, .exit = uevent_net_exit, - .async = true, }; static int __init kobject_uevent_init(void) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index bd0ed39f65fb..bad01b14a4ad 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -729,7 +729,6 @@ static struct pernet_operations vlan_net_ops = { .exit = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct vlan_net), - .async = true, }; static int __init vlan_proto_init(void) diff --git a/net/bridge/br.c b/net/bridge/br.c index a3f95ab9d6a3..26e1616b2c90 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -188,7 +188,6 @@ static void __net_exit br_net_exit(struct net *net) static struct pernet_operations br_net_ops = { .exit = br_net_exit, - .async = true, }; static const struct stp_proto br_stp_proto = { diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index c2120eb889a9..9b16eaf33819 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -969,7 +969,6 @@ static struct pernet_operations brnf_net_ops __read_mostly = { .exit = brnf_exit_net, .id = &brnf_net_id, .size = sizeof(struct brnf_net), - .async = true, }; static struct notifier_block brnf_notifier __read_mostly = { diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index f070b5e5b9dd..276b60262981 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -77,7 +77,6 @@ static void __net_exit broute_net_exit(struct net *net) static struct pernet_operations broute_net_ops = { .init = broute_net_init, .exit = broute_net_exit, - .async = true, }; static int __init ebtable_broute_init(void) diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 4151afc8efcc..c41da5fac84f 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -105,7 +105,6 @@ static void __net_exit frame_filter_net_exit(struct net *net) static struct pernet_operations frame_filter_net_ops = { .init = frame_filter_net_init, .exit = frame_filter_net_exit, - .async = true, }; static int __init ebtable_filter_init(void) diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index b8da2dfe2ec5..08df7406ecb3 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -105,7 +105,6 @@ static void __net_exit frame_nat_net_exit(struct net *net) static struct pernet_operations frame_nat_net_ops = { .init = frame_nat_net_init, .exit = frame_nat_net_exit, - .async = true, }; static int __init ebtable_nat_init(void) diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c index 91bfc2ac055a..bd2b3c78f59b 100644 --- a/net/bridge/netfilter/nf_log_bridge.c +++ b/net/bridge/netfilter/nf_log_bridge.c @@ -48,7 +48,6 @@ static void __net_exit nf_log_bridge_net_exit(struct net *net) static struct pernet_operations nf_log_bridge_net_ops = { .init = nf_log_bridge_net_init, .exit = nf_log_bridge_net_exit, - .async = true, }; static int __init nf_log_bridge_init(void) diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 7a78268cc572..e0adcd123f48 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -544,7 +544,6 @@ static struct pernet_operations caif_net_ops = { .exit = caif_exit_net, .id = &caif_net_id, .size = sizeof(struct caif_net), - .async = true, }; /* Initialize Caif devices list */ diff --git a/net/can/af_can.c b/net/can/af_can.c index 2f0d0a72e4b5..1684ba5b51eb 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -954,7 +954,6 @@ static struct notifier_block can_netdev_notifier __read_mostly = { static struct pernet_operations can_pernet_ops __read_mostly = { .init = can_pernet_init, .exit = can_pernet_exit, - .async = true, }; static __init int can_init(void) diff --git a/net/can/bcm.c b/net/can/bcm.c index 26730d39e048..ac5e5e34fee3 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1717,7 +1717,6 @@ static void canbcm_pernet_exit(struct net *net) static struct pernet_operations canbcm_pernet_ops __read_mostly = { .init = canbcm_pernet_init, .exit = canbcm_pernet_exit, - .async = true, }; static int __init bcm_module_init(void) diff --git a/net/can/gw.c b/net/can/gw.c index 8d71e199d5b3..faa3da88a127 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1010,7 +1010,6 @@ static void __net_exit cangw_pernet_exit(struct net *net) static struct pernet_operations cangw_pernet_ops = { .init = cangw_pernet_init, .exit = cangw_pernet_exit, - .async = true, }; static __init int cgw_module_init(void) diff --git a/net/core/dev.c b/net/core/dev.c index 97a96df4b6da..e13807b5c84d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8883,7 +8883,6 @@ static void __net_exit netdev_exit(struct net *net) static struct pernet_operations __net_initdata netdev_net_ops = { .init = netdev_init, .exit = netdev_exit, - .async = true, }; static void __net_exit default_device_exit(struct net *net) @@ -8984,7 +8983,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) static struct pernet_operations __net_initdata default_device_ops = { .exit = default_device_exit, .exit_batch = default_device_exit_batch, - .async = true, }; /* diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 5ace0705a3f9..0c048bdeb016 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -171,7 +171,6 @@ static void __net_exit fib_notifier_net_exit(struct net *net) static struct pernet_operations fib_notifier_net_ops = { .init = fib_notifier_net_init, .exit = fib_notifier_net_exit, - .async = true, }; static int __init fib_notifier_init(void) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index f6f04fc0f629..9d87ce868402 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1130,7 +1130,6 @@ static void __net_exit fib_rules_net_exit(struct net *net) static struct pernet_operations fib_rules_net_ops = { .init = fib_rules_net_init, .exit = fib_rules_net_exit, - .async = true, }; static int __init fib_rules_init(void) diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index dd6ae431d038..9737302907b1 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -349,7 +349,6 @@ static void __net_exit dev_proc_net_exit(struct net *net) static struct pernet_operations __net_initdata dev_proc_ops = { .init = dev_proc_net_init, .exit = dev_proc_net_exit, - .async = true, }; static int dev_mc_seq_show(struct seq_file *seq, void *v) @@ -406,7 +405,6 @@ static void __net_exit dev_mc_net_exit(struct net *net) static struct pernet_operations __net_initdata dev_mc_net_ops = { .init = dev_mc_net_init, .exit = dev_mc_net_exit, - .async = true, }; int __init dev_proc_init(void) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 0f614523a13f..eef17ad29dea 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -338,7 +338,6 @@ static int __net_init net_defaults_init_net(struct net *net) static struct pernet_operations net_defaults_ops = { .init = net_defaults_init_net, - .async = true, }; static __init int net_defaults_init(void) @@ -628,7 +627,6 @@ static __net_exit void net_ns_net_exit(struct net *net) static struct pernet_operations __net_initdata net_ns_ops = { .init = net_ns_net_init, .exit = net_ns_net_exit, - .async = true, }; static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 545cf08cd558..7e4ede34cc52 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3852,7 +3852,6 @@ static struct pernet_operations pg_net_ops = { .exit = pg_net_exit, .id = &pg_net_id, .size = sizeof(struct pktgen_net), - .async = true, }; static int __init pg_init(void) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 87079eaa871b..31438b63d4b4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4730,7 +4730,6 @@ static void __net_exit rtnetlink_net_exit(struct net *net) static struct pernet_operations rtnetlink_net_ops = { .init = rtnetlink_net_init, .exit = rtnetlink_net_exit, - .async = true, }; void __init rtnetlink_init(void) diff --git a/net/core/sock.c b/net/core/sock.c index 8cee2920a47f..6444525f610c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3175,7 +3175,6 @@ static void __net_exit sock_inuse_exit_net(struct net *net) static struct pernet_operations net_inuse_ops = { .init = sock_inuse_init_net, .exit = sock_inuse_exit_net, - .async = true, }; static __init int net_inuse_init(void) @@ -3470,7 +3469,6 @@ static __net_exit void proto_exit_net(struct net *net) static __net_initdata struct pernet_operations proto_net_ops = { .init = proto_init_net, .exit = proto_exit_net, - .async = true, }; static int __init proto_init(void) diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a3392a8f9276..c37b5be7c5e4 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -324,7 +324,6 @@ static void __net_exit diag_net_exit(struct net *net) static struct pernet_operations diag_net_ops = { .init = diag_net_init, .exit = diag_net_exit, - .async = true, }; static int __init sock_diag_init(void) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 4f47f92459cc..b3b609f0eeb5 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -584,7 +584,6 @@ static __net_exit void sysctl_core_net_exit(struct net *net) static __net_initdata struct pernet_operations sysctl_core_ops = { .init = sysctl_core_net_init, .exit = sysctl_core_net_exit, - .async = true, }; static __init int sysctl_core_init(void) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 13ad28ab1e79..e65fcb45c3f6 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1031,7 +1031,6 @@ static struct pernet_operations dccp_v4_ops = { .init = dccp_v4_init_net, .exit = dccp_v4_exit_net, .exit_batch = dccp_v4_exit_batch, - .async = true, }; static int __init dccp_v4_init(void) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 2f48c020f8c3..5df7857fc0f3 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1116,7 +1116,6 @@ static struct pernet_operations dccp_v6_ops = { .init = dccp_v6_init_net, .exit = dccp_v6_exit_net, .exit_batch = dccp_v6_exit_batch, - .async = true, }; static int __init dccp_v6_init(void) diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index a9ccb1322f69..85bf86ad6b18 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -603,7 +603,6 @@ static void __net_exit lowpan_frags_exit_net(struct net *net) static struct pernet_operations lowpan_frags_ops = { .init = lowpan_frags_init_net, .exit = lowpan_frags_exit_net, - .async = true, }; int __init lowpan_net_frag_init(void) diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index 9104943c15ba..cb7176cd4cd6 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -345,7 +345,6 @@ static void __net_exit cfg802154_pernet_exit(struct net *net) static struct pernet_operations cfg802154_pernet_ops = { .exit = cfg802154_pernet_exit, - .async = true, }; static int __init wpan_phy_class_init(void) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e8c7fad8c329..f98e2f0db841 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1735,7 +1735,6 @@ static __net_exit void ipv4_mib_exit_net(struct net *net) static __net_initdata struct pernet_operations ipv4_mib_ops = { .init = ipv4_mib_init_net, .exit = ipv4_mib_exit_net, - .async = true, }; static int __init init_ipv4_mibs(void) @@ -1789,7 +1788,6 @@ static __net_exit void inet_exit_net(struct net *net) static __net_initdata struct pernet_operations af_inet_ops = { .init = inet_init_net, .exit = inet_exit_net, - .async = true, }; static int __init init_inet_pernet_ops(void) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 4c6ba0fb2630..be4c595edccb 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1447,7 +1447,6 @@ static void __net_exit arp_net_exit(struct net *net) static struct pernet_operations arp_net_ops = { .init = arp_net_init, .exit = arp_net_exit, - .async = true, }; static int __init arp_proc_init(void) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5ae0d1f097ca..40f001782c1b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2469,7 +2469,6 @@ static __net_exit void devinet_exit_net(struct net *net) static __net_initdata struct pernet_operations devinet_ops = { .init = devinet_init_net, .exit = devinet_exit_net, - .async = true, }; static struct rtnl_af_ops inet_af_ops __read_mostly = { diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index ac71c3d496c0..f05afaf3235c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1362,7 +1362,6 @@ static void __net_exit fib_net_exit(struct net *net) static struct pernet_operations fib_net_ops = { .init = fib_net_init, .exit = fib_net_exit, - .async = true, }; void __init ip_fib_init(void) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index d3e1a9af478b..1540db65241a 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -1081,7 +1081,6 @@ static struct pernet_operations fou_net_ops = { .exit = fou_exit_net, .id = &fou_net_id, .size = sizeof(struct fou_net), - .async = true, }; static int __init fou_init(void) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index cc56efa64d5c..1617604c9284 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1257,7 +1257,6 @@ fail: static struct pernet_operations __net_initdata icmp_sk_ops = { .init = icmp_sk_init, .exit = icmp_sk_exit, - .async = true, }; int __init icmp_init(void) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index f17cd83ba164..b26a81a7de42 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -3028,7 +3028,6 @@ static void __net_exit igmp_net_exit(struct net *net) static struct pernet_operations igmp_net_ops = { .init = igmp_net_init, .exit = igmp_net_exit, - .async = true, }; #endif diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 5e843ae5e468..bbf1b94942c0 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -885,7 +885,6 @@ static void __net_exit ipv4_frags_exit_net(struct net *net) static struct pernet_operations ip4_frags_ops = { .init = ipv4_frags_init_net, .exit = ipv4_frags_exit_net, - .async = true, }; void __init ipfrag_init(void) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9ab1aa2f7660..a8772a978224 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1044,7 +1044,6 @@ static struct pernet_operations ipgre_net_ops = { .exit_batch = ipgre_exit_batch_net, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), - .async = true, }; static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], @@ -1628,7 +1627,6 @@ static struct pernet_operations ipgre_tap_net_ops = { .exit_batch = ipgre_tap_exit_batch_net, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), - .async = true, }; static int __net_init erspan_init_net(struct net *net) @@ -1647,7 +1645,6 @@ static struct pernet_operations erspan_net_ops = { .exit_batch = erspan_exit_batch_net, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), - .async = true, }; static int __init ipgre_init(void) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index b10bf563afd9..51b1669334fe 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -454,7 +454,6 @@ static struct pernet_operations vti_net_ops = { .exit_batch = vti_exit_batch_net, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), - .async = true, }; static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 9c5a4d164f09..c891235b4966 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -669,7 +669,6 @@ static struct pernet_operations ipip_net_ops = { .exit_batch = ipip_exit_batch_net, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), - .async = true, }; static int __init ipip_init(void) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e79211a8537c..2fb4de3f7f66 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3009,7 +3009,6 @@ static void __net_exit ipmr_net_exit(struct net *net) static struct pernet_operations ipmr_net_ops = { .init = ipmr_net_init, .exit = ipmr_net_exit, - .async = true, }; int __init ip_mr_init(void) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c36ffce3c812..e3e420f3ba7b 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1635,7 +1635,6 @@ static void __net_exit arp_tables_net_exit(struct net *net) static struct pernet_operations arp_tables_net_ops = { .init = arp_tables_net_init, .exit = arp_tables_net_exit, - .async = true, }; static int __init arp_tables_init(void) diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 49c2490193ae..8f8713b4388f 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -65,7 +65,6 @@ static void __net_exit arptable_filter_net_exit(struct net *net) static struct pernet_operations arptable_filter_net_ops = { .exit = arptable_filter_net_exit, - .async = true, }; static int __init arptable_filter_init(void) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d4f7584d2dbe..e38395a8dcf2 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1916,7 +1916,6 @@ static void __net_exit ip_tables_net_exit(struct net *net) static struct pernet_operations ip_tables_net_ops = { .init = ip_tables_net_init, .exit = ip_tables_net_exit, - .async = true, }; static int __init ip_tables_init(void) diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 31b4cca588d0..2c8d313ae216 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -845,7 +845,6 @@ static struct pernet_operations clusterip_net_ops = { .exit = clusterip_net_exit, .id = &clusterip_net_id, .size = sizeof(struct clusterip_net), - .async = true, }; static int __init clusterip_tg_init(void) diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index c1c136a93911..9ac92ea7b93c 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -87,7 +87,6 @@ static void __net_exit iptable_filter_net_exit(struct net *net) static struct pernet_operations iptable_filter_net_ops = { .init = iptable_filter_net_init, .exit = iptable_filter_net_exit, - .async = true, }; static int __init iptable_filter_init(void) diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index f6074059531a..dea138ca8925 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -113,7 +113,6 @@ static void __net_exit iptable_mangle_net_exit(struct net *net) static struct pernet_operations iptable_mangle_net_ops = { .exit = iptable_mangle_net_exit, - .async = true, }; static int __init iptable_mangle_init(void) diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index b771af74be79..0f7255cc65ee 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -129,7 +129,6 @@ static void __net_exit iptable_nat_net_exit(struct net *net) static struct pernet_operations iptable_nat_net_ops = { .exit = iptable_nat_net_exit, - .async = true, }; static int __init iptable_nat_init(void) diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 963753e50842..960625aabf04 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -76,7 +76,6 @@ static void __net_exit iptable_raw_net_exit(struct net *net) static struct pernet_operations iptable_raw_net_ops = { .exit = iptable_raw_net_exit, - .async = true, }; static int __init iptable_raw_init(void) diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index c40d6b3d8b6a..e5379fe57b64 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -76,7 +76,6 @@ static void __net_exit iptable_security_net_exit(struct net *net) static struct pernet_operations iptable_security_net_ops = { .exit = iptable_security_net_exit, - .async = true, }; static int __init iptable_security_init(void) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 6531f69db010..b50721d9d30e 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -399,7 +399,6 @@ static struct pernet_operations ipv4_net_ops = { .exit = ipv4_net_exit, .id = &conntrack4_net_id, .size = sizeof(struct conntrack4_net), - .async = true, }; static int __init nf_conntrack_l3proto_ipv4_init(void) diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 57244b62a4fc..a0d3ad60a411 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -118,7 +118,6 @@ static void __net_exit defrag4_net_exit(struct net *net) static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, - .async = true, }; static int __init nf_defrag_init(void) diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index 162293469ac2..df5c2a2061a4 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -122,7 +122,6 @@ static void __net_exit nf_log_arp_net_exit(struct net *net) static struct pernet_operations nf_log_arp_net_ops = { .init = nf_log_arp_net_init, .exit = nf_log_arp_net_exit, - .async = true, }; static int __init nf_log_arp_init(void) diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 7a06de140f3c..4388de0e5380 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -358,7 +358,6 @@ static void __net_exit nf_log_ipv4_net_exit(struct net *net) static struct pernet_operations nf_log_ipv4_net_ops = { .init = nf_log_ipv4_net_init, .exit = nf_log_ipv4_net_exit, - .async = true, }; static int __init nf_log_ipv4_init(void) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 1f24bc8273a0..05e47d777009 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1204,7 +1204,6 @@ static void __net_exit ping_v4_proc_exit_net(struct net *net) static struct pernet_operations ping_v4_net_ops = { .init = ping_v4_proc_init_net, .exit = ping_v4_proc_exit_net, - .async = true, }; int __init ping_proc_init(void) diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 80de2e659dcb..adfb75340275 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -549,7 +549,6 @@ static __net_exit void ip_proc_exit_net(struct net *net) static __net_initdata struct pernet_operations ip_proc_ops = { .init = ip_proc_init_net, .exit = ip_proc_exit_net, - .async = true, }; int __init ip_misc_proc_init(void) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 0ee2501a9027..1b4d3355624a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -1154,7 +1154,6 @@ static __net_exit void raw_exit_net(struct net *net) static __net_initdata struct pernet_operations raw_net_ops = { .init = raw_init_net, .exit = raw_exit_net, - .async = true, }; int __init raw_proc_init(void) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ce9bd5380d21..8322e479f299 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -418,7 +418,6 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net) static struct pernet_operations ip_rt_proc_ops __net_initdata = { .init = ip_rt_do_proc_init, .exit = ip_rt_do_proc_exit, - .async = true, }; static int __init ip_rt_proc_init(void) @@ -3017,7 +3016,6 @@ static __net_exit void sysctl_route_net_exit(struct net *net) static __net_initdata struct pernet_operations sysctl_route_ops = { .init = sysctl_route_net_init, .exit = sysctl_route_net_exit, - .async = true, }; #endif @@ -3031,7 +3029,6 @@ static __net_init int rt_genid_init(struct net *net) static __net_initdata struct pernet_operations rt_genid_ops = { .init = rt_genid_init, - .async = true, }; static int __net_init ipv4_inetpeer_init(struct net *net) @@ -3057,7 +3054,6 @@ static void __net_exit ipv4_inetpeer_exit(struct net *net) static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { .init = ipv4_inetpeer_init, .exit = ipv4_inetpeer_exit, - .async = true, }; #ifdef CONFIG_IP_ROUTE_CLASSID diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5b72d97693f8..4b195bac8ac0 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1219,7 +1219,6 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net) static __net_initdata struct pernet_operations ipv4_sysctl_ops = { .init = ipv4_sysctl_init_net, .exit = ipv4_sysctl_exit_net, - .async = true, }; static __init int sysctl_ipv4_init(void) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fec8b1fd7b63..9639334ebb7c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2391,7 +2391,6 @@ static void __net_exit tcp4_proc_exit_net(struct net *net) static struct pernet_operations tcp4_net_ops = { .init = tcp4_proc_init_net, .exit = tcp4_proc_exit_net, - .async = true, }; int __init tcp4_proc_init(void) @@ -2578,7 +2577,6 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { .init = tcp_sk_init, .exit = tcp_sk_exit, .exit_batch = tcp_sk_exit_batch, - .async = true, }; void __init tcp_v4_init(void) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index aa6fea9f3328..03b51cdcc731 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1024,7 +1024,6 @@ static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_lis static __net_initdata struct pernet_operations tcp_net_metrics_ops = { .init = tcp_net_metrics_init, .exit_batch = tcp_net_metrics_exit_batch, - .async = true, }; void __init tcp_metrics_init(void) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fb8f3a36bd14..f49e14cd3891 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2756,7 +2756,6 @@ static void __net_exit udp4_proc_exit_net(struct net *net) static struct pernet_operations udp4_net_ops = { .init = udp4_proc_init_net, .exit = udp4_proc_exit_net, - .async = true, }; int __init udp4_proc_init(void) @@ -2843,7 +2842,6 @@ static int __net_init udp_sysctl_init(struct net *net) static struct pernet_operations __net_initdata udp_sysctl_ops = { .init = udp_sysctl_init, - .async = true, }; void __init udp_init(void) diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 72f2c3806408..f96614e9b9a5 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -104,7 +104,6 @@ static void __net_exit udplite4_proc_exit_net(struct net *net) static struct pernet_operations udplite4_net_ops = { .init = udplite4_proc_init_net, .exit = udplite4_proc_exit_net, - .async = true, }; static __init int udplite4_proc_init(void) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 6c76a757fa4a..d73a6d6652f6 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -367,7 +367,6 @@ static void __net_exit xfrm4_net_exit(struct net *net) static struct pernet_operations __net_initdata xfrm4_net_ops = { .init = xfrm4_net_init, .exit = xfrm4_net_exit, - .async = true, }; static void __init xfrm4_policy_init(void) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 189eac80f4ef..78cef00c9596 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4282,7 +4282,6 @@ static void __net_exit if6_proc_net_exit(struct net *net) static struct pernet_operations if6_proc_net_ops = { .init = if6_proc_net_init, .exit = if6_proc_net_exit, - .async = true, }; int __init if6_proc_init(void) @@ -6592,7 +6591,6 @@ static void __net_exit addrconf_exit_net(struct net *net) static struct pernet_operations addrconf_ops = { .init = addrconf_init_net, .exit = addrconf_exit_net, - .async = true, }; static struct rtnl_af_ops inet6_ops __read_mostly = { diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index ba2e63633370..1d6ced37ad71 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -344,7 +344,6 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net) static struct pernet_operations ipv6_addr_label_ops = { .init = ip6addrlbl_net_init, .exit = ip6addrlbl_net_exit, - .async = true, }; int __init ipv6_addr_label_init(void) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index dbbe04018813..c1e292db04db 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -857,7 +857,6 @@ static void __net_exit inet6_net_exit(struct net *net) static struct pernet_operations inet6_net_ops = { .init = inet6_net_init, .exit = inet6_net_exit, - .async = true, }; static const struct ipv6_stub ipv6_stub_impl = { diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 00ef9467f3c0..df113c7b5fc8 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -397,7 +397,6 @@ static void __net_exit fib6_rules_net_exit(struct net *net) static struct pernet_operations fib6_rules_net_ops = { .init = fib6_rules_net_init, .exit = fib6_rules_net_exit, - .async = true, }; int __init fib6_rules_init(void) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 6f84668be6ea..d8c4b6374377 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -998,7 +998,6 @@ static void __net_exit icmpv6_sk_exit(struct net *net) static struct pernet_operations icmpv6_sk_ops = { .init = icmpv6_sk_init, .exit = icmpv6_sk_exit, - .async = true, }; int __init icmpv6_init(void) diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index e438699f000f..44c39c5f0638 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -613,7 +613,6 @@ static struct pernet_operations ila_net_ops = { .exit = ila_exit_net, .id = &ila_net_id, .size = sizeof(struct ila_net), - .async = true, }; static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 2f995e9e3050..908b8e5b615a 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2161,7 +2161,6 @@ static void fib6_net_exit(struct net *net) static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, - .async = true, }; int __init fib6_init(void) diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index f75b06ba8325..c05c4e82a7ca 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -873,7 +873,6 @@ static void __net_exit ip6_flowlabel_net_exit(struct net *net) static struct pernet_operations ip6_flowlabel_net_ops = { .init = ip6_flowlabel_proc_init, .exit = ip6_flowlabel_net_exit, - .async = true, }; int ip6_flowlabel_init(void) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3a98c694da5f..22e86557aca4 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1528,7 +1528,6 @@ static struct pernet_operations ip6gre_net_ops = { .exit_batch = ip6gre_exit_batch_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), - .async = true, }; static int ip6gre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 456fcf942f95..df4c29f7d59f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2260,7 +2260,6 @@ static struct pernet_operations ip6_tnl_net_ops = { .exit_batch = ip6_tnl_exit_batch_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), - .async = true, }; /** diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index a482b854eeea..60b771f49fb5 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -1148,7 +1148,6 @@ static struct pernet_operations vti6_net_ops = { .exit_batch = vti6_exit_batch_net, .id = &vti6_net_id, .size = sizeof(struct vti6_net), - .async = true, }; static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 1c8fa29d155a..298fd8b6ed17 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1348,7 +1348,6 @@ static void __net_exit ip6mr_net_exit(struct net *net) static struct pernet_operations ip6mr_net_ops = { .init = ip6mr_net_init, .exit = ip6mr_net_exit, - .async = true, }; int __init ip6_mr_init(void) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 1e4c2b6ebd78..793159d77d8a 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2997,7 +2997,6 @@ static void __net_exit igmp6_net_exit(struct net *net) static struct pernet_operations igmp6_net_ops = { .init = igmp6_net_init, .exit = igmp6_net_exit, - .async = true, }; int __init igmp6_init(void) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d1d0b2fa7a07..9de4dfb126ba 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1883,7 +1883,6 @@ static void __net_exit ndisc_net_exit(struct net *net) static struct pernet_operations ndisc_net_ops = { .init = ndisc_net_init, .exit = ndisc_net_exit, - .async = true, }; int __init ndisc_init(void) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 4de8ac1e5af4..62358b93bbac 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1928,7 +1928,6 @@ static void __net_exit ip6_tables_net_exit(struct net *net) static struct pernet_operations ip6_tables_net_ops = { .init = ip6_tables_net_init, .exit = ip6_tables_net_exit, - .async = true, }; static int __init ip6_tables_init(void) diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 06561c84c0bc..1343077dde93 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -87,7 +87,6 @@ static void __net_exit ip6table_filter_net_exit(struct net *net) static struct pernet_operations ip6table_filter_net_ops = { .init = ip6table_filter_net_init, .exit = ip6table_filter_net_exit, - .async = true, }; static int __init ip6table_filter_init(void) diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index a11e25936b45..b0524b18c4fb 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -107,7 +107,6 @@ static void __net_exit ip6table_mangle_net_exit(struct net *net) static struct pernet_operations ip6table_mangle_net_ops = { .exit = ip6table_mangle_net_exit, - .async = true, }; static int __init ip6table_mangle_init(void) diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 4475fd300bb6..47306e45a80a 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -131,7 +131,6 @@ static void __net_exit ip6table_nat_net_exit(struct net *net) static struct pernet_operations ip6table_nat_net_ops = { .exit = ip6table_nat_net_exit, - .async = true, }; static int __init ip6table_nat_init(void) diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index a88f3b1995b1..710fa0806c37 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -75,7 +75,6 @@ static void __net_exit ip6table_raw_net_exit(struct net *net) static struct pernet_operations ip6table_raw_net_ops = { .exit = ip6table_raw_net_exit, - .async = true, }; static int __init ip6table_raw_init(void) diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 320048c008dc..cf26ccb04056 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -74,7 +74,6 @@ static void __net_exit ip6table_security_net_exit(struct net *net) static struct pernet_operations ip6table_security_net_ops = { .exit = ip6table_security_net_exit, - .async = true, }; static int __init ip6table_security_init(void) diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index ba54bb3bd1e4..663827ee3cf8 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -401,7 +401,6 @@ static struct pernet_operations ipv6_net_ops = { .exit = ipv6_net_exit, .id = &conntrack6_net_id, .size = sizeof(struct conntrack6_net), - .async = true, }; static int __init nf_conntrack_l3proto_ipv6_init(void) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 34136fe80ed5..b84ce3e6d728 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -646,7 +646,6 @@ static void nf_ct_net_exit(struct net *net) static struct pernet_operations nf_ct_net_ops = { .init = nf_ct_net_init, .exit = nf_ct_net_exit, - .async = true, }; int nf_ct_frag6_init(void) diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 32f98bc06900..c87b48359e8f 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -103,7 +103,6 @@ static void __net_exit defrag6_net_exit(struct net *net) static struct pernet_operations defrag6_net_ops = { .exit = defrag6_net_exit, - .async = true, }; static int __init nf_defrag_init(void) diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index 0220e584589c..b397a8fe88b9 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -390,7 +390,6 @@ static void __net_exit nf_log_ipv6_net_exit(struct net *net) static struct pernet_operations nf_log_ipv6_net_ops = { .init = nf_log_ipv6_net_init, .exit = nf_log_ipv6_net_exit, - .async = true, }; static int __init nf_log_ipv6_init(void) diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 318c6e914234..d12c55dad7d1 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -240,7 +240,6 @@ static void __net_init ping_v6_proc_exit_net(struct net *net) static struct pernet_operations ping_v6_net_ops = { .init = ping_v6_proc_init_net, .exit = ping_v6_proc_exit_net, - .async = true, }; #endif diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index f9891fa672f3..6e57028d2e91 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -343,7 +343,6 @@ static void __net_exit ipv6_proc_exit_net(struct net *net) static struct pernet_operations ipv6_proc_ops = { .init = ipv6_proc_init_net, .exit = ipv6_proc_exit_net, - .async = true, }; int __init ipv6_misc_proc_init(void) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index b5e5de732494..5eb9b08947ed 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1332,7 +1332,6 @@ static void __net_exit raw6_exit_net(struct net *net) static struct pernet_operations raw6_net_ops = { .init = raw6_init_net, .exit = raw6_exit_net, - .async = true, }; int __init raw6_proc_init(void) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index b5da69c83123..afbc000ad4f2 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -733,7 +733,6 @@ static void __net_exit ipv6_frags_exit_net(struct net *net) static struct pernet_operations ip6_frags_ops = { .init = ipv6_frags_init_net, .exit = ipv6_frags_exit_net, - .async = true, }; int __init ipv6_frag_init(void) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1d0eaa69874d..ba8d5df50ebe 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5083,7 +5083,6 @@ static void __net_exit ip6_route_net_exit_late(struct net *net) static struct pernet_operations ip6_route_net_ops = { .init = ip6_route_net_init, .exit = ip6_route_net_exit, - .async = true, }; static int __net_init ipv6_inetpeer_init(struct net *net) @@ -5109,13 +5108,11 @@ static void __net_exit ipv6_inetpeer_exit(struct net *net) static struct pernet_operations ipv6_inetpeer_ops = { .init = ipv6_inetpeer_init, .exit = ipv6_inetpeer_exit, - .async = true, }; static struct pernet_operations ip6_route_net_late_ops = { .init = ip6_route_net_init_late, .exit = ip6_route_net_exit_late, - .async = true, }; static struct notifier_block ip6_route_dev_notifier = { diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index c3f13c3bd8a9..7f5621d09571 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -395,7 +395,6 @@ static void __net_exit seg6_net_exit(struct net *net) static struct pernet_operations ip6_segments_ops = { .init = seg6_net_init, .exit = seg6_net_exit, - .async = true, }; static const struct genl_ops seg6_genl_ops[] = { diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 8a4f8fddd812..1522bcfd253f 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1888,7 +1888,6 @@ static struct pernet_operations sit_net_ops = { .exit_batch = sit_exit_batch_net, .id = &sit_net_id, .size = sizeof(struct sit_net), - .async = true, }; static void __exit sit_cleanup(void) diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 966c42af92f4..6fbdef630152 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -278,7 +278,6 @@ static void __net_exit ipv6_sysctl_net_exit(struct net *net) static struct pernet_operations ipv6_sysctl_net_ops = { .init = ipv6_sysctl_net_init, .exit = ipv6_sysctl_net_exit, - .async = true, }; static struct ctl_table_header *ip6_header; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5425d7b100ee..883df0ad5bfe 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2007,7 +2007,6 @@ static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, .exit_batch = tcpv6_net_exit_batch, - .async = true, }; int __init tcpv6_init(void) diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index f3839780dc31..14ae32bb1f3d 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -123,7 +123,6 @@ static void __net_exit udplite6_proc_exit_net(struct net *net) static struct pernet_operations udplite6_net_ops = { .init = udplite6_proc_init_net, .exit = udplite6_proc_exit_net, - .async = true, }; int __init udplite6_proc_init(void) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index cbb270bd81b0..416fe67271a9 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -400,7 +400,6 @@ static void __net_exit xfrm6_net_exit(struct net *net) static struct pernet_operations xfrm6_net_ops = { .init = xfrm6_net_init, .exit = xfrm6_net_exit, - .async = true, }; int __init xfrm6_init(void) diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index a9673619e0e9..f85f0d7480ac 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -353,7 +353,6 @@ static struct pernet_operations xfrm6_tunnel_net_ops = { .exit = xfrm6_tunnel_net_exit, .id = &xfrm6_tunnel_net_id, .size = sizeof(struct xfrm6_tunnel_net), - .async = true, }; static int __init xfrm6_tunnel_init(void) diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c index 4c2e9907f254..1fac92543094 100644 --- a/net/kcm/kcmproc.c +++ b/net/kcm/kcmproc.c @@ -433,7 +433,6 @@ static void kcm_proc_exit_net(struct net *net) static struct pernet_operations kcm_net_ops = { .init = kcm_proc_init_net, .exit = kcm_proc_exit_net, - .async = true, }; int __init kcm_proc_init(void) diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 516cfad71b85..dc76bc346829 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -2028,7 +2028,6 @@ static struct pernet_operations kcm_net_ops = { .exit = kcm_exit_net, .id = &kcm_net_id, .size = sizeof(struct kcm_net), - .async = true, }; static int __init kcm_init(void) diff --git a/net/key/af_key.c b/net/key/af_key.c index 3ac08ab26207..7e2e7188e7f4 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3863,7 +3863,6 @@ static struct pernet_operations pfkey_net_ops = { .exit = pfkey_net_exit, .id = &pfkey_net_id, .size = sizeof(struct netns_pfkey), - .async = true, }; static void __exit ipsec_pfkey_exit(void) diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index b86868da50d4..14b67dfacc4b 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1789,7 +1789,6 @@ static struct pernet_operations l2tp_net_ops = { .exit = l2tp_exit_net, .id = &l2tp_net_id, .size = sizeof(struct l2tp_net), - .async = true, }; static int __init l2tp_init(void) diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index f24504efe729..d6deca11da19 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1762,7 +1762,6 @@ static struct pernet_operations pppol2tp_net_ops = { .init = pppol2tp_init_net, .exit = pppol2tp_exit_net, .id = &pppol2tp_net_id, - .async = true, }; /***************************************************************************** diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index d4a89a8be013..7a4de6d618b1 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2488,7 +2488,6 @@ static void mpls_net_exit(struct net *net) static struct pernet_operations mpls_net_ops = { .init = mpls_net_init, .exit = mpls_net_exit, - .async = true, }; static struct rtnl_af_ops mpls_af_ops __read_mostly = { diff --git a/net/netfilter/core.c b/net/netfilter/core.c index d72cc786c7b7..0f6b8172fb9a 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -629,7 +629,6 @@ static void __net_exit netfilter_net_exit(struct net *net) static struct pernet_operations netfilter_net_ops = { .init = netfilter_net_init, .exit = netfilter_net_exit, - .async = true, }; int __init netfilter_init(void) diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 2523ebe2b3cc..bc4bd247bb7d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -2095,7 +2095,6 @@ static struct pernet_operations ip_set_net_ops = { .exit = ip_set_net_exit, .id = &ip_set_net_id, .size = sizeof(struct ip_set_net), - .async = true, }; static int __init diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 6a6cb9db030b..5f6f73cf2174 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -2289,12 +2289,10 @@ static struct pernet_operations ipvs_core_ops = { .exit = __ip_vs_cleanup, .id = &ip_vs_net_id, .size = sizeof(struct netns_ipvs), - .async = true, }; static struct pernet_operations ipvs_core_dev_ops = { .exit = __ip_vs_dev_cleanup, - .async = true, }; /* diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 8b25aab41928..58d5d05aec24 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -479,7 +479,6 @@ static void __ip_vs_ftp_exit(struct net *net) static struct pernet_operations ip_vs_ftp_ops = { .init = __ip_vs_ftp_init, .exit = __ip_vs_ftp_exit, - .async = true, }; static int __init ip_vs_ftp_init(void) diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 6a340c94c4b8..d625179de485 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -604,7 +604,6 @@ static void __net_exit __ip_vs_lblc_exit(struct net *net) { } static struct pernet_operations ip_vs_lblc_ops = { .init = __ip_vs_lblc_init, .exit = __ip_vs_lblc_exit, - .async = true, }; static int __init ip_vs_lblc_init(void) diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 0627881128da..84c57b62a588 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -789,7 +789,6 @@ static void __net_exit __ip_vs_lblcr_exit(struct net *net) { } static struct pernet_operations ip_vs_lblcr_ops = { .init = __ip_vs_lblcr_init, .exit = __ip_vs_lblcr_exit, - .async = true, }; static int __init ip_vs_lblcr_init(void) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 8884d302d33a..dd177ebee9aa 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3417,7 +3417,6 @@ static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list) static struct pernet_operations ctnetlink_net_ops = { .init = ctnetlink_net_init, .exit_batch = ctnetlink_net_exit_batch, - .async = true, }; static int __init ctnetlink_init(void) diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 9bcd72fe91f9..d049ea5a3770 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -406,7 +406,6 @@ static struct pernet_operations proto_gre_net_ops = { .exit = proto_gre_net_exit, .id = &proto_gre_net_id, .size = sizeof(struct netns_proto_gre), - .async = true, }; static int __init nf_ct_proto_gre_init(void) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 98844c87d01e..037fec54c850 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -705,7 +705,6 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list) static struct pernet_operations nf_conntrack_net_ops = { .init = nf_conntrack_pernet_init, .exit_batch = nf_conntrack_pernet_exit, - .async = true, }; static int __init nf_conntrack_standalone_init(void) diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index a964e4d356cc..6d0357817cda 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -577,7 +577,6 @@ static void __net_exit nf_log_net_exit(struct net *net) static struct pernet_operations nf_log_net_ops = { .init = nf_log_net_init, .exit = nf_log_net_exit, - .async = true, }; int __init netfilter_log_init(void) diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c index 254c2c6bde48..350eb147754d 100644 --- a/net/netfilter/nf_log_netdev.c +++ b/net/netfilter/nf_log_netdev.c @@ -47,7 +47,6 @@ static void __net_exit nf_log_netdev_net_exit(struct net *net) static struct pernet_operations nf_log_netdev_net_ops = { .init = nf_log_netdev_net_init, .exit = nf_log_netdev_net_exit, - .async = true, }; static int __init nf_log_netdev_init(void) diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 8f16fd27132d..6039b350abbe 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -398,7 +398,6 @@ static struct pernet_operations synproxy_net_ops = { .exit = synproxy_net_exit, .id = &synproxy_net_id, .size = sizeof(struct synproxy_net), - .async = true, }; static int __init synproxy_core_init(void) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fd13d28e4ca7..c4acc7340eb1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6597,7 +6597,6 @@ static void __net_exit nf_tables_exit_net(struct net *net) static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .exit = nf_tables_exit_net, - .async = true, }; static int __init nf_tables_module_init(void) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 84fc4954862d..03ead8a9e90c 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -566,7 +566,6 @@ static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list) static struct pernet_operations nfnetlink_net_ops = { .init = nfnetlink_net_init, .exit_batch = nfnetlink_net_exit_batch, - .async = true, }; static int __init nfnetlink_init(void) diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 8d9f18bb8840..88d427f9f9e6 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -515,7 +515,6 @@ static void __net_exit nfnl_acct_net_exit(struct net *net) static struct pernet_operations nfnl_acct_ops = { .init = nfnl_acct_net_init, .exit = nfnl_acct_net_exit, - .async = true, }; static int __init nfnl_acct_init(void) diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 6819300f7fb7..95b04702a655 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -586,7 +586,6 @@ static void __net_exit cttimeout_net_exit(struct net *net) static struct pernet_operations cttimeout_ops = { .init = cttimeout_net_init, .exit = cttimeout_net_exit, - .async = true, }; static int __init cttimeout_init(void) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index b21ef79849a1..7b46aa4c478d 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -1108,7 +1108,6 @@ static struct pernet_operations nfnl_log_net_ops = { .exit = nfnl_log_net_exit, .id = &nfnl_log_net_id, .size = sizeof(struct nfnl_log_net), - .async = true, }; static int __init nfnetlink_log_init(void) diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 9f572ed56208..0b839c38800f 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1525,7 +1525,6 @@ static struct pernet_operations nfnl_queue_net_ops = { .exit_batch = nfnl_queue_net_exit_batch, .id = &nfnl_queue_net_id, .size = sizeof(struct nfnl_queue_net), - .async = true, }; static int __init nfnetlink_queue_init(void) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 6de1f6a4cb80..4aa01c90e9d1 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1789,7 +1789,6 @@ static void __net_exit xt_net_exit(struct net *net) static struct pernet_operations xt_net_ops = { .init = xt_net_init, .exit = xt_net_exit, - .async = true, }; static int __init xt_init(void) diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index ef65b7a9173e..3360f13dc208 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -1349,7 +1349,6 @@ static struct pernet_operations hashlimit_net_ops = { .exit = hashlimit_net_exit, .id = &hashlimit_net_id, .size = sizeof(struct hashlimit_net), - .async = true, }; static int __init hashlimit_mt_init(void) diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 434e35ce940b..9bbfc17ce3ec 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -687,7 +687,6 @@ static struct pernet_operations recent_net_ops = { .exit = recent_net_exit, .id = &recent_net_id, .size = sizeof(struct recent_net), - .async = true, }; static struct xt_match recent_mt_reg[] __read_mostly = { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5d10dcfe6411..f1b02d87e336 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -253,7 +253,6 @@ static struct pernet_operations netlink_tap_net_ops = { .exit = netlink_tap_exit_net, .id = &netlink_tap_net_id, .size = sizeof(struct netlink_tap_net), - .async = true, }; static bool netlink_filter_tap(const struct sk_buff *skb) @@ -2726,7 +2725,6 @@ static void __init netlink_add_usersock_entry(void) static struct pernet_operations __net_initdata netlink_net_ops = { .init = netlink_net_init, .exit = netlink_net_exit, - .async = true, }; static inline u32 netlink_hash(const void *data, u32 len, u32 seed) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index af51b8c0a2cb..b9ce82c9440f 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1035,7 +1035,6 @@ static void __net_exit genl_pernet_exit(struct net *net) static struct pernet_operations genl_pernet_ops = { .init = genl_pernet_init, .exit = genl_pernet_exit, - .async = true, }; static int __init genl_init(void) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 100191df0371..ef38e5aecd28 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2384,7 +2384,6 @@ static struct pernet_operations ovs_net_ops = { .exit = ovs_exit_net, .id = &ovs_net_id, .size = sizeof(struct ovs_net), - .async = true, }; static int __init dp_init(void) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 2c5a6fe5d749..616cb9c18f88 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4557,7 +4557,6 @@ static void __net_exit packet_net_exit(struct net *net) static struct pernet_operations packet_net_ops = { .init = packet_net_init, .exit = packet_net_exit, - .async = true, }; diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index 9454e8393793..77787512fc32 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -342,7 +342,6 @@ static struct pernet_operations phonet_net_ops = { .exit = phonet_exit_net, .id = &phonet_net_id, .size = sizeof(struct phonet_net), - .async = true, }; /* Initialize Phonet devices list */ diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 4f3a32c38bf5..351a28474667 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -530,7 +530,6 @@ static struct pernet_operations rds_tcp_net_ops = { .exit = rds_tcp_exit_net, .id = &rds_tcp_netid, .size = sizeof(struct rds_tcp_net), - .async = true, }; void *rds_tcp_listen_sock_def_readable(struct net *net) diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 5fd939dabf41..f18c9248e0d4 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -106,5 +106,4 @@ struct pernet_operations rxrpc_net_ops = { .exit = rxrpc_exit_net, .id = &rxrpc_net_id, .size = sizeof(struct rxrpc_net), - .async = true, }; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 7bd1b964f021..0d78b58e1898 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1533,7 +1533,6 @@ static struct pernet_operations tcf_action_net_ops = { .exit = tcf_action_net_exit, .id = &tcf_action_net_id, .size = sizeof(struct tcf_action_net), - .async = true, }; static int __init tc_action_init(void) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 5cb9b268e8ff..9092531d45d8 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -413,7 +413,6 @@ static struct pernet_operations bpf_net_ops = { .exit_batch = bpf_exit_net, .id = &bpf_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int __init bpf_init_module(void) diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 371e5e4ab3e2..e4b880fa51fe 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -222,7 +222,6 @@ static struct pernet_operations connmark_net_ops = { .exit_batch = connmark_exit_net, .id = &connmark_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int __init connmark_init_module(void) diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index a527e287c086..7e28b2ce1437 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -678,7 +678,6 @@ static struct pernet_operations csum_net_ops = { .exit_batch = csum_exit_net, .id = &csum_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_DESCRIPTION("Checksum updating actions"); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 88fbb8403565..4dc4f153cad8 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -261,7 +261,6 @@ static struct pernet_operations gact_net_ops = { .exit_batch = gact_exit_net, .id = &gact_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 555b1caeff72..a5994cf0512b 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -870,7 +870,6 @@ static struct pernet_operations ife_net_ops = { .exit_batch = ife_exit_net, .id = &ife_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int __init ife_init_module(void) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index b5e8565b89c7..14c312d7908f 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -352,7 +352,6 @@ static struct pernet_operations ipt_net_ops = { .exit_batch = ipt_exit_net, .id = &ipt_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int tcf_xt_walker(struct net *net, struct sk_buff *skb, @@ -403,7 +402,6 @@ static struct pernet_operations xt_net_ops = { .exit_batch = xt_exit_net, .id = &xt_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2002-13)"); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 64c86579c3d9..fd34015331ab 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -353,7 +353,6 @@ static struct pernet_operations mirred_net_ops = { .exit_batch = mirred_exit_net, .id = &mirred_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2002)"); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index b1bc757f6491..4b5848b6c252 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -323,7 +323,6 @@ static struct pernet_operations nat_net_ops = { .exit_batch = nat_exit_net, .id = &nat_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_DESCRIPTION("Stateless NAT actions"); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index f392ccaaa0d8..8a925c72db5f 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -465,7 +465,6 @@ static struct pernet_operations pedit_net_ops = { .exit_batch = pedit_exit_net, .id = &pedit_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 7081ec75e696..4e72bc2a0dfb 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -347,7 +347,6 @@ static struct pernet_operations police_net_ops = { .exit_batch = police_exit_net, .id = &police_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int __init police_init_module(void) diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 3a89f98f17e6..5db358497c9e 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -249,7 +249,6 @@ static struct pernet_operations sample_net_ops = { .exit_batch = sample_exit_net, .id = &sample_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int __init sample_init_module(void) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index e84768ae610a..9618b4a83cee 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -216,7 +216,6 @@ static struct pernet_operations simp_net_ops = { .exit_batch = simp_exit_net, .id = &simp_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim(2005)"); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 7971510fe61b..ddf69fc01bdf 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -253,7 +253,6 @@ static struct pernet_operations skbedit_net_ops = { .exit_batch = skbedit_exit_net, .id = &skbedit_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_AUTHOR("Alexander Duyck, "); diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 142a996ac776..bbcbdce732cc 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -279,7 +279,6 @@ static struct pernet_operations skbmod_net_ops = { .exit_batch = skbmod_exit_net, .id = &skbmod_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; MODULE_AUTHOR("Jamal Hadi Salim, "); diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index a1c8dd406a04..626dac81a48a 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -339,7 +339,6 @@ static struct pernet_operations tunnel_key_net_ops = { .exit_batch = tunnel_key_exit_net, .id = &tunnel_key_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int __init tunnel_key_init_module(void) diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 41a66effeb5f..853604685965 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -314,7 +314,6 @@ static struct pernet_operations vlan_net_ops = { .exit_batch = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct tc_action_net), - .async = true, }; static int __init vlan_init_module(void) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ec5fe8ec0c3e..b66754f52a9f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1619,7 +1619,6 @@ static struct pernet_operations tcf_net_ops = { .exit = tcf_net_exit, .id = &tcf_net_id, .size = sizeof(struct tcf_net), - .async = true, }; static int __init tc_filter_init(void) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 68f9d942bed4..106dae7e4818 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -2133,7 +2133,6 @@ static void __net_exit psched_net_exit(struct net *net) static struct pernet_operations psched_net_ops = { .init = psched_net_init, .exit = psched_net_exit, - .async = true, }; static int __init pktsched_init(void) diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 493b817f6a2a..84a09f599131 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1283,7 +1283,6 @@ static void __net_exit sctp_defaults_exit(struct net *net) static struct pernet_operations sctp_defaults_ops = { .init = sctp_defaults_init, .exit = sctp_defaults_exit, - .async = true, }; static int __net_init sctp_ctrlsock_init(struct net *net) @@ -1307,7 +1306,6 @@ static void __net_init sctp_ctrlsock_exit(struct net *net) static struct pernet_operations sctp_ctrlsock_ops = { .init = sctp_ctrlsock_init, .exit = sctp_ctrlsock_exit, - .async = true, }; /* Initialize the universe into something sensible. */ diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 44f939cb6bc8..9463af4b32e8 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -2063,7 +2063,6 @@ static __net_exit void rpcsec_gss_exit_net(struct net *net) static struct pernet_operations rpcsec_gss_net_ops = { .init = rpcsec_gss_init_net, .exit = rpcsec_gss_exit_net, - .async = true, }; /* diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 68287e921847..56f9eff74150 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -79,7 +79,6 @@ static struct pernet_operations sunrpc_net_ops = { .exit = sunrpc_exit_net, .id = &sunrpc_net_id, .size = sizeof(struct sunrpc_net), - .async = true, }; static int __init diff --git a/net/sysctl_net.c b/net/sysctl_net.c index f424539829b7..9aed6fe1bf1a 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -89,7 +89,6 @@ static void __net_exit sysctl_net_exit(struct net *net) static struct pernet_operations sysctl_pernet_ops = { .init = sysctl_net_init, .exit = sysctl_net_exit, - .async = true, }; static struct ctl_table_header *net_header; diff --git a/net/tipc/core.c b/net/tipc/core.c index 52dfc51ac4d5..5b38f5164281 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -109,7 +109,6 @@ static struct pernet_operations tipc_net_ops = { .exit = tipc_exit_net, .id = &tipc_net_id, .size = sizeof(struct tipc_net), - .async = true, }; static int __init tipc_init(void) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index bc2970a8e7f3..aded82da1aea 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2913,7 +2913,6 @@ static void __net_exit unix_net_exit(struct net *net) static struct pernet_operations unix_net_ops = { .init = unix_net_init, .exit = unix_net_exit, - .async = true, }; static int __init af_unix_init(void) diff --git a/net/wireless/core.c b/net/wireless/core.c index 670aa229168a..a6f3cac8c640 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1340,7 +1340,6 @@ static void __net_exit cfg80211_pernet_exit(struct net *net) static struct pernet_operations cfg80211_pernet_ops = { .exit = cfg80211_pernet_exit, - .async = true, }; static int __init cfg80211_init(void) diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index bc7064486b15..9efbfc753347 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -390,7 +390,6 @@ static void __net_exit wext_pernet_exit(struct net *net) static struct pernet_operations wext_pernet_ops = { .init = wext_pernet_init, .exit = wext_pernet_exit, - .async = true, }; static int __init wireless_nlevent_init(void) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index cb3bb9ae4407..625b3fca5704 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2985,7 +2985,6 @@ static void __net_exit xfrm_net_exit(struct net *net) static struct pernet_operations __net_initdata xfrm_net_ops = { .init = xfrm_net_init, .exit = xfrm_net_exit, - .async = true, }; void __init xfrm_init(void) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e92b8c019c88..080035f056d9 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3253,7 +3253,6 @@ static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list) static struct pernet_operations xfrm_user_net_ops = { .init = xfrm_user_net_init, .exit_batch = xfrm_user_net_exit, - .async = true, }; static int __init xfrm_user_init(void) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index b4d7b6242a40..8644d864e3c1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6743,7 +6743,6 @@ static void __net_exit selinux_nf_unregister(struct net *net) static struct pernet_operations selinux_net_ops = { .init = selinux_nf_register, .exit = selinux_nf_unregister, - .async = true, }; static int __init selinux_nf_ip_init(void) diff --git a/security/smack/smack_netfilter.c b/security/smack/smack_netfilter.c index 3f29c03162ca..e36d17835d4f 100644 --- a/security/smack/smack_netfilter.c +++ b/security/smack/smack_netfilter.c @@ -89,7 +89,6 @@ static void __net_exit smack_nf_unregister(struct net *net) static struct pernet_operations smack_net_ops = { .init = smack_nf_register, .exit = smack_nf_unregister, - .async = true, }; static int __init smack_nf_ip_init(void) -- cgit v1.2.3-70-g09d2 From 4420bf21fb6c0306e36ad58ade1e741fba57ce65 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 27 Mar 2018 18:02:23 +0300 Subject: net: Rename net_sem to pernet_ops_rwsem net_sem is some undefined area name, so it will be better to make the area more defined. Rename it to pernet_ops_rwsem for better readability and better intelligibility. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 2 +- include/net/net_namespace.h | 12 +++++++----- net/core/net_namespace.c | 40 ++++++++++++++++++++-------------------- net/core/rtnetlink.c | 4 ++-- 4 files changed, 30 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 562a175c35a9..c7d1e4689325 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -36,7 +36,7 @@ extern int rtnl_is_locked(void); extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; -extern struct rw_semaphore net_sem; +extern struct rw_semaphore pernet_ops_rwsem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 37bcf8382b61..922e8b6fb422 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -60,9 +60,10 @@ struct net { struct list_head list; /* list of network namespaces */ struct list_head exit_list; /* To linked to call pernet exit - * methods on dead net (net_sem - * read locked), or to unregister - * pernet ops (net_sem wr locked). + * methods on dead net ( + * pernet_ops_rwsem read locked), + * or to unregister pernet ops + * (pernet_ops_rwsem write locked). */ struct llist_node cleanup_list; /* namespaces on death row */ @@ -95,8 +96,9 @@ struct net { /* core fib_rules */ struct list_head rules_ops; - struct list_head fib_notifier_ops; /* protected by net_sem */ - + struct list_head fib_notifier_ops; /* Populated by + * register_pernet_subsys() + */ struct net_device *loopback_dev; /* The loopback */ struct netns_core core; struct netns_mib mib; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index eef17ad29dea..9e8ee4640451 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -41,10 +41,10 @@ EXPORT_SYMBOL(init_net); static bool init_net_initialized; /* - * net_sem: protects: pernet_list, net_generic_ids, + * pernet_ops_rwsem: protects: pernet_list, net_generic_ids, * init_net_initialized and first_device pointer. */ -DECLARE_RWSEM(net_sem); +DECLARE_RWSEM(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) @@ -72,7 +72,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) BUG_ON(id < MIN_PERNET_OPS_ID); old_ng = rcu_dereference_protected(net->gen, - lockdep_is_held(&net_sem)); + lockdep_is_held(&pernet_ops_rwsem)); if (old_ng->s.len > id) { old_ng->ptr[id] = data; return 0; @@ -289,7 +289,7 @@ struct net *get_net_ns_by_id(struct net *net, int id) */ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) { - /* Must be called with net_sem held */ + /* Must be called with pernet_ops_rwsem held */ const struct pernet_operations *ops, *saved_ops; int error = 0; LIST_HEAD(net_exit_list); @@ -422,13 +422,13 @@ struct net *copy_net_ns(unsigned long flags, net->ucounts = ucounts; get_user_ns(user_ns); - rv = down_read_killable(&net_sem); + rv = down_read_killable(&pernet_ops_rwsem); if (rv < 0) goto put_userns; rv = setup_net(net, user_ns); - up_read(&net_sem); + up_read(&pernet_ops_rwsem); if (rv < 0) { put_userns: @@ -480,7 +480,7 @@ static void cleanup_net(struct work_struct *work) /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); - down_read(&net_sem); + down_read(&pernet_ops_rwsem); /* Don't let anyone else find us. */ rtnl_lock(); @@ -519,7 +519,7 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list); - up_read(&net_sem); + up_read(&pernet_ops_rwsem); /* Ensure there are no outstanding rcu callbacks using this * network namespace. @@ -546,8 +546,8 @@ static void cleanup_net(struct work_struct *work) */ void net_ns_barrier(void) { - down_write(&net_sem); - up_write(&net_sem); + down_write(&pernet_ops_rwsem); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL(net_ns_barrier); @@ -869,12 +869,12 @@ static int __init net_ns_init(void) rcu_assign_pointer(init_net.gen, ng); - down_write(&net_sem); + down_write(&pernet_ops_rwsem); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); init_net_initialized = true; - up_write(&net_sem); + up_write(&pernet_ops_rwsem); register_pernet_subsys(&net_ns_ops); @@ -1013,9 +1013,9 @@ static void unregister_pernet_operations(struct pernet_operations *ops) int register_pernet_subsys(struct pernet_operations *ops) { int error; - down_write(&net_sem); + down_write(&pernet_ops_rwsem); error = register_pernet_operations(first_device, ops); - up_write(&net_sem); + up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_subsys); @@ -1031,9 +1031,9 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys); */ void unregister_pernet_subsys(struct pernet_operations *ops) { - down_write(&net_sem); + down_write(&pernet_ops_rwsem); unregister_pernet_operations(ops); - up_write(&net_sem); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_subsys); @@ -1059,11 +1059,11 @@ EXPORT_SYMBOL_GPL(unregister_pernet_subsys); int register_pernet_device(struct pernet_operations *ops) { int error; - down_write(&net_sem); + down_write(&pernet_ops_rwsem); error = register_pernet_operations(&pernet_list, ops); if (!error && (first_device == &pernet_list)) first_device = &ops->list; - up_write(&net_sem); + up_write(&pernet_ops_rwsem); return error; } EXPORT_SYMBOL_GPL(register_pernet_device); @@ -1079,11 +1079,11 @@ EXPORT_SYMBOL_GPL(register_pernet_device); */ void unregister_pernet_device(struct pernet_operations *ops) { - down_write(&net_sem); + down_write(&pernet_ops_rwsem); if (&ops->list == first_device) first_device = first_device->next; unregister_pernet_operations(ops); - up_write(&net_sem); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(unregister_pernet_device); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 31438b63d4b4..73011a60434c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -460,11 +460,11 @@ static void rtnl_lock_unregistering_all(void) void rtnl_link_unregister(struct rtnl_link_ops *ops) { /* Close the race with cleanup_net() */ - down_write(&net_sem); + down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); __rtnl_link_unregister(ops); rtnl_unlock(); - up_write(&net_sem); + up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); -- cgit v1.2.3-70-g09d2 From 8518e9bb98b602eca0717d5aaad63ccbe56539d2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 27 Mar 2018 18:02:32 +0300 Subject: net: Add more comments This adds comments to different places to improve readability. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/net/net_namespace.h | 4 ++++ net/core/net_namespace.c | 2 ++ net/core/rtnetlink.c | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 922e8b6fb422..1ab4f920f109 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -323,6 +323,10 @@ struct pernet_operations { * have to keep in mind all other pernet_operations and * to introduce a locking, if they share common resources. * + * The only time they are called with exclusive lock is + * from register_pernet_subsys(), unregister_pernet_subsys() + * register_pernet_device() and unregister_pernet_device(). + * * Exit methods using blocking RCU primitives, such as * synchronize_rcu(), should be implemented via exit_batch. * Then, destruction of a group of net requires single diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9e8ee4640451..b5796d17a302 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -43,6 +43,8 @@ static bool init_net_initialized; /* * pernet_ops_rwsem: protects: pernet_list, net_generic_ids, * init_net_initialized and first_device pointer. + * This is internal net namespace object. Please, don't use it + * outside. */ DECLARE_RWSEM(pernet_ops_rwsem); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 73011a60434c..2d3949789cef 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -459,7 +459,7 @@ static void rtnl_lock_unregistering_all(void) */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { - /* Close the race with cleanup_net() */ + /* Close the race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); __rtnl_link_unregister(ops); -- cgit v1.2.3-70-g09d2 From 827efed6a66ef8a1c071400b5952fee4a5ffedf9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 27 Mar 2018 23:02:47 +0100 Subject: rxrpc: Trace resend Add a tracepoint to trace packet resend events and to dump the Tx annotation buffer for added illumination. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 24 ++++++++++++++++++++++++ net/rxrpc/call_event.c | 1 + 2 files changed, 25 insertions(+) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 36cb50c111a6..41979f907575 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -420,6 +420,7 @@ rxrpc_rtt_rx_traces; rxrpc_timer_traces; rxrpc_propose_ack_traces; rxrpc_propose_ack_outcomes; +rxrpc_congest_modes; rxrpc_congest_changes; /* @@ -1229,6 +1230,29 @@ TRACE_EVENT(rxrpc_connect_call, __entry->call_id) ); +TRACE_EVENT(rxrpc_resend, + TP_PROTO(struct rxrpc_call *call, int ix), + + TP_ARGS(call, ix), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(int, ix ) + __array(u8, anno, 64 ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->ix = ix; + memcpy(__entry->anno, call->rxtx_annotations, 64); + ), + + TP_printk("c=%p ix=%u a=%64phN", + __entry->call, + __entry->ix, + __entry->anno) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index ad2ab1103189..6a62e42e1d8d 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -195,6 +195,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) * the packets in the Tx buffer we're going to resend and what the new * resend timeout will be. */ + trace_rxrpc_resend(call, (cursor + 1) & RXRPC_RXTX_BUFF_MASK); oldest = now; for (seq = cursor + 1; before_eq(seq, top); seq++) { ix = seq & RXRPC_RXTX_BUFF_MASK; -- cgit v1.2.3-70-g09d2 From a25e21f0bcd25673b91b97b9805db33350feec0f Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 27 Mar 2018 23:03:00 +0100 Subject: rxrpc, afs: Use debug_ids rather than pointers in traces In rxrpc and afs, use the debug_ids that are monotonically allocated to various objects as they're allocated rather than pointers as kernel pointers are now hashed making them less useful. Further, the debug ids aren't reused anywhere nearly as quickly. In addition, allow kernel services that use rxrpc, such as afs, to take numbers from the rxrpc counter, assign them to their own call struct and pass them in to rxrpc for both client and service calls so that the trace lines for each will have the same ID tag. Signed-off-by: David Howells --- fs/afs/internal.h | 1 + fs/afs/rxrpc.c | 12 ++-- include/net/af_rxrpc.h | 11 ++- include/trace/events/afs.h | 69 +++++++++---------- include/trace/events/rxrpc.h | 155 ++++++++++++++++++++++--------------------- net/rxrpc/af_rxrpc.c | 7 +- net/rxrpc/ar-internal.h | 8 +-- net/rxrpc/call_accept.c | 18 +++-- net/rxrpc/call_object.c | 15 +++-- net/rxrpc/conn_event.c | 3 +- net/rxrpc/input.c | 6 +- net/rxrpc/sendmsg.c | 3 +- 12 files changed, 163 insertions(+), 145 deletions(-) (limited to 'net') diff --git a/fs/afs/internal.h b/fs/afs/internal.h index f38d6a561a84..72217170b155 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -118,6 +118,7 @@ struct afs_call { bool ret_reply0; /* T if should return reply[0] on success */ bool upgrade; /* T to request service upgrade */ u16 service_id; /* Actual service ID (after upgrade) */ + unsigned int debug_id; /* Trace ID */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ __be32 tmp; /* place to extract temporary data */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index e1126659f043..b819900916e6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -131,6 +131,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net, call->type = type; call->net = net; + call->debug_id = atomic_inc_return(&rxrpc_debug_id); atomic_set(&call->usage, 1); INIT_WORK(&call->async_work, afs_process_async_call); init_waitqueue_head(&call->waitq); @@ -169,11 +170,12 @@ void afs_put_call(struct afs_call *call) afs_put_server(call->net, call->cm_server); afs_put_cb_interest(call->net, call->cbi); kfree(call->request); - kfree(call); - o = atomic_dec_return(&net->nr_outstanding_calls); trace_afs_call(call, afs_call_trace_free, 0, o, __builtin_return_address(0)); + kfree(call); + + o = atomic_dec_return(&net->nr_outstanding_calls); if (o == 0) wake_up_atomic_t(&net->nr_outstanding_calls); } @@ -378,7 +380,8 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, (async ? afs_wake_up_async_call : afs_wake_up_call_waiter), - call->upgrade); + call->upgrade, + call->debug_id); if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); goto error_kill_call; @@ -727,7 +730,8 @@ void afs_charge_preallocation(struct work_struct *work) afs_wake_up_async_call, afs_rx_attach, (unsigned long)call, - GFP_KERNEL) < 0) + GFP_KERNEL, + call->debug_id) < 0) break; call = NULL; } diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 2b3a6eec4570..8ae8ee004258 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -31,6 +31,11 @@ enum rxrpc_call_completion { NR__RXRPC_CALL_COMPLETIONS }; +/* + * Debug ID counter for tracing. + */ +extern atomic_t rxrpc_debug_id; + typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *, @@ -50,7 +55,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *, s64, gfp_t, rxrpc_notify_rx_t, - bool); + bool, + unsigned int); int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, struct msghdr *, size_t, rxrpc_notify_end_tx_t); @@ -63,7 +69,8 @@ void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *); u64 rxrpc_kernel_get_rtt(struct socket *, struct rxrpc_call *); int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, - rxrpc_user_attach_call_t, unsigned long, gfp_t); + rxrpc_user_attach_call_t, unsigned long, gfp_t, + unsigned int); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); int rxrpc_kernel_retry_call(struct socket *, struct rxrpc_call *, struct sockaddr_rxrpc *, struct key *); diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 6b59c63a8e51..63815f66b274 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -133,8 +133,7 @@ TRACE_EVENT(afs_recv_data, TP_ARGS(call, count, offset, want_more, ret), TP_STRUCT__entry( - __field(struct rxrpc_call *, rxcall ) - __field(struct afs_call *, call ) + __field(unsigned int, call ) __field(enum afs_call_state, state ) __field(unsigned int, count ) __field(unsigned int, offset ) @@ -144,8 +143,7 @@ TRACE_EVENT(afs_recv_data, ), TP_fast_assign( - __entry->rxcall = call->rxcall; - __entry->call = call; + __entry->call = call->debug_id; __entry->state = call->state; __entry->unmarshall = call->unmarshall; __entry->count = count; @@ -154,8 +152,7 @@ TRACE_EVENT(afs_recv_data, __entry->ret = ret; ), - TP_printk("c=%p ac=%p s=%u u=%u %u/%u wm=%u ret=%d", - __entry->rxcall, + TP_printk("c=%08x s=%u u=%u %u/%u wm=%u ret=%d", __entry->call, __entry->state, __entry->unmarshall, __entry->offset, __entry->count, @@ -168,21 +165,18 @@ TRACE_EVENT(afs_notify_call, TP_ARGS(rxcall, call), TP_STRUCT__entry( - __field(struct rxrpc_call *, rxcall ) - __field(struct afs_call *, call ) + __field(unsigned int, call ) __field(enum afs_call_state, state ) __field(unsigned short, unmarshall ) ), TP_fast_assign( - __entry->rxcall = rxcall; - __entry->call = call; + __entry->call = call->debug_id; __entry->state = call->state; __entry->unmarshall = call->unmarshall; ), - TP_printk("c=%p ac=%p s=%u u=%u", - __entry->rxcall, + TP_printk("c=%08x s=%u u=%u", __entry->call, __entry->state, __entry->unmarshall) ); @@ -193,21 +187,18 @@ TRACE_EVENT(afs_cb_call, TP_ARGS(call), TP_STRUCT__entry( - __field(struct rxrpc_call *, rxcall ) - __field(struct afs_call *, call ) + __field(unsigned int, call ) __field(const char *, name ) __field(u32, op ) ), TP_fast_assign( - __entry->rxcall = call->rxcall; - __entry->call = call; + __entry->call = call->debug_id; __entry->name = call->type->name; __entry->op = call->operation_ID; ), - TP_printk("c=%p ac=%p %s o=%u", - __entry->rxcall, + TP_printk("c=%08x %s o=%u", __entry->call, __entry->name, __entry->op) @@ -220,7 +211,7 @@ TRACE_EVENT(afs_call, TP_ARGS(call, op, usage, outstanding, where), TP_STRUCT__entry( - __field(struct afs_call *, call ) + __field(unsigned int, call ) __field(int, op ) __field(int, usage ) __field(int, outstanding ) @@ -228,14 +219,14 @@ TRACE_EVENT(afs_call, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->op = op; __entry->usage = usage; __entry->outstanding = outstanding; __entry->where = where; ), - TP_printk("c=%p %s u=%d o=%d sp=%pSR", + TP_printk("c=%08x %s u=%d o=%d sp=%pSR", __entry->call, __print_symbolic(__entry->op, afs_call_traces), __entry->usage, @@ -249,13 +240,13 @@ TRACE_EVENT(afs_make_fs_call, TP_ARGS(call, fid), TP_STRUCT__entry( - __field(struct afs_call *, call ) + __field(unsigned int, call ) __field(enum afs_fs_operation, op ) __field_struct(struct afs_fid, fid ) ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->op = call->operation_ID; if (fid) { __entry->fid = *fid; @@ -266,7 +257,7 @@ TRACE_EVENT(afs_make_fs_call, } ), - TP_printk("c=%p %06x:%06x:%06x %s", + TP_printk("c=%08x %06x:%06x:%06x %s", __entry->call, __entry->fid.vid, __entry->fid.vnode, @@ -280,16 +271,16 @@ TRACE_EVENT(afs_make_vl_call, TP_ARGS(call), TP_STRUCT__entry( - __field(struct afs_call *, call ) + __field(unsigned int, call ) __field(enum afs_vl_operation, op ) ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->op = call->operation_ID; ), - TP_printk("c=%p %s", + TP_printk("c=%08x %s", __entry->call, __print_symbolic(__entry->op, afs_vl_operations)) ); @@ -300,20 +291,20 @@ TRACE_EVENT(afs_call_done, TP_ARGS(call), TP_STRUCT__entry( - __field(struct afs_call *, call ) + __field(unsigned int, call ) __field(struct rxrpc_call *, rx_call ) __field(int, ret ) __field(u32, abort_code ) ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->rx_call = call->rxcall; __entry->ret = call->error; __entry->abort_code = call->abort_code; ), - TP_printk(" c=%p ret=%d ab=%d [%p]", + TP_printk(" c=%08x ret=%d ab=%d [%p]", __entry->call, __entry->ret, __entry->abort_code, @@ -327,7 +318,7 @@ TRACE_EVENT(afs_send_pages, TP_ARGS(call, msg, first, last, offset), TP_STRUCT__entry( - __field(struct afs_call *, call ) + __field(unsigned int, call ) __field(pgoff_t, first ) __field(pgoff_t, last ) __field(unsigned int, nr ) @@ -337,7 +328,7 @@ TRACE_EVENT(afs_send_pages, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->first = first; __entry->last = last; __entry->nr = msg->msg_iter.nr_segs; @@ -346,7 +337,7 @@ TRACE_EVENT(afs_send_pages, __entry->flags = msg->msg_flags; ), - TP_printk(" c=%p %lx-%lx-%lx b=%x o=%x f=%x", + TP_printk(" c=%08x %lx-%lx-%lx b=%x o=%x f=%x", __entry->call, __entry->first, __entry->first + __entry->nr - 1, __entry->last, __entry->bytes, __entry->offset, @@ -360,7 +351,7 @@ TRACE_EVENT(afs_sent_pages, TP_ARGS(call, first, last, cursor, ret), TP_STRUCT__entry( - __field(struct afs_call *, call ) + __field(unsigned int, call ) __field(pgoff_t, first ) __field(pgoff_t, last ) __field(pgoff_t, cursor ) @@ -368,14 +359,14 @@ TRACE_EVENT(afs_sent_pages, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->first = first; __entry->last = last; __entry->cursor = cursor; __entry->ret = ret; ), - TP_printk(" c=%p %lx-%lx c=%lx r=%d", + TP_printk(" c=%08x %lx-%lx c=%lx r=%d", __entry->call, __entry->first, __entry->last, __entry->cursor, __entry->ret) @@ -450,7 +441,7 @@ TRACE_EVENT(afs_call_state, TP_ARGS(call, from, to, ret, remote_abort), TP_STRUCT__entry( - __field(struct afs_call *, call ) + __field(unsigned int, call ) __field(enum afs_call_state, from ) __field(enum afs_call_state, to ) __field(int, ret ) @@ -458,14 +449,14 @@ TRACE_EVENT(afs_call_state, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->from = from; __entry->to = to; __entry->ret = ret; __entry->abort = remote_abort; ), - TP_printk("c=%p %u->%u r=%d ab=%d", + TP_printk("c=%08x %u->%u r=%d ab=%d", __entry->call, __entry->from, __entry->to, __entry->ret, __entry->abort) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 41979f907575..4d2c2d35c5cb 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -439,20 +439,20 @@ TRACE_EVENT(rxrpc_conn, TP_ARGS(conn, op, usage, where), TP_STRUCT__entry( - __field(struct rxrpc_connection *, conn ) - __field(int, op ) - __field(int, usage ) - __field(const void *, where ) + __field(unsigned int, conn ) + __field(int, op ) + __field(int, usage ) + __field(const void *, where ) ), TP_fast_assign( - __entry->conn = conn; + __entry->conn = conn->debug_id; __entry->op = op; __entry->usage = usage; __entry->where = where; ), - TP_printk("C=%p %s u=%d sp=%pSR", + TP_printk("C=%08x %s u=%d sp=%pSR", __entry->conn, __print_symbolic(__entry->op, rxrpc_conn_traces), __entry->usage, @@ -466,7 +466,7 @@ TRACE_EVENT(rxrpc_client, TP_ARGS(conn, channel, op), TP_STRUCT__entry( - __field(struct rxrpc_connection *, conn ) + __field(unsigned int, conn ) __field(u32, cid ) __field(int, channel ) __field(int, usage ) @@ -475,7 +475,7 @@ TRACE_EVENT(rxrpc_client, ), TP_fast_assign( - __entry->conn = conn; + __entry->conn = conn->debug_id; __entry->channel = channel; __entry->usage = atomic_read(&conn->usage); __entry->op = op; @@ -483,7 +483,7 @@ TRACE_EVENT(rxrpc_client, __entry->cs = conn->cache_state; ), - TP_printk("C=%p h=%2d %s %s i=%08x u=%d", + TP_printk("C=%08x h=%2d %s %s i=%08x u=%d", __entry->conn, __entry->channel, __print_symbolic(__entry->op, rxrpc_client_traces), @@ -499,7 +499,7 @@ TRACE_EVENT(rxrpc_call, TP_ARGS(call, op, usage, where, aux), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(int, op ) __field(int, usage ) __field(const void *, where ) @@ -507,14 +507,14 @@ TRACE_EVENT(rxrpc_call, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->op = op; __entry->usage = usage; __entry->where = where; __entry->aux = aux; ), - TP_printk("c=%p %s u=%d sp=%pSR a=%p", + TP_printk("c=%08x %s u=%d sp=%pSR a=%p", __entry->call, __print_symbolic(__entry->op, rxrpc_call_traces), __entry->usage, @@ -593,12 +593,13 @@ TRACE_EVENT(rxrpc_rx_done, ); TRACE_EVENT(rxrpc_abort, - TP_PROTO(const char *why, u32 cid, u32 call_id, rxrpc_seq_t seq, - int abort_code, int error), + TP_PROTO(unsigned int call_nr, const char *why, u32 cid, u32 call_id, + rxrpc_seq_t seq, int abort_code, int error), - TP_ARGS(why, cid, call_id, seq, abort_code, error), + TP_ARGS(call_nr, why, cid, call_id, seq, abort_code, error), TP_STRUCT__entry( + __field(unsigned int, call_nr ) __array(char, why, 4 ) __field(u32, cid ) __field(u32, call_id ) @@ -609,6 +610,7 @@ TRACE_EVENT(rxrpc_abort, TP_fast_assign( memcpy(__entry->why, why, 4); + __entry->call_nr = call_nr; __entry->cid = cid; __entry->call_id = call_id; __entry->abort_code = abort_code; @@ -616,7 +618,8 @@ TRACE_EVENT(rxrpc_abort, __entry->seq = seq; ), - TP_printk("%08x:%08x s=%u a=%d e=%d %s", + TP_printk("c=%08x %08x:%08x s=%u a=%d e=%d %s", + __entry->call_nr, __entry->cid, __entry->call_id, __entry->seq, __entry->abort_code, __entry->error, __entry->why) ); @@ -627,7 +630,7 @@ TRACE_EVENT(rxrpc_transmit, TP_ARGS(call, why), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(enum rxrpc_transmit_trace, why ) __field(rxrpc_seq_t, tx_hard_ack ) __field(rxrpc_seq_t, tx_top ) @@ -635,14 +638,14 @@ TRACE_EVENT(rxrpc_transmit, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->why = why; __entry->tx_hard_ack = call->tx_hard_ack; __entry->tx_top = call->tx_top; __entry->tx_winsize = call->tx_winsize; ), - TP_printk("c=%p %s f=%08x n=%u/%u", + TP_printk("c=%08x %s f=%08x n=%u/%u", __entry->call, __print_symbolic(__entry->why, rxrpc_transmit_traces), __entry->tx_hard_ack + 1, @@ -657,7 +660,7 @@ TRACE_EVENT(rxrpc_rx_data, TP_ARGS(call, seq, serial, flags, anno), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(rxrpc_seq_t, seq ) __field(rxrpc_serial_t, serial ) __field(u8, flags ) @@ -665,14 +668,14 @@ TRACE_EVENT(rxrpc_rx_data, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->seq = seq; __entry->serial = serial; __entry->flags = flags; __entry->anno = anno; ), - TP_printk("c=%p DATA %08x q=%08x fl=%02x a=%02x", + TP_printk("c=%08x DATA %08x q=%08x fl=%02x a=%02x", __entry->call, __entry->serial, __entry->seq, @@ -688,7 +691,7 @@ TRACE_EVENT(rxrpc_rx_ack, TP_ARGS(call, serial, ack_serial, first, prev, reason, n_acks), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(rxrpc_serial_t, serial ) __field(rxrpc_serial_t, ack_serial ) __field(rxrpc_seq_t, first ) @@ -698,7 +701,7 @@ TRACE_EVENT(rxrpc_rx_ack, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->serial = serial; __entry->ack_serial = ack_serial; __entry->first = first; @@ -707,7 +710,7 @@ TRACE_EVENT(rxrpc_rx_ack, __entry->n_acks = n_acks; ), - TP_printk("c=%p %08x %s r=%08x f=%08x p=%08x n=%u", + TP_printk("c=%08x %08x %s r=%08x f=%08x p=%08x n=%u", __entry->call, __entry->serial, __print_symbolic(__entry->reason, rxrpc_ack_names), @@ -724,18 +727,18 @@ TRACE_EVENT(rxrpc_rx_abort, TP_ARGS(call, serial, abort_code), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(rxrpc_serial_t, serial ) __field(u32, abort_code ) ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->serial = serial; __entry->abort_code = abort_code; ), - TP_printk("c=%p ABORT %08x ac=%d", + TP_printk("c=%08x ABORT %08x ac=%d", __entry->call, __entry->serial, __entry->abort_code) @@ -748,20 +751,20 @@ TRACE_EVENT(rxrpc_rx_rwind_change, TP_ARGS(call, serial, rwind, wake), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(rxrpc_serial_t, serial ) __field(u32, rwind ) __field(bool, wake ) ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->serial = serial; __entry->rwind = rwind; __entry->wake = wake; ), - TP_printk("c=%p %08x rw=%u%s", + TP_printk("c=%08x %08x rw=%u%s", __entry->call, __entry->serial, __entry->rwind, @@ -775,7 +778,7 @@ TRACE_EVENT(rxrpc_tx_data, TP_ARGS(call, seq, serial, flags, retrans, lose), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(rxrpc_seq_t, seq ) __field(rxrpc_serial_t, serial ) __field(u8, flags ) @@ -784,7 +787,7 @@ TRACE_EVENT(rxrpc_tx_data, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->seq = seq; __entry->serial = serial; __entry->flags = flags; @@ -792,7 +795,7 @@ TRACE_EVENT(rxrpc_tx_data, __entry->lose = lose; ), - TP_printk("c=%p DATA %08x q=%08x fl=%02x%s%s", + TP_printk("c=%08x DATA %08x q=%08x fl=%02x%s%s", __entry->call, __entry->serial, __entry->seq, @@ -809,7 +812,7 @@ TRACE_EVENT(rxrpc_tx_ack, TP_ARGS(call, serial, ack_first, ack_serial, reason, n_acks), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(rxrpc_serial_t, serial ) __field(rxrpc_seq_t, ack_first ) __field(rxrpc_serial_t, ack_serial ) @@ -818,7 +821,7 @@ TRACE_EVENT(rxrpc_tx_ack, ), TP_fast_assign( - __entry->call = call; + __entry->call = call ? call->debug_id : 0; __entry->serial = serial; __entry->ack_first = ack_first; __entry->ack_serial = ack_serial; @@ -826,7 +829,7 @@ TRACE_EVENT(rxrpc_tx_ack, __entry->n_acks = n_acks; ), - TP_printk(" c=%p ACK %08x %s f=%08x r=%08x n=%u", + TP_printk(" c=%08x ACK %08x %s f=%08x r=%08x n=%u", __entry->call, __entry->serial, __print_symbolic(__entry->reason, rxrpc_ack_names), @@ -842,7 +845,7 @@ TRACE_EVENT(rxrpc_receive, TP_ARGS(call, why, serial, seq), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(enum rxrpc_receive_trace, why ) __field(rxrpc_serial_t, serial ) __field(rxrpc_seq_t, seq ) @@ -851,7 +854,7 @@ TRACE_EVENT(rxrpc_receive, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->why = why; __entry->serial = serial; __entry->seq = seq; @@ -859,7 +862,7 @@ TRACE_EVENT(rxrpc_receive, __entry->top = call->rx_top; ), - TP_printk("c=%p %s r=%08x q=%08x w=%08x-%08x", + TP_printk("c=%08x %s r=%08x q=%08x w=%08x-%08x", __entry->call, __print_symbolic(__entry->why, rxrpc_receive_traces), __entry->serial, @@ -876,7 +879,7 @@ TRACE_EVENT(rxrpc_recvmsg, TP_ARGS(call, why, seq, offset, len, ret), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(enum rxrpc_recvmsg_trace, why ) __field(rxrpc_seq_t, seq ) __field(unsigned int, offset ) @@ -885,7 +888,7 @@ TRACE_EVENT(rxrpc_recvmsg, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->why = why; __entry->seq = seq; __entry->offset = offset; @@ -893,7 +896,7 @@ TRACE_EVENT(rxrpc_recvmsg, __entry->ret = ret; ), - TP_printk("c=%p %s q=%08x o=%u l=%u ret=%d", + TP_printk("c=%08x %s q=%08x o=%u l=%u ret=%d", __entry->call, __print_symbolic(__entry->why, rxrpc_recvmsg_traces), __entry->seq, @@ -909,18 +912,18 @@ TRACE_EVENT(rxrpc_rtt_tx, TP_ARGS(call, why, send_serial), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(enum rxrpc_rtt_tx_trace, why ) __field(rxrpc_serial_t, send_serial ) ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->why = why; __entry->send_serial = send_serial; ), - TP_printk("c=%p %s sr=%08x", + TP_printk("c=%08x %s sr=%08x", __entry->call, __print_symbolic(__entry->why, rxrpc_rtt_tx_traces), __entry->send_serial) @@ -934,7 +937,7 @@ TRACE_EVENT(rxrpc_rtt_rx, TP_ARGS(call, why, send_serial, resp_serial, rtt, nr, avg), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(enum rxrpc_rtt_rx_trace, why ) __field(u8, nr ) __field(rxrpc_serial_t, send_serial ) @@ -944,7 +947,7 @@ TRACE_EVENT(rxrpc_rtt_rx, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->why = why; __entry->send_serial = send_serial; __entry->resp_serial = resp_serial; @@ -953,7 +956,7 @@ TRACE_EVENT(rxrpc_rtt_rx, __entry->avg = avg; ), - TP_printk("c=%p %s sr=%08x rr=%08x rtt=%lld nr=%u avg=%lld", + TP_printk("c=%08x %s sr=%08x rr=%08x rtt=%lld nr=%u avg=%lld", __entry->call, __print_symbolic(__entry->why, rxrpc_rtt_rx_traces), __entry->send_serial, @@ -970,7 +973,7 @@ TRACE_EVENT(rxrpc_timer, TP_ARGS(call, why, now), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(enum rxrpc_timer_trace, why ) __field(long, now ) __field(long, ack_at ) @@ -984,7 +987,7 @@ TRACE_EVENT(rxrpc_timer, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->why = why; __entry->now = now; __entry->ack_at = call->ack_at; @@ -996,7 +999,7 @@ TRACE_EVENT(rxrpc_timer, __entry->timer = call->timer.expires; ), - TP_printk("c=%p %s a=%ld la=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld", + TP_printk("c=%08x %s a=%ld la=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld", __entry->call, __print_symbolic(__entry->why, rxrpc_timer_traces), __entry->ack_at - __entry->now, @@ -1039,7 +1042,7 @@ TRACE_EVENT(rxrpc_propose_ack, outcome), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(enum rxrpc_propose_ack_trace, why ) __field(rxrpc_serial_t, serial ) __field(u8, ack_reason ) @@ -1049,7 +1052,7 @@ TRACE_EVENT(rxrpc_propose_ack, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->why = why; __entry->serial = serial; __entry->ack_reason = ack_reason; @@ -1058,7 +1061,7 @@ TRACE_EVENT(rxrpc_propose_ack, __entry->outcome = outcome; ), - TP_printk("c=%p %s %s r=%08x i=%u b=%u%s", + TP_printk("c=%08x %s %s r=%08x i=%u b=%u%s", __entry->call, __print_symbolic(__entry->why, rxrpc_propose_ack_traces), __print_symbolic(__entry->ack_reason, rxrpc_ack_names), @@ -1075,20 +1078,20 @@ TRACE_EVENT(rxrpc_retransmit, TP_ARGS(call, seq, annotation, expiry), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(rxrpc_seq_t, seq ) __field(u8, annotation ) __field(s64, expiry ) ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->seq = seq; __entry->annotation = annotation; __entry->expiry = expiry; ), - TP_printk("c=%p q=%x a=%02x xp=%lld", + TP_printk("c=%08x q=%x a=%02x xp=%lld", __entry->call, __entry->seq, __entry->annotation, @@ -1102,7 +1105,7 @@ TRACE_EVENT(rxrpc_congest, TP_ARGS(call, summary, ack_serial, change), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(enum rxrpc_congest_change, change ) __field(rxrpc_seq_t, hard_ack ) __field(rxrpc_seq_t, top ) @@ -1112,7 +1115,7 @@ TRACE_EVENT(rxrpc_congest, ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->change = change; __entry->hard_ack = call->tx_hard_ack; __entry->top = call->tx_top; @@ -1121,7 +1124,7 @@ TRACE_EVENT(rxrpc_congest, memcpy(&__entry->sum, summary, sizeof(__entry->sum)); ), - TP_printk("c=%p r=%08x %s q=%08x %s cw=%u ss=%u nr=%u,%u nw=%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s", + TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nr=%u,%u nw=%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s", __entry->call, __entry->ack_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), @@ -1146,16 +1149,16 @@ TRACE_EVENT(rxrpc_disconnect_call, TP_ARGS(call), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(u32, abort_code ) ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->abort_code = call->abort_code; ), - TP_printk("c=%p ab=%08x", + TP_printk("c=%08x ab=%08x", __entry->call, __entry->abort_code) ); @@ -1166,16 +1169,16 @@ TRACE_EVENT(rxrpc_improper_term, TP_ARGS(call), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(u32, abort_code ) ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->abort_code = call->abort_code; ), - TP_printk("c=%p ab=%08x", + TP_printk("c=%08x ab=%08x", __entry->call, __entry->abort_code) ); @@ -1187,18 +1190,18 @@ TRACE_EVENT(rxrpc_rx_eproto, TP_ARGS(call, serial, why), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(rxrpc_serial_t, serial ) __field(const char *, why ) ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->serial = serial; __entry->why = why; ), - TP_printk("c=%p EPROTO %08x %s", + TP_printk("c=%08x EPROTO %08x %s", __entry->call, __entry->serial, __entry->why) @@ -1210,20 +1213,20 @@ TRACE_EVENT(rxrpc_connect_call, TP_ARGS(call), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(unsigned long, user_call_ID ) __field(u32, cid ) __field(u32, call_id ) ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->user_call_ID = call->user_call_ID; __entry->cid = call->cid; __entry->call_id = call->call_id; ), - TP_printk("c=%p u=%p %08x:%08x", + TP_printk("c=%08x u=%p %08x:%08x", __entry->call, (void *)__entry->user_call_ID, __entry->cid, @@ -1236,18 +1239,18 @@ TRACE_EVENT(rxrpc_resend, TP_ARGS(call, ix), TP_STRUCT__entry( - __field(struct rxrpc_call *, call ) + __field(unsigned int, call ) __field(int, ix ) __array(u8, anno, 64 ) ), TP_fast_assign( - __entry->call = call; + __entry->call = call->debug_id; __entry->ix = ix; memcpy(__entry->anno, call->rxtx_annotations, 64); ), - TP_printk("c=%p ix=%u a=%64phN", + TP_printk("c=%08x ix=%u a=%64phN", __entry->call, __entry->ix, __entry->anno) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 9e1c2c6b6a67..ec5ec68be1aa 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -40,6 +40,7 @@ static const struct proto_ops rxrpc_rpc_ops; /* current debugging ID */ atomic_t rxrpc_debug_id; +EXPORT_SYMBOL(rxrpc_debug_id); /* count of skbs currently in use */ atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; @@ -267,6 +268,7 @@ static int rxrpc_listen(struct socket *sock, int backlog) * @gfp: The allocation constraints * @notify_rx: Where to send notifications instead of socket queue * @upgrade: Request service upgrade for call + * @debug_id: The debug ID for tracing to be assigned to the call * * Allow a kernel service to begin a call on the nominated socket. This just * sets up all the internal tracking structures and allocates connection and @@ -282,7 +284,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, s64 tx_total_len, gfp_t gfp, rxrpc_notify_rx_t notify_rx, - bool upgrade) + bool upgrade, + unsigned int debug_id) { struct rxrpc_conn_parameters cp; struct rxrpc_call_params p; @@ -314,7 +317,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, cp.exclusive = false; cp.upgrade = upgrade; cp.service_id = srx->srx_service; - call = rxrpc_new_client_call(rx, &cp, srx, &p, gfp); + call = rxrpc_new_client_call(rx, &cp, srx, &p, gfp, debug_id); /* The socket has been unlocked. */ if (!IS_ERR(call)) { call->notify_rx = notify_rx; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 416688381eb7..9c9817ddafc5 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -691,7 +691,6 @@ struct rxrpc_send_params { * af_rxrpc.c */ extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; -extern atomic_t rxrpc_debug_id; extern struct workqueue_struct *rxrpc_workqueue; /* @@ -732,11 +731,12 @@ extern unsigned int rxrpc_max_call_lifetime; extern struct kmem_cache *rxrpc_call_jar; struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); -struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t); +struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, - struct rxrpc_call_params *, gfp_t); + struct rxrpc_call_params *, gfp_t, + unsigned int); int rxrpc_retry_client_call(struct rxrpc_sock *, struct rxrpc_call *, struct rxrpc_conn_parameters *, @@ -822,7 +822,7 @@ static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call, rxrpc_seq_t seq, u32 abort_code, int error) { - trace_rxrpc_abort(why, call->cid, call->call_id, seq, + trace_rxrpc_abort(call->debug_id, why, call->cid, call->call_id, seq, abort_code, error); return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED, abort_code, error); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 3028298ca561..92ebd1d7e0bb 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -34,7 +34,8 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, struct rxrpc_backlog *b, rxrpc_notify_rx_t notify_rx, rxrpc_user_attach_call_t user_attach_call, - unsigned long user_call_ID, gfp_t gfp) + unsigned long user_call_ID, gfp_t gfp, + unsigned int debug_id) { const void *here = __builtin_return_address(0); struct rxrpc_call *call; @@ -94,7 +95,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, /* Now it gets complicated, because calls get registered with the * socket here, particularly if a user ID is preassigned by the user. */ - call = rxrpc_alloc_call(rx, gfp); + call = rxrpc_alloc_call(rx, gfp, debug_id); if (!call) return -ENOMEM; call->flags |= (1 << RXRPC_CALL_IS_SERVICE); @@ -174,7 +175,8 @@ int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp) if (rx->discard_new_call) return 0; - while (rxrpc_service_prealloc_one(rx, b, NULL, NULL, 0, gfp) == 0) + while (rxrpc_service_prealloc_one(rx, b, NULL, NULL, 0, gfp, + atomic_inc_return(&rxrpc_debug_id)) == 0) ; return 0; @@ -347,7 +349,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, service_id == rx->second_service)) goto found_service; - trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, EOPNOTSUPP); skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; skb->priority = RX_INVALID_OPERATION; @@ -358,7 +360,7 @@ found_service: spin_lock(&rx->incoming_lock); if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED || rx->sk.sk_state == RXRPC_CLOSE) { - trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber, + trace_rxrpc_abort(0, "CLS", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN); skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT; skb->priority = RX_INVALID_OPERATION; @@ -635,6 +637,7 @@ out_discard: * @user_attach_call: Func to attach call to user_call_ID * @user_call_ID: The tag to attach to the preallocated call * @gfp: The allocation conditions. + * @debug_id: The tracing debug ID. * * Charge up the socket with preallocated calls, each with a user ID. A * function should be provided to effect the attachment from the user's side. @@ -645,7 +648,8 @@ out_discard: int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, rxrpc_user_attach_call_t user_attach_call, - unsigned long user_call_ID, gfp_t gfp) + unsigned long user_call_ID, gfp_t gfp, + unsigned int debug_id) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct rxrpc_backlog *b = rx->backlog; @@ -655,6 +659,6 @@ int rxrpc_kernel_charge_accept(struct socket *sock, return rxrpc_service_prealloc_one(rx, b, notify_rx, user_attach_call, user_call_ID, - gfp); + gfp, debug_id); } EXPORT_SYMBOL(rxrpc_kernel_charge_accept); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 0b2db38dd32d..147657dfe757 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -99,7 +99,8 @@ found_extant_call: /* * allocate a new call */ -struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp) +struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, + unsigned int debug_id) { struct rxrpc_call *call; @@ -138,7 +139,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp) spin_lock_init(&call->notify_lock); rwlock_init(&call->state_lock); atomic_set(&call->usage, 1); - call->debug_id = atomic_inc_return(&rxrpc_debug_id); + call->debug_id = debug_id; call->tx_total_len = -1; call->next_rx_timo = 20 * HZ; call->next_req_timo = 1 * HZ; @@ -166,14 +167,15 @@ nomem: */ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, struct sockaddr_rxrpc *srx, - gfp_t gfp) + gfp_t gfp, + unsigned int debug_id) { struct rxrpc_call *call; ktime_t now; _enter(""); - call = rxrpc_alloc_call(rx, gfp); + call = rxrpc_alloc_call(rx, gfp, debug_id); if (!call) return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; @@ -214,7 +216,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_conn_parameters *cp, struct sockaddr_rxrpc *srx, struct rxrpc_call_params *p, - gfp_t gfp) + gfp_t gfp, + unsigned int debug_id) __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call, *xcall; @@ -225,7 +228,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, _enter("%p,%lx", rx, p->user_call_ID); - call = rxrpc_alloc_client_call(rx, srx, gfp); + call = rxrpc_alloc_client_call(rx, srx, gfp, debug_id); if (IS_ERR(call)) { release_sock(&rx->sk); _leave(" = %ld", PTR_ERR(call)); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index b1dfae107431..d2ec3fd593e8 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -160,7 +160,8 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, lockdep_is_held(&conn->channel_lock)); if (call) { if (compl == RXRPC_CALL_LOCALLY_ABORTED) - trace_rxrpc_abort("CON", call->cid, + trace_rxrpc_abort(call->debug_id, + "CON", call->cid, call->call_id, 0, abort_code, error); if (rxrpc_set_call_completion(call, compl, diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 6fc61400337f..2a868fdab0ae 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1307,21 +1307,21 @@ out_unlock: wrong_security: rcu_read_unlock(); - trace_rxrpc_abort("SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + trace_rxrpc_abort(0, "SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RXKADINCONSISTENCY, EBADMSG); skb->priority = RXKADINCONSISTENCY; goto post_abort; reupgrade: rcu_read_unlock(); - trace_rxrpc_abort("UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_PROTOCOL_ERROR, EBADMSG); goto protocol_error; bad_message_unlock: rcu_read_unlock(); bad_message: - trace_rxrpc_abort("BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_PROTOCOL_ERROR, EBADMSG); protocol_error: skb->priority = RX_PROTOCOL_ERROR; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 09f2a3e05221..8503f279b467 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -579,7 +579,8 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, cp.exclusive = rx->exclusive | p->exclusive; cp.upgrade = p->upgrade; cp.service_id = srx->srx_service; - call = rxrpc_new_client_call(rx, &cp, srx, &p->call, GFP_KERNEL); + call = rxrpc_new_client_call(rx, &cp, srx, &p->call, GFP_KERNEL, + atomic_inc_return(&rxrpc_debug_id)); /* The socket is now unlocked */ _leave(" = %p\n", call); -- cgit v1.2.3-70-g09d2 From 1bae5d229532b4e8dfd5728cb3b8373bc9eec9eb Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 27 Mar 2018 23:08:20 +0100 Subject: rxrpc: Trace call completion Add a tracepoint to track rxrpc calls moving into the completed state and to log the completion type and the recorded error value and abort code. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 33 +++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 1 + 2 files changed, 34 insertions(+) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 4d2c2d35c5cb..2ea788f6f95d 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -400,6 +400,13 @@ enum rxrpc_congest_change { EM(RXRPC_ACK_IDLE, "IDL") \ E_(RXRPC_ACK__INVALID, "-?-") +#define rxrpc_completions \ + EM(RXRPC_CALL_SUCCEEDED, "Succeeded") \ + EM(RXRPC_CALL_REMOTELY_ABORTED, "RemoteAbort") \ + EM(RXRPC_CALL_LOCALLY_ABORTED, "LocalAbort") \ + EM(RXRPC_CALL_LOCAL_ERROR, "LocalError") \ + E_(RXRPC_CALL_NETWORK_ERROR, "NetError") + /* * Export enum symbols via userspace. */ @@ -624,6 +631,32 @@ TRACE_EVENT(rxrpc_abort, __entry->abort_code, __entry->error, __entry->why) ); +TRACE_EVENT(rxrpc_call_complete, + TP_PROTO(struct rxrpc_call *call), + + TP_ARGS(call), + + TP_STRUCT__entry( + __field(unsigned int, call ) + __field(enum rxrpc_call_completion, compl ) + __field(int, error ) + __field(u32, abort_code ) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->compl = call->completion; + __entry->error = call->error; + __entry->abort_code = call->abort_code; + ), + + TP_printk("c=%08x %s r=%d ac=%d", + __entry->call, + __print_symbolic(__entry->compl, rxrpc_completions), + __entry->error, + __entry->abort_code) + ); + TRACE_EVENT(rxrpc_transmit, TP_PROTO(struct rxrpc_call *call, enum rxrpc_transmit_trace why), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9c9817ddafc5..21cf164b6d85 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -778,6 +778,7 @@ static inline bool __rxrpc_set_call_completion(struct rxrpc_call *call, call->error = error; call->completion = compl, call->state = RXRPC_CALL_COMPLETE; + trace_rxrpc_call_complete(call); wake_up(&call->waitq); return true; } -- cgit v1.2.3-70-g09d2 From 6f5c39fa5cd4a78c5432021e981aa8f79437a32c Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Mon, 26 Mar 2018 08:36:57 -0700 Subject: bpf: Add sock_ops R/W access to ipv4 tos Sample usage for tos ... bpf_getsockopt(skops, SOL_IP, IP_TOS, &v, sizeof(v)) ... where skops is a pointer to the ctx (struct bpf_sock_ops). Signed-off-by: Nikita V. Shirokov Signed-off-by: Daniel Borkmann --- net/core/filter.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 00c711c5f1a2..afd825534ac4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3462,6 +3462,27 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } #ifdef CONFIG_INET + } else if (level == SOL_IP) { + if (optlen != sizeof(int) || sk->sk_family != AF_INET) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case IP_TOS: + if (val < -1 || val > 0xff) { + ret = -EINVAL; + } else { + struct inet_sock *inet = inet_sk(sk); + + if (val == -1) + val = 0; + inet->tos = val; + } + break; + default: + ret = -EINVAL; + } #if IS_ENABLED(CONFIG_IPV6) } else if (level == SOL_IPV6) { if (optlen != sizeof(int) || sk->sk_family != AF_INET6) @@ -3561,6 +3582,20 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, } else { goto err_clear; } + } else if (level == SOL_IP) { + struct inet_sock *inet = inet_sk(sk); + + if (optlen != sizeof(int) || sk->sk_family != AF_INET) + goto err_clear; + + /* Only some options are supported */ + switch (optname) { + case IP_TOS: + *((int *)optval) = (int)inet->tos; + break; + default: + goto err_clear; + } #if IS_ENABLED(CONFIG_IPV6) } else if (level == SOL_IPV6) { struct ipv6_pinfo *np = inet6_sk(sk); -- cgit v1.2.3-70-g09d2 From c105547501016897194358b11451608a8d5f9a02 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 28 Mar 2018 12:05:32 -0700 Subject: treewide: remove large struct-pass-by-value from tracepoint arguments - fix trace_hfi1_ctxt_info() to pass large struct by reference instead of by value - convert 'type array[]' tracepoint arguments into 'type *array', since compiler will warn that sizeof('type array[]') == sizeof('type *array') and later should be used instead The CAST_TO_U64 macro in the later patch will enforce that tracepoint arguments can only be integers, pointers, or less than 8 byte structures. Larger structures should be passed by reference. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- drivers/infiniband/hw/hfi1/file_ops.c | 2 +- drivers/infiniband/hw/hfi1/trace_ctxts.h | 12 ++++++------ include/trace/events/f2fs.h | 2 +- net/wireless/trace.h | 2 +- sound/firewire/amdtp-stream-trace.h | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 41fafebe3b0d..da4aa1a95b11 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -1153,7 +1153,7 @@ static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len) cinfo.sdma_ring_size = fd->cq->nentries; cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size; - trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo); + trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, &cinfo); if (copy_to_user((void __user *)arg, &cinfo, len)) return -EFAULT; diff --git a/drivers/infiniband/hw/hfi1/trace_ctxts.h b/drivers/infiniband/hw/hfi1/trace_ctxts.h index 4eb4cc798035..e00c8a7d559c 100644 --- a/drivers/infiniband/hw/hfi1/trace_ctxts.h +++ b/drivers/infiniband/hw/hfi1/trace_ctxts.h @@ -106,7 +106,7 @@ TRACE_EVENT(hfi1_uctxtdata, TRACE_EVENT(hfi1_ctxt_info, TP_PROTO(struct hfi1_devdata *dd, unsigned int ctxt, unsigned int subctxt, - struct hfi1_ctxt_info cinfo), + struct hfi1_ctxt_info *cinfo), TP_ARGS(dd, ctxt, subctxt, cinfo), TP_STRUCT__entry(DD_DEV_ENTRY(dd) __field(unsigned int, ctxt) @@ -120,11 +120,11 @@ TRACE_EVENT(hfi1_ctxt_info, TP_fast_assign(DD_DEV_ASSIGN(dd); __entry->ctxt = ctxt; __entry->subctxt = subctxt; - __entry->egrtids = cinfo.egrtids; - __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt; - __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize; - __entry->sdma_ring_size = cinfo.sdma_ring_size; - __entry->rcvegr_size = cinfo.rcvegr_size; + __entry->egrtids = cinfo->egrtids; + __entry->rcvhdrq_cnt = cinfo->rcvhdrq_cnt; + __entry->rcvhdrq_size = cinfo->rcvhdrq_entsize; + __entry->sdma_ring_size = cinfo->sdma_ring_size; + __entry->rcvegr_size = cinfo->rcvegr_size; ), TP_printk("[%s] ctxt %u:%u " CINFO_FMT, __get_str(dev), diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 06c87f9f720c..795698925d20 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -491,7 +491,7 @@ DEFINE_EVENT(f2fs__truncate_node, f2fs_truncate_node, TRACE_EVENT(f2fs_truncate_partial_nodes, - TP_PROTO(struct inode *inode, nid_t nid[], int depth, int err), + TP_PROTO(struct inode *inode, nid_t *nid, int depth, int err), TP_ARGS(inode, nid, depth, err), diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 5152938b358d..018c81fa72fb 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3137,7 +3137,7 @@ TRACE_EVENT(rdev_start_radar_detection, TRACE_EVENT(rdev_set_mcast_rate, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - int mcast_rate[NUM_NL80211_BANDS]), + int *mcast_rate), TP_ARGS(wiphy, netdev, mcast_rate), TP_STRUCT__entry( WIPHY_ENTRY diff --git a/sound/firewire/amdtp-stream-trace.h b/sound/firewire/amdtp-stream-trace.h index ea0d486652c8..54cdd4ffa9ce 100644 --- a/sound/firewire/amdtp-stream-trace.h +++ b/sound/firewire/amdtp-stream-trace.h @@ -14,7 +14,7 @@ #include TRACE_EVENT(in_packet, - TP_PROTO(const struct amdtp_stream *s, u32 cycles, u32 cip_header[2], unsigned int payload_length, unsigned int index), + TP_PROTO(const struct amdtp_stream *s, u32 cycles, u32 *cip_header, unsigned int payload_length, unsigned int index), TP_ARGS(s, cycles, cip_header, payload_length, index), TP_STRUCT__entry( __field(unsigned int, second) -- cgit v1.2.3-70-g09d2 From 14624a9387c4d80766d1fdd237b0d110a0a1b43d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 28 Mar 2018 12:05:34 -0700 Subject: net/mac802154: disambiguate mac80215 vs mac802154 trace events two trace events defined with the same name and both unused. They conflict in allyesconfig build. Rename one of them. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- net/mac802154/trace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac802154/trace.h b/net/mac802154/trace.h index 2c8a43d3607f..df855c33daf2 100644 --- a/net/mac802154/trace.h +++ b/net/mac802154/trace.h @@ -33,7 +33,7 @@ /* Tracing for driver callbacks */ -DECLARE_EVENT_CLASS(local_only_evt, +DECLARE_EVENT_CLASS(local_only_evt4, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local), TP_STRUCT__entry( @@ -45,7 +45,7 @@ DECLARE_EVENT_CLASS(local_only_evt, TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG) ); -DEFINE_EVENT(local_only_evt, 802154_drv_return_void, +DEFINE_EVENT(local_only_evt4, 802154_drv_return_void, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local) ); @@ -65,12 +65,12 @@ TRACE_EVENT(802154_drv_return_int, __entry->ret) ); -DEFINE_EVENT(local_only_evt, 802154_drv_start, +DEFINE_EVENT(local_only_evt4, 802154_drv_start, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local) ); -DEFINE_EVENT(local_only_evt, 802154_drv_stop, +DEFINE_EVENT(local_only_evt4, 802154_drv_stop, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local) ); -- cgit v1.2.3-70-g09d2 From 57566b20033f23bdc9f25d5fa4c36a7287aa08d2 Mon Sep 17 00:00:00 2001 From: "tamizhr@codeaurora.org" Date: Tue, 27 Mar 2018 19:16:16 +0530 Subject: mac80211: Use proper smps_mode enum in sta opmode event SMPS_MODE change value notified via nl80211 contains mac80211 specific value(ieee80211_smps_mode) and user space application will not know those values. This patch add support to map the mac80211 enum value to nl80211_smps_mode which will be understood by the userspace application. Signed-off-by: Tamizh chelvam Signed-off-by: Johannes Berg --- net/mac80211/ht.c | 15 +++++++++++++++ net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/rx.c | 3 ++- 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index d7523530d3f8..c78036a0ac94 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -466,6 +466,21 @@ void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_PEER_REQUEST); } +enum nl80211_smps_mode +ieee80211_smps_mode_to_smps_mode(enum ieee80211_smps_mode smps) +{ + switch (smps) { + case IEEE80211_SMPS_OFF: + return NL80211_SMPS_OFF; + case IEEE80211_SMPS_STATIC: + return NL80211_SMPS_STATIC; + case IEEE80211_SMPS_DYNAMIC: + return NL80211_SMPS_DYNAMIC; + default: + return NL80211_SMPS_OFF; + } +} + int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps, const u8 *da, const u8 *bssid) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index ae9c33cd8ada..9237ffb4e986 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1788,6 +1788,8 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid); void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid); u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs); +enum nl80211_smps_mode +ieee80211_smps_mode_to_smps_mode(enum ieee80211_smps_mode smps); /* VHT */ void diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 27bb1f0b5e52..f8c69acfda48 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2883,7 +2883,8 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) if (rx->sta->sta.smps_mode == smps_mode) goto handled; rx->sta->sta.smps_mode = smps_mode; - sta_opmode.smps_mode = smps_mode; + sta_opmode.smps_mode = + ieee80211_smps_mode_to_smps_mode(smps_mode); sta_opmode.changed = STA_OPMODE_SMPS_MODE_CHANGED; sband = rx->local->hw.wiphy->bands[status->band]; -- cgit v1.2.3-70-g09d2 From 97f5f4254a61d7a104b5e609a6a0d9a10c73d29e Mon Sep 17 00:00:00 2001 From: "tamizhr@codeaurora.org" Date: Tue, 27 Mar 2018 19:16:17 +0530 Subject: mac80211: Use proper chan_width enum in sta opmode event Bandwidth change value reported via nl80211 contains mac80211 specific enum value(ieee80211_sta_rx_bw) and which is not understand by userspace application. Map the mac80211 specific value to nl80211_chan_width enum value to avoid using wrong value in the userspace application. And used station's ht/vht capability to map IEEE80211_STA_RX_BW_20 and IEEE80211_STA_RX_BW_160 with proper nl80211 value. Signed-off-by: Tamizh chelvam Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/rx.c | 3 ++- net/mac80211/vht.c | 32 +++++++++++++++++++++++++++++++- 3 files changed, 35 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9237ffb4e986..6c341d89448c 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1816,6 +1816,8 @@ void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap); void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, u16 vht_mask[NL80211_VHT_NSS_MAX]); +enum nl80211_chan_width +ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index f8c69acfda48..3a9f0c0a2de8 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2922,7 +2922,8 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) rx->sta->sta.bandwidth = new_bw; sband = rx->local->hw.wiphy->bands[status->band]; - sta_opmode.bw = new_bw; + sta_opmode.bw = + ieee80211_sta_rx_bw_to_chan_width(rx->sta); sta_opmode.changed = STA_OPMODE_MAX_BW_CHANGED; rate_control_rate_update(local, sband, rx->sta, diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 5714dee76b12..259325cbcc31 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -358,6 +358,36 @@ enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta) return NL80211_CHAN_WIDTH_80; } +enum nl80211_chan_width +ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta) +{ + enum ieee80211_sta_rx_bandwidth cur_bw = sta->sta.bandwidth; + struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; + u32 cap_width; + + switch (cur_bw) { + case IEEE80211_STA_RX_BW_20: + if (!sta->sta.ht_cap.ht_supported) + return NL80211_CHAN_WIDTH_20_NOHT; + else + return NL80211_CHAN_WIDTH_20; + case IEEE80211_STA_RX_BW_40: + return NL80211_CHAN_WIDTH_40; + case IEEE80211_STA_RX_BW_80: + return NL80211_CHAN_WIDTH_80; + case IEEE80211_STA_RX_BW_160: + cap_width = + vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; + + if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ) + return NL80211_CHAN_WIDTH_160; + + return NL80211_CHAN_WIDTH_80P80; + default: + return NL80211_CHAN_WIDTH_20; + } +} + enum ieee80211_sta_rx_bandwidth ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) { @@ -484,7 +514,7 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, new_bw = ieee80211_sta_cur_vht_bw(sta); if (new_bw != sta->sta.bandwidth) { sta->sta.bandwidth = new_bw; - sta_opmode.bw = new_bw; + sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(sta); changed |= IEEE80211_RC_BW_CHANGED; sta_opmode.changed |= STA_OPMODE_MAX_BW_CHANGED; } -- cgit v1.2.3-70-g09d2 From 850964519a65a96aa8231ff31e28f30c2633484d Mon Sep 17 00:00:00 2001 From: Dmitry Lebed Date: Mon, 26 Mar 2018 16:36:31 +0300 Subject: cfg80211: fix CAC_STARTED event handling Exclude CAC_STARTED event from !wdev->cac_started check, since cac_started will be set later in the same function. Signed-off-by: Dmitry Lebed Signed-off-by: Johannes Berg --- net/wireless/mlme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 6b6818dd76bd..12b3edf70a7b 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -872,7 +872,7 @@ void cfg80211_cac_event(struct net_device *netdev, trace_cfg80211_cac_event(netdev, event); - if (WARN_ON(!wdev->cac_started)) + if (WARN_ON(!wdev->cac_started && event != NL80211_RADAR_CAC_STARTED)) return; if (WARN_ON(!wdev->chandef.chan)) -- cgit v1.2.3-70-g09d2 From 2c390e44e46675da0e7bba5c3b921a091a60ec2c Mon Sep 17 00:00:00 2001 From: Dmitry Lebed Date: Mon, 26 Mar 2018 16:36:32 +0300 Subject: cfg80211: enable use of non-cleared DFS channels for DFS offload Currently channel switch/start_ap to DFS channel cannot be done to non-CAC-cleared channel even if DFS offload if enabled. Make non-cleared DFS channels available if DFS offload is enabled. CAC will be started by HW after channel change, start_ap call, etc. Signed-off-by: Dmitry Lebed Signed-off-by: Johannes Berg --- net/wireless/chan.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/chan.c b/net/wireless/chan.c index a48859982a32..2db713d18f71 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -579,6 +579,10 @@ static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy, { struct ieee80211_channel *c; u32 freq, start_freq, end_freq; + bool dfs_offload; + + dfs_offload = wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_DFS_OFFLOAD); start_freq = cfg80211_get_start_freq(center_freq, bandwidth); end_freq = cfg80211_get_end_freq(center_freq, bandwidth); @@ -596,8 +600,9 @@ static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy, if (c->flags & IEEE80211_CHAN_DISABLED) return false; - if ((c->flags & IEEE80211_CHAN_RADAR) && - (c->dfs_state != NL80211_DFS_AVAILABLE)) + if ((c->flags & IEEE80211_CHAN_RADAR) && + (c->dfs_state != NL80211_DFS_AVAILABLE) && + !(c->dfs_state == NL80211_DFS_USABLE && dfs_offload)) return false; } -- cgit v1.2.3-70-g09d2 From 37b1c004685a3cea420dd96aa3803da627359f60 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:44 -0500 Subject: cfg80211: Support all iftypes in autodisconnect_wk Currently autodisconnect_wk assumes that only interface types of P2P_CLIENT and STATION use conn_owner_nlportid. Change this so all interface types are supported. Signed-off-by: Denis Kenzior Signed-off-by: Johannes Berg --- net/wireless/sme.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 701cfd7acc1b..5df6b33db786 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -1239,17 +1239,38 @@ void cfg80211_autodisconnect_wk(struct work_struct *work) wdev_lock(wdev); if (wdev->conn_owner_nlportid) { - /* - * Use disconnect_bssid if still connecting and ops->disconnect - * not implemented. Otherwise we can use cfg80211_disconnect. - */ - if (rdev->ops->disconnect || wdev->current_bss) - cfg80211_disconnect(rdev, wdev->netdev, - WLAN_REASON_DEAUTH_LEAVING, true); - else - cfg80211_mlme_deauth(rdev, wdev->netdev, - wdev->disconnect_bssid, NULL, 0, - WLAN_REASON_DEAUTH_LEAVING, false); + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + cfg80211_leave_ibss(rdev, wdev->netdev, false); + break; + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + cfg80211_stop_ap(rdev, wdev->netdev, false); + break; + case NL80211_IFTYPE_MESH_POINT: + cfg80211_leave_mesh(rdev, wdev->netdev); + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + /* + * Use disconnect_bssid if still connecting and + * ops->disconnect not implemented. Otherwise we can + * use cfg80211_disconnect. + */ + if (rdev->ops->disconnect || wdev->current_bss) + cfg80211_disconnect(rdev, wdev->netdev, + WLAN_REASON_DEAUTH_LEAVING, + true); + else + cfg80211_mlme_deauth(rdev, wdev->netdev, + wdev->disconnect_bssid, + NULL, 0, + WLAN_REASON_DEAUTH_LEAVING, + false); + break; + default: + break; + } } wdev_unlock(wdev); -- cgit v1.2.3-70-g09d2 From f8d16d3edb4dbae080df04318423c360de3c594d Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:45 -0500 Subject: nl80211: Add SOCKET_OWNER support to JOIN_IBSS Signed-off-by: Denis Kenzior [johannes: fix race with wdev lock/unlock by just acquiring once] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/core.h | 8 ++++---- net/wireless/ibss.c | 27 ++++++--------------------- net/wireless/nl80211.c | 7 ++++++- 4 files changed, 18 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 60fefc5a2ea4..266b43c4f6e1 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1962,6 +1962,8 @@ enum nl80211_commands { * multicast group. * If set during %NL80211_CMD_ASSOCIATE or %NL80211_CMD_CONNECT the * station will deauthenticate when the socket is closed. + * If set during %NL80211_CMD_JOIN_IBSS the IBSS will be automatically + * torn down when the socket is closed. * * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator. diff --git a/net/wireless/core.h b/net/wireless/core.h index eaff636169c2..b5cf3ea8d4df 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -282,10 +282,10 @@ void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs); /* IBSS */ -int cfg80211_join_ibss(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct cfg80211_ibss_params *params, - struct cfg80211_cached_keys *connkeys); +int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ibss_params *params, + struct cfg80211_cached_keys *connkeys); void cfg80211_clear_ibss(struct net_device *dev, bool nowext); int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, bool nowext); diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index a1d10993d08a..d1743e6abc34 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -84,14 +84,15 @@ void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid, } EXPORT_SYMBOL(cfg80211_ibss_joined); -static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct cfg80211_ibss_params *params, - struct cfg80211_cached_keys *connkeys) +int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ibss_params *params, + struct cfg80211_cached_keys *connkeys) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; + ASSERT_RTNL(); ASSERT_WDEV_LOCK(wdev); if (wdev->ssid_len) @@ -146,23 +147,6 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, return 0; } -int cfg80211_join_ibss(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct cfg80211_ibss_params *params, - struct cfg80211_cached_keys *connkeys) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - int err; - - ASSERT_RTNL(); - - wdev_lock(wdev); - err = __cfg80211_join_ibss(rdev, dev, params, connkeys); - wdev_unlock(wdev); - - return err; -} - static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) { struct wireless_dev *wdev = dev->ieee80211_ptr; @@ -224,6 +208,7 @@ int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev, if (err) return err; + wdev->conn_owner_nlportid = 0; __cfg80211_clear_ibss(dev, nowext); return 0; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fe27ab443d01..13f7c002f562 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8679,9 +8679,14 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) ibss.userspace_handles_dfs = nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]); - err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys); + wdev_lock(dev->ieee80211_ptr); + err = __cfg80211_join_ibss(rdev, dev, &ibss, connkeys); if (err) kzfree(connkeys); + else if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) + dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid; + wdev_unlock(dev->ieee80211_ptr); + return err; } -- cgit v1.2.3-70-g09d2 From 188c1b3c04d69e842122daf201f07a34fcfad039 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:46 -0500 Subject: nl80211: Add SOCKET_OWNER support to JOIN_MESH Signed-off-by: Denis Kenzior [johannes: fix race with wdev lock/unlock by just acquiring once] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/core.h | 4 ---- net/wireless/mesh.c | 16 +--------------- net/wireless/nl80211.c | 10 ++++++++-- 4 files changed, 11 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 266b43c4f6e1..98eb37def37d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1964,6 +1964,8 @@ enum nl80211_commands { * station will deauthenticate when the socket is closed. * If set during %NL80211_CMD_JOIN_IBSS the IBSS will be automatically * torn down when the socket is closed. + * If set during %NL80211_CMD_JOIN_MESH the mesh setup will be + * automatically torn down when the socket is closed. * * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator. diff --git a/net/wireless/core.h b/net/wireless/core.h index b5cf3ea8d4df..63eb1b5fdd04 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -303,10 +303,6 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev, struct mesh_setup *setup, const struct mesh_config *conf); -int cfg80211_join_mesh(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct mesh_setup *setup, - const struct mesh_config *conf); int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev); int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index b12da6ef3c12..eac5aa1419fc 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -217,21 +217,6 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, return err; } -int cfg80211_join_mesh(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct mesh_setup *setup, - const struct mesh_config *conf) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - int err; - - wdev_lock(wdev); - err = __cfg80211_join_mesh(rdev, dev, setup, conf); - wdev_unlock(wdev); - - return err; -} - int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_chan_def *chandef) @@ -286,6 +271,7 @@ int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, err = rdev_leave_mesh(rdev, dev); if (!err) { + wdev->conn_owner_nlportid = 0; wdev->mesh_id_len = 0; wdev->beacon_interval = 0; memset(&wdev->chandef, 0, sizeof(wdev->chandef)); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 13f7c002f562..1d6e81e5b2c8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10092,7 +10092,7 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) if (err) return err; } else { - /* cfg80211_join_mesh() will sort it out */ + /* __cfg80211_join_mesh() will sort it out */ setup.chandef.chan = NULL; } @@ -10130,7 +10130,13 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) setup.userspace_handles_dfs = nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]); - return cfg80211_join_mesh(rdev, dev, &setup, &cfg); + wdev_lock(dev->ieee80211_ptr); + err = __cfg80211_join_mesh(rdev, dev, &setup, &cfg); + if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) + dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid; + wdev_unlock(dev->ieee80211_ptr); + + return err; } static int nl80211_leave_mesh(struct sk_buff *skb, struct genl_info *info) -- cgit v1.2.3-70-g09d2 From 466a306142c002b40deaa58da94741af4153d1c4 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:47 -0500 Subject: nl80211: Add SOCKET_OWNER support to START_AP Signed-off-by: Denis Kenzior Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ net/wireless/ap.c | 1 + net/wireless/nl80211.c | 3 +++ 3 files changed, 6 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 98eb37def37d..9ea3d6039eca 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1966,6 +1966,8 @@ enum nl80211_commands { * torn down when the socket is closed. * If set during %NL80211_CMD_JOIN_MESH the mesh setup will be * automatically torn down when the socket is closed. + * If set during %NL80211_CMD_START_AP the AP will be automatically + * disabled when the socket is closed. * * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator. diff --git a/net/wireless/ap.c b/net/wireless/ap.c index 63682176c96c..882d97bdc6bf 100644 --- a/net/wireless/ap.c +++ b/net/wireless/ap.c @@ -27,6 +27,7 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, err = rdev_stop_ap(rdev, dev); if (!err) { + wdev->conn_owner_nlportid = 0; wdev->beacon_interval = 0; memset(&wdev->chandef, 0, sizeof(wdev->chandef)); wdev->ssid_len = 0; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1d6e81e5b2c8..cfaf2aeb9783 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4134,6 +4134,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) wdev->chandef = params.chandef; wdev->ssid_len = params.ssid_len; memcpy(wdev->ssid, params.ssid, wdev->ssid_len); + + if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) + wdev->conn_owner_nlportid = info->snd_portid; } wdev_unlock(wdev); -- cgit v1.2.3-70-g09d2 From 230ebaa189af44d50dccb4a1846e39ca594e347b Mon Sep 17 00:00:00 2001 From: Haim Dreyfuss Date: Wed, 28 Mar 2018 13:24:09 +0300 Subject: cfg80211: read wmm rules from regulatory database ETSI EN 301 893 v2.1.1 (2017-05) standard defines a new channel access mechanism that all devices (WLAN and LAA) need to comply with. The regulatory database can now be loaded into the kernel and also has the option to load optional data. In order to be able to comply with ETSI standard, we add wmm_rule into regulatory rule and add the option to read its value from the regulatory database. Signed-off-by: Haim Dreyfuss Signed-off-by: Luca Coelho [johannes: fix memory leak in error path] Signed-off-by: Johannes Berg --- include/net/regulatory.h | 28 +++++++++ net/wireless/reg.c | 148 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 169 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/regulatory.h b/include/net/regulatory.h index f83cacce3308..60f8cc86a447 100644 --- a/include/net/regulatory.h +++ b/include/net/regulatory.h @@ -4,6 +4,7 @@ * regulatory support structures * * Copyright 2008-2009 Luis R. Rodriguez + * Copyright (C) 2018 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -188,9 +189,35 @@ struct ieee80211_power_rule { u32 max_eirp; }; +/** + * struct ieee80211_wmm_ac - used to store per ac wmm regulatory limitation + * + * The information provided in this structure is required for QoS + * transmit queue configuration. Cf. IEEE 802.11 7.3.2.29. + * + * @cw_min: minimum contention window [a value of the form + * 2^n-1 in the range 1..32767] + * @cw_max: maximum contention window [like @cw_min] + * @cot: maximum burst time in units of 32 usecs, 0 meaning disabled + * @aifsn: arbitration interframe space [0..255] + * + */ +struct ieee80211_wmm_ac { + u16 cw_min; + u16 cw_max; + u16 cot; + u8 aifsn; +}; + +struct ieee80211_wmm_rule { + struct ieee80211_wmm_ac client[IEEE80211_NUM_ACS]; + struct ieee80211_wmm_ac ap[IEEE80211_NUM_ACS]; +}; + struct ieee80211_reg_rule { struct ieee80211_freq_range freq_range; struct ieee80211_power_rule power_rule; + struct ieee80211_wmm_rule *wmm_rule; u32 flags; u32 dfs_cac_ms; }; @@ -198,6 +225,7 @@ struct ieee80211_reg_rule { struct ieee80211_regdomain { struct rcu_head rcu_head; u32 n_reg_rules; + u32 n_wmm_rules; char alpha2[3]; enum nl80211_dfs_regions dfs_region; struct ieee80211_reg_rule reg_rules[]; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 7b42f0bacfd8..eddc834f6358 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,6 +5,7 @@ * Copyright 2008-2011 Luis R. Rodriguez * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -424,23 +425,36 @@ static const struct ieee80211_regdomain * reg_copy_regd(const struct ieee80211_regdomain *src_regd) { struct ieee80211_regdomain *regd; - int size_of_regd; + int size_of_regd, size_of_wmms; unsigned int i; + struct ieee80211_wmm_rule *d_wmm, *s_wmm; size_of_regd = sizeof(struct ieee80211_regdomain) + src_regd->n_reg_rules * sizeof(struct ieee80211_reg_rule); + size_of_wmms = src_regd->n_wmm_rules * + sizeof(struct ieee80211_wmm_rule); - regd = kzalloc(size_of_regd, GFP_KERNEL); + regd = kzalloc(size_of_regd + size_of_wmms, GFP_KERNEL); if (!regd) return ERR_PTR(-ENOMEM); memcpy(regd, src_regd, sizeof(struct ieee80211_regdomain)); - for (i = 0; i < src_regd->n_reg_rules; i++) + d_wmm = (struct ieee80211_wmm_rule *)((u8 *)regd + size_of_regd); + s_wmm = (struct ieee80211_wmm_rule *)((u8 *)src_regd + size_of_regd); + memcpy(d_wmm, s_wmm, size_of_wmms); + + for (i = 0; i < src_regd->n_reg_rules; i++) { memcpy(®d->reg_rules[i], &src_regd->reg_rules[i], sizeof(struct ieee80211_reg_rule)); + if (!src_regd->reg_rules[i].wmm_rule) + continue; + regd->reg_rules[i].wmm_rule = d_wmm + + (src_regd->reg_rules[i].wmm_rule - s_wmm) / + sizeof(struct ieee80211_wmm_rule); + } return regd; } @@ -595,6 +609,17 @@ enum fwdb_flags { FWDB_FLAG_AUTO_BW = BIT(4), }; +struct fwdb_wmm_ac { + u8 ecw; + u8 aifsn; + __be16 cot; +} __packed; + +struct fwdb_wmm_rule { + struct fwdb_wmm_ac client[IEEE80211_NUM_ACS]; + struct fwdb_wmm_ac ap[IEEE80211_NUM_ACS]; +} __packed; + struct fwdb_rule { u8 len; u8 flags; @@ -602,6 +627,7 @@ struct fwdb_rule { __be32 start, end, max_bw; /* start of optional data */ __be16 cac_timeout; + __be16 wmm_ptr; } __packed __aligned(4); #define FWDB_MAGIC 0x52474442 @@ -613,6 +639,31 @@ struct fwdb_header { struct fwdb_country country[]; } __packed __aligned(4); +static int ecw2cw(int ecw) +{ + return (1 << ecw) - 1; +} + +static bool valid_wmm(struct fwdb_wmm_rule *rule) +{ + struct fwdb_wmm_ac *ac = (struct fwdb_wmm_ac *)rule; + int i; + + for (i = 0; i < IEEE80211_NUM_ACS * 2; i++) { + u16 cw_min = ecw2cw((ac[i].ecw & 0xf0) >> 4); + u16 cw_max = ecw2cw(ac[i].ecw & 0x0f); + u8 aifsn = ac[i].aifsn; + + if (cw_min >= cw_max) + return false; + + if (aifsn < 1) + return false; + } + + return true; +} + static bool valid_rule(const u8 *data, unsigned int size, u16 rule_ptr) { struct fwdb_rule *rule = (void *)(data + (rule_ptr << 2)); @@ -623,7 +674,18 @@ static bool valid_rule(const u8 *data, unsigned int size, u16 rule_ptr) /* mandatory fields */ if (rule->len < offsetofend(struct fwdb_rule, max_bw)) return false; + if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) { + u32 wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2; + struct fwdb_wmm_rule *wmm; + if (wmm_ptr + sizeof(struct fwdb_wmm_rule) > size) + return false; + + wmm = (void *)(data + wmm_ptr); + + if (!valid_wmm(wmm)) + return false; + } return true; } @@ -798,23 +860,64 @@ static bool valid_regdb(const u8 *data, unsigned int size) return true; } +static void set_wmm_rule(struct ieee80211_wmm_rule *rule, + struct fwdb_wmm_rule *wmm) +{ + unsigned int i; + + for (i = 0; i < IEEE80211_NUM_ACS; i++) { + rule->client[i].cw_min = + ecw2cw((wmm->client[i].ecw & 0xf0) >> 4); + rule->client[i].cw_max = ecw2cw(wmm->client[i].ecw & 0x0f); + rule->client[i].aifsn = wmm->client[i].aifsn; + rule->client[i].cot = 1000 * be16_to_cpu(wmm->client[i].cot); + rule->ap[i].cw_min = ecw2cw((wmm->ap[i].ecw & 0xf0) >> 4); + rule->ap[i].cw_max = ecw2cw(wmm->ap[i].ecw & 0x0f); + rule->ap[i].aifsn = wmm->ap[i].aifsn; + rule->ap[i].cot = 1000 * be16_to_cpu(wmm->ap[i].cot); + } +} + +struct wmm_ptrs { + struct ieee80211_wmm_rule *rule; + u32 ptr; +}; + +static struct ieee80211_wmm_rule *find_wmm_ptr(struct wmm_ptrs *wmm_ptrs, + u32 wmm_ptr, int n_wmms) +{ + int i; + + for (i = 0; i < n_wmms; i++) { + if (wmm_ptrs[i].ptr == wmm_ptr) + return wmm_ptrs[i].rule; + } + return NULL; +} + static int regdb_query_country(const struct fwdb_header *db, const struct fwdb_country *country) { unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; struct fwdb_collection *coll = (void *)((u8 *)db + ptr); struct ieee80211_regdomain *regdom; - unsigned int size_of_regd; - unsigned int i; + struct ieee80211_regdomain *tmp_rd; + unsigned int size_of_regd, i, n_wmms = 0; + struct wmm_ptrs *wmm_ptrs; - size_of_regd = - sizeof(struct ieee80211_regdomain) + + size_of_regd = sizeof(struct ieee80211_regdomain) + coll->n_rules * sizeof(struct ieee80211_reg_rule); regdom = kzalloc(size_of_regd, GFP_KERNEL); if (!regdom) return -ENOMEM; + wmm_ptrs = kcalloc(coll->n_rules, sizeof(*wmm_ptrs), GFP_KERNEL); + if (!wmm_ptrs) { + kfree(regdom); + return -ENOMEM; + } + regdom->n_reg_rules = coll->n_rules; regdom->alpha2[0] = country->alpha2[0]; regdom->alpha2[1] = country->alpha2[1]; @@ -851,7 +954,38 @@ static int regdb_query_country(const struct fwdb_header *db, if (rule->len >= offsetofend(struct fwdb_rule, cac_timeout)) rrule->dfs_cac_ms = 1000 * be16_to_cpu(rule->cac_timeout); + if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) { + u32 wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2; + struct ieee80211_wmm_rule *wmm_pos = + find_wmm_ptr(wmm_ptrs, wmm_ptr, n_wmms); + struct fwdb_wmm_rule *wmm; + struct ieee80211_wmm_rule *wmm_rule; + + if (wmm_pos) { + rrule->wmm_rule = wmm_pos; + continue; + } + wmm = (void *)((u8 *)db + wmm_ptr); + tmp_rd = krealloc(regdom, size_of_regd + (n_wmms + 1) * + sizeof(struct ieee80211_wmm_rule), + GFP_KERNEL); + + if (!tmp_rd) { + kfree(regdom); + return -ENOMEM; + } + regdom = tmp_rd; + + wmm_rule = (struct ieee80211_wmm_rule *) + ((u8 *)regdom + size_of_regd + n_wmms * + sizeof(struct ieee80211_wmm_rule)); + + set_wmm_rule(wmm_rule, wmm); + wmm_ptrs[n_wmms].ptr = wmm_ptr; + wmm_ptrs[n_wmms++].rule = wmm_rule; + } } + kfree(wmm_ptrs); return reg_schedule_apply(regdom); } -- cgit v1.2.3-70-g09d2 From 5bf16a11ba2940c67d0b3a2154813bb42e8c6281 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 27 Feb 2018 11:22:15 +0100 Subject: cfg80211: don't require RTNL held for regdomain reads The whole code is set up to allow RCU reads of this data, but then uses rtnl_dereference() which requires the RTNL. Convert it to rcu_dereference_rtnl() which makes it require only RCU or the RTNL, to allow RCU-protected reading of the data. Reviewed-by: Coelho, Luciano Signed-off-by: Johannes Berg --- net/wireless/reg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index eddc834f6358..e352a0d1c438 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -135,12 +135,12 @@ static void restore_regulatory_settings(bool reset_user); static const struct ieee80211_regdomain *get_cfg80211_regdom(void) { - return rtnl_dereference(cfg80211_regdomain); + return rcu_dereference_rtnl(cfg80211_regdomain); } const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy) { - return rtnl_dereference(wiphy->regd); + return rcu_dereference_rtnl(wiphy->regd); } static const char *reg_dfs_region_str(enum nl80211_dfs_regions dfs_region) -- cgit v1.2.3-70-g09d2 From e552af058148498c8a0874edf6b022caea9cb2b7 Mon Sep 17 00:00:00 2001 From: Haim Dreyfuss Date: Wed, 28 Mar 2018 13:24:10 +0300 Subject: mac80211: limit wmm params to comply with ETSI requirements ETSI has recently added new requirements that restrict the WMM parameter values for 5GHz frequencies. We need to take care of the following scenarios in order to comply with these new requirements: 1. When using mac80211 default values; 2. When the userspace tries to configure its own values; 3. When associating to an AP which advertises WWM IE. When associating to an AP, the client uses the values in the advertised WMM IE. But the AP may not comply with the new ETSI requirements, so the client needs to check the current regulatory rules and use those limits accordingly. Signed-off-by: Haim Dreyfuss Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 3 +++ net/mac80211/ieee80211_i.h | 4 ++++ net/mac80211/mlme.c | 1 + net/mac80211/util.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 5c4b105ca398..36d128ebbac8 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -4,6 +4,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This file is GPLv2 as found in COPYING. */ @@ -2156,6 +2157,8 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy, */ p.uapsd = false; + ieee80211_regulatory_limit_wmm_params(sdata, &p, params->ac); + sdata->tx_conf[params->ac] = p; if (drv_conf_tx(local, sdata, params->ac, &p)) { wiphy_debug(local->hw.wiphy, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 6c341d89448c..5b3d78362fb5 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -4,6 +4,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg * Copyright 2013-2015 Intel Mobile Communications GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -1869,6 +1870,9 @@ extern const void *const mac80211_wiphy_privid; /* for wiphy privid */ int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble, int shift); +void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, + struct ieee80211_tx_queue_params *qparam, + int ac); void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, bool bss_notify, bool enable_qos); void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index ea3ba03fb952..a3bcf953d423 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1792,6 +1792,7 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, params[ac].cw_min, params[ac].cw_max, aci); return false; } + ieee80211_regulatory_limit_wmm_params(sdata, ¶ms[ac], ac); } for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 55cd2922627a..11f9cfc016d9 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -5,6 +5,7 @@ * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -1113,6 +1114,48 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, return crc; } +void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, + struct ieee80211_tx_queue_params + *qparam, int ac) +{ + struct ieee80211_chanctx_conf *chanctx_conf; + const struct ieee80211_reg_rule *rrule; + struct ieee80211_wmm_ac *wmm_ac; + u16 center_freq = 0; + + if (sdata->vif.type != NL80211_IFTYPE_AP && + sdata->vif.type != NL80211_IFTYPE_STATION) + return; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + if (chanctx_conf) + center_freq = chanctx_conf->def.chan->center_freq; + + if (!center_freq) { + rcu_read_unlock(); + return; + } + + rrule = freq_reg_info(sdata->wdev.wiphy, MHZ_TO_KHZ(center_freq)); + + if (IS_ERR_OR_NULL(rrule) || !rrule->wmm_rule) { + rcu_read_unlock(); + return; + } + + if (sdata->vif.type == NL80211_IFTYPE_AP) + wmm_ac = &rrule->wmm_rule->ap[ac]; + else + wmm_ac = &rrule->wmm_rule->client[ac]; + qparam->cw_min = max_t(u16, qparam->cw_min, wmm_ac->cw_min); + qparam->cw_max = max_t(u16, qparam->cw_max, wmm_ac->cw_max); + qparam->aifs = max_t(u8, qparam->aifs, wmm_ac->aifsn); + qparam->txop = !qparam->txop ? wmm_ac->cot / 32 : + min_t(u16, qparam->txop, wmm_ac->cot / 32); + rcu_read_unlock(); +} + void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, bool bss_notify, bool enable_qos) { @@ -1206,6 +1249,7 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, break; } } + ieee80211_regulatory_limit_wmm_params(sdata, &qparam, ac); qparam.uapsd = false; -- cgit v1.2.3-70-g09d2 From 19d3577e35e0cbb42694811b096e749a0f89a824 Mon Sep 17 00:00:00 2001 From: Haim Dreyfuss Date: Wed, 28 Mar 2018 13:24:11 +0300 Subject: cfg80211: Add API to allow querying regdb for wmm_rule In general regulatory self managed devices maintain their own regulatory profiles thus it doesn't have to query the regulatory database on country change. ETSI has recently introduced a new channel access mechanism for 5GHz that all wlan devices need to comply with. These values are stored in the regulatory database. There are self managed devices which can't maintain these values on their own. Add API to allow self managed regulatory devices to query the regulatory database for high band wmm rule. Signed-off-by: Haim Dreyfuss Signed-off-by: Luca Coelho [johannes: fix documentation] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 28 ++++++++++++++++++++++++++ net/wireless/reg.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4341508bc6a4..bfe174896fcf 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6,6 +6,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH + * Copyright (C) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -4657,6 +4658,33 @@ const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, */ const char *reg_initiator_name(enum nl80211_reg_initiator initiator); +/** + * DOC: Internal regulatory db functions + * + */ + +/** + * reg_query_regdb_wmm - Query internal regulatory db for wmm rule + * Regulatory self-managed driver can use it to proactively + * + * @alpha2: the ISO/IEC 3166 alpha2 wmm rule to be queried. + * @freq: the freqency(in MHz) to be queried. + * @ptr: pointer where the regdb wmm data is to be stored (or %NULL if + * irrelevant). This can be used later for deduplication. + * @rule: pointer to store the wmm rule from the regulatory db. + * + * Self-managed wireless drivers can use this function to query + * the internal regulatory database to check whether the given + * ISO/IEC 3166 alpha2 country and freq have wmm rule limitations. + * + * Drivers should check the return value, its possible you can get + * an -ENODATA. + * + * Return: 0 on success. -ENODATA. + */ +int reg_query_regdb_wmm(char *alpha2, int freq, u32 *ptr, + struct ieee80211_wmm_rule *rule); + /* * callbacks for asynchronous cfg80211 methods, notification * functions and BSS handling helpers diff --git a/net/wireless/reg.c b/net/wireless/reg.c index e352a0d1c438..16c7e4ef5820 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -878,6 +878,60 @@ static void set_wmm_rule(struct ieee80211_wmm_rule *rule, } } +static int __regdb_query_wmm(const struct fwdb_header *db, + const struct fwdb_country *country, int freq, + u32 *dbptr, struct ieee80211_wmm_rule *rule) +{ + unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2; + struct fwdb_collection *coll = (void *)((u8 *)db + ptr); + int i; + + for (i = 0; i < coll->n_rules; i++) { + __be16 *rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2)); + unsigned int rule_ptr = be16_to_cpu(rules_ptr[i]) << 2; + struct fwdb_rule *rrule = (void *)((u8 *)db + rule_ptr); + struct fwdb_wmm_rule *wmm; + unsigned int wmm_ptr; + + if (rrule->len < offsetofend(struct fwdb_rule, wmm_ptr)) + continue; + + if (freq >= KHZ_TO_MHZ(be32_to_cpu(rrule->start)) && + freq <= KHZ_TO_MHZ(be32_to_cpu(rrule->end))) { + wmm_ptr = be16_to_cpu(rrule->wmm_ptr) << 2; + wmm = (void *)((u8 *)db + wmm_ptr); + set_wmm_rule(rule, wmm); + if (dbptr) + *dbptr = wmm_ptr; + return 0; + } + } + + return -ENODATA; +} + +int reg_query_regdb_wmm(char *alpha2, int freq, u32 *dbptr, + struct ieee80211_wmm_rule *rule) +{ + const struct fwdb_header *hdr = regdb; + const struct fwdb_country *country; + + if (IS_ERR(regdb)) + return PTR_ERR(regdb); + + country = &hdr->country[0]; + while (country->coll_ptr) { + if (alpha2_equal(alpha2, country->alpha2)) + return __regdb_query_wmm(regdb, country, freq, dbptr, + rule); + + country++; + } + + return -ENODATA; +} +EXPORT_SYMBOL(reg_query_regdb_wmm); + struct wmm_ptrs { struct ieee80211_wmm_rule *rule; u32 ptr; -- cgit v1.2.3-70-g09d2 From db3bdcb9c3ffc628c5284d7ed03a704295ba1214 Mon Sep 17 00:00:00 2001 From: Manikanta Pubbisetty Date: Wed, 28 Mar 2018 18:34:19 +0530 Subject: mac80211: allow AP_VLAN operation on crypto controlled devices In the current implementation, mac80211 advertises the support of AP_VLANs based on the driver's support for AP mode; it also blocks encrypted AP_VLAN operation on devices advertising SW_CRYPTO_CONTROL. The implementation seems weird in it's current form and could be often confusing, this is because there can be drivers advertising both SW_CRYPTO_CONTROL and AP mode support (ex: ath10k) in which case AP_VLAN will still be supported but only in open BSS and not in secured BSS. When SW_CRYPTO_CONTROL is enabled, it makes more sense if the decision to support AP_VLANs is left to the driver. Mac80211 can then allow AP_VLAN operations depending on the driver support. Signed-off-by: Manikanta Pubbisetty Signed-off-by: Johannes Berg --- net/mac80211/key.c | 8 +++++--- net/mac80211/main.c | 8 ++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/key.c b/net/mac80211/key.c index aee05ec3f7ea..ee0d0cc8dc3b 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -126,7 +126,7 @@ static void decrease_tailroom_need_count(struct ieee80211_sub_if_data *sdata, static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) { - struct ieee80211_sub_if_data *sdata; + struct ieee80211_sub_if_data *sdata = key->sdata; struct sta_info *sta; int ret = -EOPNOTSUPP; @@ -162,7 +162,6 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) if (sta && !sta->uploaded) goto out_unsupported; - sdata = key->sdata; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { /* * The driver doesn't know anything about VLAN interfaces. @@ -214,8 +213,11 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) /* all of these we can do in software - if driver can */ if (ret == 1) return 0; - if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) + if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) { + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + return 0; return -EINVAL; + } return 0; default: return -EINVAL; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 0785d04a80bc..8d0333b5355b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -930,8 +930,12 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) IEEE80211_HT_CAP_SM_PS_SHIFT; } - /* if low-level driver supports AP, we also support VLAN */ - if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_AP)) { + /* if low-level driver supports AP, we also support VLAN. + * drivers advertising SW_CRYPTO_CONTROL should enable AP_VLAN + * based on their support to transmit SW encrypted packets. + */ + if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_AP) && + !ieee80211_hw_check(&local->hw, SW_CRYPTO_CONTROL)) { hw->wiphy->interface_modes |= BIT(NL80211_IFTYPE_AP_VLAN); hw->wiphy->software_iftypes |= BIT(NL80211_IFTYPE_AP_VLAN); } -- cgit v1.2.3-70-g09d2 From 4d191c75365a0067a9d5b8c8746b1bd9310c5a70 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 29 Mar 2018 11:14:30 +0200 Subject: mac80211: remove shadowing duplicated variable We already have 'ifmgd' here, and it's already assigned to the same value, so remove the duplicate. Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a3bcf953d423..d2bc52046729 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2011,8 +2011,6 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, /* deauthenticate/disassociate now */ if (tx || frame_buf) { - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - /* * In multi channel scenarios guarantee that the virtual * interface is granted immediate airtime to transmit the -- cgit v1.2.3-70-g09d2 From 6a671a50f8199b3e1fe49fa8afff0fc8335da79c Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:41 -0500 Subject: nl80211: Add CMD_CONTROL_PORT_FRAME API This commit also adds cfg80211_rx_control_port function. This is used to generate a CMD_CONTROL_PORT_FRAME event out to userspace. The conn_owner_nlportid is used as the unicast destination. This means that userspace must specify NL80211_ATTR_SOCKET_OWNER flag if control port over nl80211 routing is requested in NL80211_CMD_CONNECT, NL80211_CMD_ASSOCIATE, NL80211_CMD_START_AP or IBSS/mesh join. Signed-off-by: Denis Kenzior [johannes: fix return value of cfg80211_rx_control_port()] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 22 +++++++++++++++++ include/uapi/linux/nl80211.h | 13 ++++++++++ net/wireless/nl80211.c | 58 ++++++++++++++++++++++++++++++++++++++++++++ net/wireless/trace.h | 21 ++++++++++++++++ 4 files changed, 114 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bfe174896fcf..df145f76adad 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5721,6 +5721,28 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, const u8 *buf, size_t len, bool ack, gfp_t gfp); +/** + * cfg80211_rx_control_port - notification about a received control port frame + * @dev: The device the frame matched to + * @buf: control port frame + * @len: length of the frame data + * @addr: The peer from which the frame was received + * @proto: frame protocol, typically PAE or Pre-authentication + * @unencrypted: Whether the frame was received unencrypted + * + * This function is used to inform userspace about a received control port + * frame. It should only be used if userspace indicated it wants to receive + * control port frames over nl80211. + * + * The frame is the data portion of the 802.3 or 802.11 data frame with all + * network layer headers removed (e.g. the raw EAPoL frame). + * + * Return: %true if the frame was passed to userspace + */ +bool cfg80211_rx_control_port(struct net_device *dev, + const u8 *buf, size_t len, + const u8 *addr, u16 proto, bool unencrypted); + /** * cfg80211_cqm_rssi_notify - connection quality monitoring rssi event * @dev: network device diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 9ea3d6039eca..6a3cc7a635b5 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -990,6 +990,17 @@ * &NL80211_CMD_CONNECT or &NL80211_CMD_ROAM. If the 4 way handshake failed * &NL80211_CMD_DISCONNECT should be indicated instead. * + * @NL80211_CMD_CONTROL_PORT_FRAME: Control Port (e.g. PAE) frame TX request + * and RX notification. This command is used both as a request to transmit + * a control port frame and as a notification that a control port frame + * has been received. %NL80211_ATTR_FRAME is used to specify the + * frame contents. The frame is the raw EAPoL data, without ethernet or + * 802.11 headers. + * When used as an event indication %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, + * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT and %NL80211_ATTR_MAC are added + * indicating the protocol type of the received frame; whether the frame + * was received unencrypted and the MAC address of the peer respectively. + * * @NL80211_CMD_RELOAD_REGDB: Request that the regdb firmware file is reloaded. * * @NL80211_CMD_EXTERNAL_AUTH: This interface is exclusively defined for host @@ -1228,6 +1239,8 @@ enum nl80211_commands { NL80211_CMD_STA_OPMODE_CHANGED, + NL80211_CMD_CONTROL_PORT_FRAME, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index cfaf2aeb9783..0870447fbd55 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -14553,6 +14553,64 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, } EXPORT_SYMBOL(cfg80211_mgmt_tx_status); +static int __nl80211_rx_control_port(struct net_device *dev, + const u8 *buf, size_t len, + const u8 *addr, u16 proto, + bool unencrypted, gfp_t gfp) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct sk_buff *msg; + void *hdr; + u32 nlportid = READ_ONCE(wdev->conn_owner_nlportid); + + if (!nlportid) + return -ENOENT; + + msg = nlmsg_new(100 + len, gfp); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_CONTROL_PORT_FRAME); + if (!hdr) { + nlmsg_free(msg); + return -ENOBUFS; + } + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put(msg, NL80211_ATTR_FRAME, len, buf) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || + nla_put_u16(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE, proto) || + (unencrypted && nla_put_flag(msg, + NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT))) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid); + + nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} + +bool cfg80211_rx_control_port(struct net_device *dev, + const u8 *buf, size_t len, + const u8 *addr, u16 proto, bool unencrypted) +{ + int ret; + + trace_cfg80211_rx_control_port(dev, buf, len, addr, proto, unencrypted); + ret = __nl80211_rx_control_port(dev, buf, len, addr, proto, + unencrypted, GFP_ATOMIC); + trace_cfg80211_return_bool(ret == 0); + return ret == 0; +} +EXPORT_SYMBOL(cfg80211_rx_control_port); + static struct sk_buff *cfg80211_prepare_cqm(struct net_device *dev, const char *mac, gfp_t gfp) { diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 5152938b358d..42fd338f879e 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2600,6 +2600,27 @@ TRACE_EVENT(cfg80211_mgmt_tx_status, WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack)) ); +TRACE_EVENT(cfg80211_rx_control_port, + TP_PROTO(struct net_device *netdev, const u8 *buf, size_t len, + const u8 *addr, u16 proto, bool unencrypted), + TP_ARGS(netdev, buf, len, addr, proto, unencrypted), + TP_STRUCT__entry( + NETDEV_ENTRY + MAC_ENTRY(addr) + __field(u16, proto) + __field(bool, unencrypted) + ), + TP_fast_assign( + NETDEV_ASSIGN; + MAC_ASSIGN(addr, addr); + __entry->proto = proto; + __entry->unencrypted = unencrypted; + ), + TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT " proto: 0x%x, unencrypted: %s", + NETDEV_PR_ARG, MAC_PR_ARG(addr), + __entry->proto, BOOL_TO_STR(__entry->unencrypted)) +); + TRACE_EVENT(cfg80211_cqm_rssi_notify, TP_PROTO(struct net_device *netdev, enum nl80211_cqm_rssi_threshold_event rssi_event, -- cgit v1.2.3-70-g09d2 From 2576a9ace47eba28a682d249d1d6402f891808c9 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:42 -0500 Subject: nl80211: Implement TX of control port frames This commit implements the TX side of NL80211_CMD_CONTROL_PORT_FRAME. Userspace provides the raw EAPoL frame using NL80211_ATTR_FRAME. Userspace should also provide the destination address and the protocol type to use when sending the frame. This is used to implement TX of Pre-authentication frames. If CONTROL_PORT_ETHERTYPE_NO_ENCRYPT is specified, then the driver will be asked not to encrypt the outgoing frame. A new EXT_FEATURE flag is introduced so that nl80211 code can check whether a given wiphy has capability to pass EAPoL frames over nl80211. Signed-off-by: Denis Kenzior Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 9 ++++++ include/uapi/linux/nl80211.h | 3 ++ net/wireless/nl80211.c | 71 +++++++++++++++++++++++++++++++++++++++++++- net/wireless/rdev-ops.h | 15 ++++++++++ net/wireless/trace.h | 26 ++++++++++++++++ 5 files changed, 123 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index df145f76adad..de2894a4ad10 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2961,6 +2961,9 @@ struct cfg80211_external_auth_params { * * @external_auth: indicates result of offloaded authentication processing from * user space + * + * @tx_control_port: TX a control port frame (EAPoL). The noencrypt parameter + * tells the driver that the frame should not be encrypted. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3256,6 +3259,12 @@ struct cfg80211_ops { const u8 *aa); int (*external_auth)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_external_auth_params *params); + + int (*tx_control_port)(struct wiphy *wiphy, + struct net_device *dev, + const u8 *buf, size_t len, + const u8 *dest, const __be16 proto, + const bool noencrypt); }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6a3cc7a635b5..3167d6f7fc68 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5024,6 +5024,8 @@ enum nl80211_feature_flags { * channel change triggered by radar detection event. * No need to start CAC from user-space, no need to react to * "radar detected" event. + * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211: Driver supports sending and + * receiving control port frames over nl80211 instead of the netdevice. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -5055,6 +5057,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_LOW_POWER_SCAN, NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN, NL80211_EXT_FEATURE_DFS_OFFLOAD, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0870447fbd55..6eb286784924 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12535,6 +12535,68 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info) return rdev_external_auth(rdev, dev, ¶ms); } +static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *buf; + size_t len; + u8 *dest; + u16 proto; + bool noencrypt; + int err; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211)) + return -EOPNOTSUPP; + + if (!rdev->ops->tx_control_port) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_FRAME] || + !info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) { + GENL_SET_ERR_MSG(info, "Frame, MAC or ethertype missing"); + return -EINVAL; + } + + wdev_lock(wdev); + + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + break; + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + if (wdev->current_bss) + break; + err = -ENOTCONN; + goto out; + default: + err = -EOPNOTSUPP; + goto out; + } + + wdev_unlock(wdev); + + buf = nla_data(info->attrs[NL80211_ATTR_FRAME]); + len = nla_len(info->attrs[NL80211_ATTR_FRAME]); + dest = nla_data(info->attrs[NL80211_ATTR_MAC]); + proto = nla_get_u16(info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]); + noencrypt = + nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]); + + return rdev_tx_control_port(rdev, dev, buf, len, + dest, cpu_to_be16(proto), noencrypt); + + out: + wdev_unlock(wdev); + return err; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -13438,7 +13500,14 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, - + { + .cmd = NL80211_CMD_CONTROL_PORT_FRAME, + .doit = nl80211_tx_control_port, + .policy = nl80211_policy, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; static struct genl_family nl80211_fam __ro_after_init = { diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 84f23ae015fc..87479a53411b 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -714,6 +714,21 @@ static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev, return ret; } +static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, + struct net_device *dev, + const void *buf, size_t len, + const u8 *dest, __be16 proto, + const bool noencrypt) +{ + int ret; + trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, + dest, proto, noencrypt); + ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, + dest, proto, noencrypt); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + static inline int rdev_mgmt_tx_cancel_wait(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 42fd338f879e..a64291ae52a6 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1882,6 +1882,32 @@ TRACE_EVENT(rdev_mgmt_tx, BOOL_TO_STR(__entry->dont_wait_for_ack)) ); +TRACE_EVENT(rdev_tx_control_port, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + const u8 *buf, size_t len, const u8 *dest, __be16 proto, + bool unencrypted), + TP_ARGS(wiphy, netdev, buf, len, dest, proto, unencrypted), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(dest) + __field(__be16, proto) + __field(bool, unencrypted) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(dest, dest); + __entry->proto = proto; + __entry->unencrypted = unencrypted; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT "," + " proto: 0x%x, unencrypted: %s", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest), + be16_to_cpu(__entry->proto), + BOOL_TO_STR(__entry->unencrypted)) +); + TRACE_EVENT(rdev_set_noack_map, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u16 noack_map), -- cgit v1.2.3-70-g09d2 From 64bf3d4bc2b0725b3c5ffadd982a9746bfc738b7 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:43 -0500 Subject: nl80211: Add CONTROL_PORT_OVER_NL80211 attribute Signed-off-by: Denis Kenzior Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 14 +++++++++++++- net/wireless/nl80211.c | 26 ++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index de2894a4ad10..0bd957b37208 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -647,6 +647,8 @@ struct survey_info { * allowed through even on unauthorized ports * @control_port_no_encrypt: TRUE to prevent encryption of control port * protocol frames. + * @control_port_over_nl80211: TRUE if userspace expects to exchange control + * port frames over NL80211 instead of the network interface. * @wep_keys: static WEP keys, if not NULL points to an array of * CFG80211_MAX_WEP_KEYS WEP keys * @wep_tx_key: key index (0..3) of the default TX static WEP key @@ -662,6 +664,7 @@ struct cfg80211_crypto_settings { bool control_port; __be16 control_port_ethertype; bool control_port_no_encrypt; + bool control_port_over_nl80211; struct key_params *wep_keys; int wep_tx_key; const u8 *psk; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 3167d6f7fc68..15daf5e2638d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -542,7 +542,8 @@ * IEs in %NL80211_ATTR_IE, %NL80211_ATTR_AUTH_TYPE, %NL80211_ATTR_USE_MFP, * %NL80211_ATTR_MAC, %NL80211_ATTR_WIPHY_FREQ, %NL80211_ATTR_CONTROL_PORT, * %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, - * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT, %NL80211_ATTR_MAC_HINT, and + * %NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT, + * %NL80211_ATTR_CONTROL_PORT_OVER_NL80211, %NL80211_ATTR_MAC_HINT, and * %NL80211_ATTR_WIPHY_FREQ_HINT. * If included, %NL80211_ATTR_MAC and %NL80211_ATTR_WIPHY_FREQ are * restrictions on BSS selection, i.e., they effectively prevent roaming @@ -1488,6 +1489,15 @@ enum nl80211_commands { * @NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT: When included along with * %NL80211_ATTR_CONTROL_PORT_ETHERTYPE, indicates that the custom * ethertype frames used for key negotiation must not be encrypted. + * @NL80211_ATTR_CONTROL_PORT_OVER_NL80211: A flag indicating whether control + * port frames (e.g. of type given in %NL80211_ATTR_CONTROL_PORT_ETHERTYPE) + * will be sent directly to the network interface or sent via the NL80211 + * socket. If this attribute is missing, then legacy behavior of sending + * control port frames directly to the network interface is used. If the + * flag is included, then control port frames are sent over NL80211 instead + * using %CMD_CONTROL_PORT_FRAME. If control port routing over NL80211 is + * to be used then userspace must also use the %NL80211_ATTR_SOCKET_OWNER + * flag. * * @NL80211_ATTR_TESTDATA: Testmode data blob, passed through to the driver. * We recommend using nested, driver-specific attributes within this. @@ -2647,6 +2657,8 @@ enum nl80211_attrs { NL80211_ATTR_NSS, NL80211_ATTR_ACK_SIGNAL, + NL80211_ATTR_CONTROL_PORT_OVER_NL80211, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 6eb286784924..d3b14d9d002a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -287,6 +287,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG }, [NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 }, [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG }, + [NL80211_ATTR_CONTROL_PORT_OVER_NL80211] = { .type = NLA_FLAG }, [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG }, [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 }, [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, @@ -8211,6 +8212,22 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) return err; } +static int validate_pae_over_nl80211(struct cfg80211_registered_device *rdev, + struct genl_info *info) +{ + if (!info->attrs[NL80211_ATTR_SOCKET_OWNER]) { + GENL_SET_ERR_MSG(info, "SOCKET_OWNER not set"); + return -EINVAL; + } + + if (!rdev->ops->tx_control_port || + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211)) + return -EOPNOTSUPP; + + return 0; +} + static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, struct genl_info *info, struct cfg80211_crypto_settings *settings, @@ -8234,6 +8251,15 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, } else settings->control_port_ethertype = cpu_to_be16(ETH_P_PAE); + if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) { + int r = validate_pae_over_nl80211(rdev, info); + + if (r < 0) + return r; + + settings->control_port_over_nl80211 = true; + } + if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) { void *data; int len, i; -- cgit v1.2.3-70-g09d2 From c3bfe1f6fc98e7185ff5ee9279ba259fe484597c Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:48 -0500 Subject: nl80211: Add control_port_over_nl80211 for ibss Signed-off-by: Denis Kenzior Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ net/wireless/nl80211.c | 9 +++++++++ 2 files changed, 12 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0bd957b37208..ed2773f8558e 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2034,6 +2034,8 @@ struct cfg80211_disassoc_request { * sets/clears %NL80211_STA_FLAG_AUTHORIZED. If true, the driver is * required to assume that the port is unauthorized until authorized by * user space. Otherwise, port is marked authorized by default. + * @control_port_over_nl80211: TRUE if userspace expects to exchange control + * port frames over NL80211 instead of the network interface. * @userspace_handles_dfs: whether user space controls DFS operation, i.e. * changes the channel when a radar is detected. This is required * to operate on DFS channels. @@ -2057,6 +2059,7 @@ struct cfg80211_ibss_params { bool channel_fixed; bool privacy; bool control_port; + bool control_port_over_nl80211; bool userspace_handles_dfs; int mcast_rate[NUM_NL80211_BANDS]; struct ieee80211_ht_cap ht_capa; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d3b14d9d002a..f8e10408f2b3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -8705,6 +8705,15 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) ibss.control_port = nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT]); + if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) { + int r = validate_pae_over_nl80211(rdev, info); + + if (r < 0) + return r; + + ibss.control_port_over_nl80211 = true; + } + ibss.userspace_handles_dfs = nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]); -- cgit v1.2.3-70-g09d2 From 1224f5831a22977f30c1842874be12c58608cee7 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:49 -0500 Subject: nl80211: Add control_port_over_nl80211 to mesh_setup Signed-off-by: Denis Kenzior Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ net/wireless/nl80211.c | 9 +++++++++ 2 files changed, 12 insertions(+) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ed2773f8558e..250dac390806 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1454,6 +1454,8 @@ struct mesh_config { * @userspace_handles_dfs: whether user space controls DFS operation, i.e. * changes the channel when a radar is detected. This is required * to operate on DFS channels. + * @control_port_over_nl80211: TRUE if userspace expects to exchange control + * port frames over NL80211 instead of the network interface. * * These parameters are fixed when the mesh is created. */ @@ -1476,6 +1478,7 @@ struct mesh_setup { u32 basic_rates; struct cfg80211_bitrate_mask beacon_rate; bool userspace_handles_dfs; + bool control_port_over_nl80211; }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f8e10408f2b3..ff28f8feeb09 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10168,6 +10168,15 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) setup.userspace_handles_dfs = nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]); + if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) { + int r = validate_pae_over_nl80211(rdev, info); + + if (r < 0) + return r; + + setup.control_port_over_nl80211 = true; + } + wdev_lock(dev->ieee80211_ptr); err = __cfg80211_join_mesh(rdev, dev, &setup, &cfg); if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER]) -- cgit v1.2.3-70-g09d2 From 911806491425d79107cadddbde11b42bbdfe38c8 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:50 -0500 Subject: mac80211: Add support for tx_control_port Signed-off-by: Denis Kenzior Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 1 + net/mac80211/ieee80211_i.h | 3 +++ net/mac80211/tx.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 36d128ebbac8..f6b8d59a7ee8 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3791,4 +3791,5 @@ const struct cfg80211_ops mac80211_config_ops = { .add_nan_func = ieee80211_add_nan_func, .del_nan_func = ieee80211_del_nan_func, .set_multicast_to_unicast = ieee80211_set_multicast_to_unicast, + .tx_control_port = ieee80211_tx_control_port, }; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 5b3d78362fb5..275d6269b4a7 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1735,6 +1735,9 @@ void ieee80211_check_fast_xmit(struct sta_info *sta); void ieee80211_check_fast_xmit_all(struct ieee80211_local *local); void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_xmit(struct sta_info *sta); +int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, + const u8 *buf, size_t len, + const u8 *dest, __be16 proto, bool unencrypted); /* HT */ void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 933c67b5f845..535de3161a78 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4757,3 +4757,49 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, ieee80211_xmit(sdata, NULL, skb); local_bh_enable(); } + +int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, + const u8 *buf, size_t len, + const u8 *dest, __be16 proto, bool unencrypted) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct sk_buff *skb; + struct ethhdr *ehdr; + u32 flags; + + /* Only accept CONTROL_PORT_PROTOCOL configured in CONNECT/ASSOCIATE + * or Pre-Authentication + */ + if (proto != sdata->control_port_protocol && + proto != cpu_to_be16(ETH_P_PREAUTH)) + return -EINVAL; + + if (unencrypted) + flags = IEEE80211_TX_INTFL_DONT_ENCRYPT; + else + flags = 0; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + + sizeof(struct ethhdr) + len); + if (!skb) + return -ENOMEM; + + skb_reserve(skb, local->hw.extra_tx_headroom + sizeof(struct ethhdr)); + + skb_put_data(skb, buf, len); + + ehdr = skb_push(skb, sizeof(struct ethhdr)); + memcpy(ehdr->h_dest, dest, ETH_ALEN); + memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN); + ehdr->h_proto = proto; + + skb->dev = dev; + skb->protocol = htons(ETH_P_802_3); + skb_reset_network_header(skb); + skb_reset_mac_header(skb); + + __ieee80211_subif_start_xmit(skb, skb->dev, flags); + + return 0; +} -- cgit v1.2.3-70-g09d2 From 018f6fbf540d7bd7223b7d0b29651c1dd5e1c606 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Mon, 26 Mar 2018 12:52:51 -0500 Subject: mac80211: Send control port frames over nl80211 If userspace requested control port frames to go over 80211, then do so. The control packets are intercepted just prior to delivery of the packet to the underlying network device. Pre-authentication type frames (protocol: 0x88c7) are also forwarded over nl80211. Signed-off-by: Denis Kenzior Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 6 ++++++ net/mac80211/ibss.c | 1 + net/mac80211/ieee80211_i.h | 1 + net/mac80211/iface.c | 2 ++ net/mac80211/main.c | 2 ++ net/mac80211/mlme.c | 2 ++ net/mac80211/rx.c | 33 ++++++++++++++++++++++++++++----- 7 files changed, 42 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f6b8d59a7ee8..85dbaa891059 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -926,6 +926,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, */ sdata->control_port_protocol = params->crypto.control_port_ethertype; sdata->control_port_no_encrypt = params->crypto.control_port_no_encrypt; + sdata->control_port_over_nl80211 = + params->crypto.control_port_over_nl80211; sdata->encrypt_headroom = ieee80211_cs_headroom(sdata->local, ¶ms->crypto, sdata->vif.type); @@ -935,6 +937,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, params->crypto.control_port_ethertype; vlan->control_port_no_encrypt = params->crypto.control_port_no_encrypt; + vlan->control_port_over_nl80211 = + params->crypto.control_port_over_nl80211; vlan->encrypt_headroom = ieee80211_cs_headroom(sdata->local, ¶ms->crypto, @@ -2020,6 +2024,8 @@ static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev, if (err) return err; + sdata->control_port_over_nl80211 = setup->control_port_over_nl80211; + /* can mesh use other SMPS modes? */ sdata->smps_mode = IEEE80211_SMPS_OFF; sdata->needed_rx_chains = sdata->local->rx_chains; diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index dc582aa35c89..6449a1c2283b 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -1844,6 +1844,7 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, sdata->smps_mode = IEEE80211_SMPS_OFF; sdata->needed_rx_chains = local->rx_chains; + sdata->control_port_over_nl80211 = params->control_port_over_nl80211; ieee80211_queue_work(&local->hw, &sdata->work); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 275d6269b4a7..6372dbdadf53 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -900,6 +900,7 @@ struct ieee80211_sub_if_data { u16 sequence_number; __be16 control_port_protocol; bool control_port_no_encrypt; + bool control_port_over_nl80211; int encrypt_headroom; atomic_t num_tx_queued; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index d13ba064951f..555e389b7dfa 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -519,6 +519,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) master->control_port_protocol; sdata->control_port_no_encrypt = master->control_port_no_encrypt; + sdata->control_port_over_nl80211 = + master->control_port_over_nl80211; sdata->vif.cab_queue = master->vif.cab_queue; memcpy(sdata->vif.hw_queue, master->vif.hw_queue, sizeof(sdata->vif.hw_queue)); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 8d0333b5355b..9ea17afaa237 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -554,6 +554,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, NL80211_FEATURE_USERSPACE_MPM | NL80211_FEATURE_FULL_AP_CLIENT_STATE; wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_FILS_STA); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); if (!ops->hw_scan) wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN | diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index d2bc52046729..20d2b186d740 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4855,6 +4855,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, sdata->control_port_protocol = req->crypto.control_port_ethertype; sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt; + sdata->control_port_over_nl80211 = + req->crypto.control_port_over_nl80211; sdata->encrypt_headroom = ieee80211_cs_headroom(local, &req->crypto, sdata->vif.type); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3a9f0c0a2de8..03102aff0953 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2245,6 +2245,32 @@ static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc) return true; } +static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb, + struct ieee80211_rx_data *rx) +{ + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct net_device *dev = sdata->dev; + + if (unlikely((skb->protocol == sdata->control_port_protocol || + skb->protocol == cpu_to_be16(ETH_P_PREAUTH)) && + sdata->control_port_over_nl80211)) { + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + bool noencrypt = status->flag & RX_FLAG_DECRYPTED; + struct ethhdr *ehdr = eth_hdr(skb); + + cfg80211_rx_control_port(dev, skb->data, skb->len, + ehdr->h_source, + be16_to_cpu(skb->protocol), noencrypt); + dev_kfree_skb(skb); + } else { + /* deliver to local stack */ + if (rx->napi) + napi_gro_receive(rx->napi, skb); + else + netif_receive_skb(skb); + } +} + /* * requires that rx->skb is a frame with ethernet header */ @@ -2329,13 +2355,10 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) #endif if (skb) { - /* deliver to local stack */ skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); - if (rx->napi) - napi_gro_receive(rx->napi, skb); - else - netif_receive_skb(skb); + + ieee80211_deliver_skb_to_local_stack(skb, rx); } if (xmit_skb) { -- cgit v1.2.3-70-g09d2 From c470bdc1aaf36669e04ba65faf1092b2d1c6cabe Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 26 Mar 2018 16:21:04 +0300 Subject: mac80211: don't WARN on bad WMM parameters from buggy APs Apparently, some APs are buggy enough to send a zeroed WMM IE. Don't WARN on this since this is not caused by a bug on the client's system. This aligns the condition of the WARNING in drv_conf_tx with the validity check in ieee80211_sta_wmm_params. We will now pick the default values whenever we get a zeroed WMM IE. This has been reported here: https://bugzilla.kernel.org/show_bug.cgi?id=199161 Fixes: f409079bb678 ("mac80211: sanity check CW_min/CW_max towards driver") Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 20d2b186d740..07b58d20e89c 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1786,7 +1786,8 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local, params[ac].acm = acm; params[ac].uapsd = uapsd; - if (params[ac].cw_min > params[ac].cw_max) { + if (params->cw_min == 0 || + params[ac].cw_min > params[ac].cw_max) { sdata_info(sdata, "AP has invalid WMM params (CWmin/max=%d/%d for ACI %d), using defaults\n", params[ac].cw_min, params[ac].cw_max, aci); -- cgit v1.2.3-70-g09d2 From f0b07bb151b098d291fd1fd71ef7a2df56fb124a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 19:20:32 +0300 Subject: net: Introduce net_rwsem to protect net_namespace_list rtnl_lock() is used everywhere, and contention is very high. When someone wants to iterate over alive net namespaces, he/she has no a possibility to do that without exclusive lock. But the exclusive rtnl_lock() in such places is overkill, and it just increases the contention. Yes, there is already for_each_net_rcu() in kernel, but it requires rcu_read_lock(), and this can't be sleepable. Also, sometimes it may be need really prevent net_namespace_list growth, so for_each_net_rcu() is not fit there. This patch introduces new rw_semaphore, which will be used instead of rtnl_mutex to protect net_namespace_list. It is sleepable and allows not-exclusive iterations over net namespaces list. It allows to stop using rtnl_lock() in several places (what is made in next patches) and makes less the time, we keep rtnl_mutex. Here we just add new lock, while the explanation of we can remove rtnl_lock() there are in next patches. Fine grained locks generally are better, then one big lock, so let's do that with net_namespace_list, while the situation allows that. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- drivers/infiniband/core/roce_gid_mgmt.c | 2 ++ include/linux/rtnetlink.h | 1 + include/net/net_namespace.h | 1 + net/core/dev.c | 5 +++++ net/core/fib_notifier.c | 2 ++ net/core/net_namespace.c | 18 +++++++++++++----- net/core/rtnetlink.c | 5 +++++ net/netfilter/nf_conntrack_core.c | 2 ++ net/openvswitch/datapath.c | 2 ++ net/wireless/wext-core.c | 2 ++ security/selinux/include/xfrm.h | 2 ++ 11 files changed, 37 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 5a52ec77940a..cc2966380c0c 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -403,10 +403,12 @@ static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, * our feet */ rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev)) add_netdev_ips(ib_dev, port, rdma_ndev, ndev); + up_read(&net_rwsem); rtnl_unlock(); } diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index c7d1e4689325..5225832bd6ff 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -37,6 +37,7 @@ extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore pernet_ops_rwsem; +extern struct rw_semaphore net_rwsem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 1ab4f920f109..47e35cce3b64 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -291,6 +291,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #endif } +/* Protected by net_rwsem */ #define for_each_net(VAR) \ list_for_each_entry(VAR, &net_namespace_list, list) diff --git a/net/core/dev.c b/net/core/dev.c index e13807b5c84d..eca5458b2753 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1629,6 +1629,7 @@ int register_netdevice_notifier(struct notifier_block *nb) goto unlock; if (dev_boot_phase) goto unlock; + down_read(&net_rwsem); for_each_net(net) { for_each_netdev(net, dev) { err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); @@ -1642,6 +1643,7 @@ int register_netdevice_notifier(struct notifier_block *nb) call_netdevice_notifier(nb, NETDEV_UP, dev); } } + up_read(&net_rwsem); unlock: rtnl_unlock(); @@ -1664,6 +1666,7 @@ rollback: } outroll: + up_read(&net_rwsem); raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } @@ -1694,6 +1697,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) if (err) goto unlock; + down_read(&net_rwsem); for_each_net(net) { for_each_netdev(net, dev) { if (dev->flags & IFF_UP) { @@ -1704,6 +1708,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } + up_read(&net_rwsem); unlock: rtnl_unlock(); return err; diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 0c048bdeb016..614b985c92a4 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -33,6 +33,7 @@ static unsigned int fib_seq_sum(void) struct net *net; rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) { rcu_read_lock(); list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) { @@ -43,6 +44,7 @@ static unsigned int fib_seq_sum(void) } rcu_read_unlock(); } + up_read(&net_rwsem); rtnl_unlock(); return fib_seq; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b5796d17a302..7fdf321d4997 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -33,6 +33,10 @@ static struct list_head *first_device = &pernet_list; LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); +/* Protects net_namespace_list. Nests iside rtnl_lock() */ +DECLARE_RWSEM(net_rwsem); +EXPORT_SYMBOL_GPL(net_rwsem); + struct net init_net = { .count = REFCOUNT_INIT(1), .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), @@ -309,9 +313,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) if (error < 0) goto out_undo; } - rtnl_lock(); + down_write(&net_rwsem); list_add_tail_rcu(&net->list, &net_namespace_list); - rtnl_unlock(); + up_write(&net_rwsem); out: return error; @@ -450,7 +454,7 @@ static void unhash_nsid(struct net *net, struct net *last) * and this work is the only process, that may delete * a net from net_namespace_list. So, when the below * is executing, the list may only grow. Thus, we do not - * use for_each_net_rcu() or rtnl_lock(). + * use for_each_net_rcu() or net_rwsem. */ for_each_net(tmp) { int id; @@ -485,7 +489,7 @@ static void cleanup_net(struct work_struct *work) down_read(&pernet_ops_rwsem); /* Don't let anyone else find us. */ - rtnl_lock(); + down_write(&net_rwsem); llist_for_each_entry(net, net_kill_list, cleanup_list) list_del_rcu(&net->list); /* Cache last net. After we unlock rtnl, no one new net @@ -499,7 +503,7 @@ static void cleanup_net(struct work_struct *work) * useless anyway, as netns_ids are destroyed there. */ last = list_last_entry(&net_namespace_list, struct net, list); - rtnl_unlock(); + up_write(&net_rwsem); llist_for_each_entry(net, net_kill_list, cleanup_list) { unhash_nsid(net, last); @@ -900,6 +904,9 @@ static int __register_pernet_operations(struct list_head *list, list_add_tail(&ops->list, list); if (ops->init || (ops->id && ops->size)) { + /* We held write locked pernet_ops_rwsem, and parallel + * setup_net() and cleanup_net() are not possible. + */ for_each_net(net) { error = ops_init(ops, net); if (error) @@ -923,6 +930,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) LIST_HEAD(net_exit_list); list_del(&ops->list); + /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); ops_exit_list(ops, &net_exit_list); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2d3949789cef..e86b28482ca7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -418,9 +418,11 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; + down_read(&net_rwsem); for_each_net(net) { __rtnl_kill_links(net, ops); } + up_read(&net_rwsem); list_del(&ops->list); } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); @@ -438,6 +440,9 @@ static void rtnl_lock_unregistering_all(void) for (;;) { unregistering = false; rtnl_lock(); + /* We held write locked pernet_ops_rwsem, and parallel + * setup_net() and cleanup_net() are not possible. + */ for_each_net(net) { if (net->dev_unreg_count > 0) { unregistering = true; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 705198de671d..370f9b7f051b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1764,12 +1764,14 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) struct net *net; rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) { if (atomic_read(&net->ct.count) == 0) continue; __nf_ct_unconfirmed_destroy(net); nf_queue_nf_hook_drop(net); } + up_read(&net_rwsem); rtnl_unlock(); /* Need to wait for netns cleanup worker to finish, if its diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ef38e5aecd28..9746ee30a99b 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2364,8 +2364,10 @@ static void __net_exit ovs_exit_net(struct net *dnet) __dp_destroy(dp); rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) list_vports_from_net(net, dnet, &head); + up_read(&net_rwsem); rtnl_unlock(); /* Detach all vports from given namespace. */ diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 9efbfc753347..544d7b62d7ca 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -349,11 +349,13 @@ void wireless_nlevent_flush(void) ASSERT_RTNL(); + down_read(&net_rwsem); for_each_net(net) { while ((skb = skb_dequeue(&net->wext_nlevents))) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); } + up_read(&net_rwsem); } EXPORT_SYMBOL_GPL(wireless_nlevent_flush); diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index 1f173a7a4daa..31d66431be1e 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -48,8 +48,10 @@ static inline void selinux_xfrm_notify_policyload(void) struct net *net; rtnl_lock(); + down_read(&net_rwsem); for_each_net(net) rt_genid_bump_all(net); + up_read(&net_rwsem); rtnl_unlock(); } #else -- cgit v1.2.3-70-g09d2 From 10256debb918aea083d0ddada64d29014c642a7b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 19:20:44 +0300 Subject: net: Don't take rtnl_lock() in wireless_nlevent_flush() This function iterates over net_namespace_list and flushes the queue for every of them. What does this rtnl_lock() protects?! Since we may add skbs to net::wext_nlevents without rtnl_lock(), it does not protects us about queuers. It guarantees, two threads can't flush the queue in parallel, that can change the order, but since skb can be queued in any order, it doesn't matter, how many threads do this in parallel. In case of several threads, this will be even faster. So, we can remove rtnl_lock() here, as it was used for iteration over net_namespace_list only. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/wireless/wext-core.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 544d7b62d7ca..5e677dac2a0c 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -347,8 +347,6 @@ void wireless_nlevent_flush(void) struct sk_buff *skb; struct net *net; - ASSERT_RTNL(); - down_read(&net_rwsem); for_each_net(net) { while ((skb = skb_dequeue(&net->wext_nlevents))) @@ -412,9 +410,7 @@ subsys_initcall(wireless_nlevent_init); /* Process events generated by the wireless layer or the driver. */ static void wireless_nlevent_process(struct work_struct *work) { - rtnl_lock(); wireless_nlevent_flush(); - rtnl_unlock(); } static DECLARE_WORK(wireless_nlevent_work, wireless_nlevent_process); -- cgit v1.2.3-70-g09d2 From ec9c780925c57588637e1dbd8650d294107311c0 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 19:21:09 +0300 Subject: ovs: Remove rtnl_lock() from ovs_exit_net() Here we iterate for_each_net() and removes vport from alive net to the exiting net. ovs_net::dps are protected by ovs_mutex(), and the others, who change it (ovs_dp_cmd_new(), __dp_destroy()) also take it. The same with datapath::ports list. So, we remove rtnl_lock() here. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9746ee30a99b..015e24e08909 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2363,12 +2363,10 @@ static void __net_exit ovs_exit_net(struct net *dnet) list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); - rtnl_lock(); down_read(&net_rwsem); for_each_net(net) list_vports_from_net(net, dnet, &head); up_read(&net_rwsem); - rtnl_unlock(); /* Detach all vports from given namespace. */ list_for_each_entry_safe(vport, vport_next, &head, detach_list) { -- cgit v1.2.3-70-g09d2 From 152f253152cc08f5d26ba76659723d2d83b363f8 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 19:21:20 +0300 Subject: net: Remove rtnl_lock() in nf_ct_iterate_destroy() rtnl_lock() doesn't protect net::ct::count, and it's not needed for__nf_ct_unconfirmed_destroy() and for nf_queue_nf_hook_drop(). Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 370f9b7f051b..41ff04ee2554 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1763,7 +1763,6 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) { struct net *net; - rtnl_lock(); down_read(&net_rwsem); for_each_net(net) { if (atomic_read(&net->ct.count) == 0) @@ -1772,7 +1771,6 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) nf_queue_nf_hook_drop(net); } up_read(&net_rwsem); - rtnl_unlock(); /* Need to wait for netns cleanup worker to finish, if its * running -- it might have deleted a net namespace from -- cgit v1.2.3-70-g09d2 From c30d9356e9e8ed0735c1215e187b03d3ae8b4966 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 27 Mar 2018 18:21:55 -0700 Subject: net: Fix fib notifer to return errno Notifier handlers use notifier_from_errno to convert any potential error to an encoded format. As a consequence the other side, call_fib_notifier{s} in this case, needs to use notifier_to_errno to return the error from the handler back to its caller. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/core/fib_notifier.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c index 614b985c92a4..13a40b831d6d 100644 --- a/net/core/fib_notifier.c +++ b/net/core/fib_notifier.c @@ -13,16 +13,22 @@ int call_fib_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { + int err; + info->net = net; - return nb->notifier_call(nb, event_type, info); + err = nb->notifier_call(nb, event_type, info); + return notifier_to_errno(err); } EXPORT_SYMBOL(call_fib_notifier); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { + int err; + info->net = net; - return atomic_notifier_call_chain(&fib_chain, event_type, info); + err = atomic_notifier_call_chain(&fib_chain, event_type, info); + return notifier_to_errno(err); } EXPORT_SYMBOL(call_fib_notifiers); -- cgit v1.2.3-70-g09d2 From 9776d32537d2bbc251fd1de651e2bb2439474bde Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 27 Mar 2018 18:21:56 -0700 Subject: net: Move call_fib_rule_notifiers up in fib_nl_newrule Move call_fib_rule_notifiers up in fib_nl_newrule to the point right before the rule is inserted into the list. At this point there are no more failure paths within the core rule code, so if the notifier does not fail then the rule will be inserted into the list. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/core/fib_rules.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 9d87ce868402..33958f84c173 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -631,6 +631,11 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout_free; + err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, + extack); + if (err < 0) + goto errout_free; + list_for_each_entry(r, &ops->rules_list, list) { if (r->pref > rule->pref) break; @@ -667,7 +672,6 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if (rule->tun_id) ip_tunnel_need_metadata(); - call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack); notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); -- cgit v1.2.3-70-g09d2 From 6635f311eab40b6d97eb884f371be41d0f5a3ed6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 27 Mar 2018 18:21:57 -0700 Subject: net/ipv4: Move call_fib_entry_notifiers up for new routes Move call to call_fib_entry_notifiers for new IPv4 routes to right before the call to fib_insert_alias. At this point the only remaining failure path is memory allocations in fib_insert_node. Handle that very unlikely failure with a call to call_fib_entry_notifiers to tell drivers about it. At this point notifier handlers can decide the fate of the new route with a clean path to delete the potential new entry if the notifier returns non-0. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index fac0b73e24d1..67116233e2bc 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1065,6 +1065,9 @@ noleaf: return -ENOMEM; } +/* fib notifier for ADD is sent before calling fib_insert_alias with + * the expectation that the only possible failure ENOMEM + */ static int fib_insert_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *new, struct fib_alias *fa, t_key key) @@ -1263,21 +1266,32 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; + err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); + if (err) + goto out_free_new_fa; + /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) - goto out_free_new_fa; + goto out_fib_notif; if (!plen) tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: return 0; +out_fib_notif: + /* notifier was sent that entry would be added to trie, but + * the add failed and need to recover. Only failure for + * fib_insert_alias is ENOMEM. + */ + NL_SET_ERR_MSG(extack, "Failed to insert route into trie"); + call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, + plen, new_fa, NULL); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: -- cgit v1.2.3-70-g09d2 From c1d7ee67acb54b7dc1408929ff70dfe46993e517 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 27 Mar 2018 18:21:58 -0700 Subject: net/ipv4: Allow notifier to fail route replace Add checking to call to call_fib_entry_notifiers for IPv4 route replace. Allows a notifier handler to fail the replace. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 67116233e2bc..3dcffd3ce98c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1219,8 +1219,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - key, plen, new_fa, extack); + err = call_fib_entry_notifiers(net, + FIB_EVENT_ENTRY_REPLACE, + key, plen, new_fa, + extack); + if (err) + goto out_free_new_fa; + rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); -- cgit v1.2.3-70-g09d2 From 2233000cba40ee0784a2d5b5e2b2c38c1159a7ef Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 27 Mar 2018 18:21:59 -0700 Subject: net/ipv6: Move call_fib6_entry_notifiers up for route adds Move call to call_fib6_entry_notifiers for new IPv6 routes to right before the insertion into the FIB. At this point notifier handlers can decide the fate of the new route with a clean path to delete the potential new entry if the notifier returns non-0. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 908b8e5b615a..deab2db6692e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1007,12 +1007,16 @@ add: if (err) return err; + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_ADD, + rt, extack); + if (err) + return err; + rcu_assign_pointer(rt->rt6_next, iter); atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rcu_assign_pointer(*ins, rt); - call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, - rt, extack); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; @@ -1036,12 +1040,16 @@ add: if (err) return err; + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_REPLACE, + rt, extack); + if (err) + return err; + atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rt->rt6_next = iter->rt6_next; rcu_assign_pointer(*ins, rt); - call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, - rt, extack); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { -- cgit v1.2.3-70-g09d2 From 18dcbe12fe9fca0ab825f7eff993060525ac2503 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Mar 2018 19:49:42 -0700 Subject: ipv6: export ip6 fragments sysctl to unprivileged users IPv4 was changed in commit 52a773d645e9 ("net: Export ip fragment sysctl to unprivileged users") The only sysctl that is not per-netns is not used : ip6frag_secret_interval Signed-off-by: Eric Dumazet Cc: Nikolay Borisov Signed-off-by: David S. Miller --- net/ipv6/reassembly.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index afbc000ad4f2..08a139f14d0f 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -650,10 +650,6 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) table[1].data = &net->ipv6.frags.low_thresh; table[1].extra2 = &net->ipv6.frags.high_thresh; table[2].data = &net->ipv6.frags.timeout; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[0].procname = NULL; } hdr = register_net_sysctl(net, "net/ipv6", table); -- cgit v1.2.3-70-g09d2 From 7ae665f132a62e67ccef1ef0994acba51abc2400 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 28 Mar 2018 16:14:56 +0200 Subject: sctp: fix unused lable warning The proc file cleanup left a label possibly unused: net/sctp/protocol.c: In function 'sctp_defaults_init': net/sctp/protocol.c:1304:1: error: label 'err_init_proc' defined but not used [-Werror=unused-label] This adds an #ifdef around it to match the respective 'goto'. Fixes: d47d08c8ca05 ("sctp: use proc_remove_subtree()") Signed-off-by: Arnd Bergmann Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/protocol.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 84a09f599131..a24cde236330 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1258,8 +1258,10 @@ static int __net_init sctp_defaults_init(struct net *net) return 0; +#ifdef CONFIG_PROC_FS err_init_proc: cleanup_sctp_mibs(net); +#endif err_init_mibs: sctp_sysctl_net_unregister(net); err_sysctl_register: -- cgit v1.2.3-70-g09d2 From 8934ce2fd08171e8605f7fada91ee7619fe17ab8 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 28 Mar 2018 12:49:15 -0700 Subject: bpf: sockmap redirect ingress support Add support for the BPF_F_INGRESS flag in sk_msg redirect helper. To do this add a scatterlist ring for receiving socks to check before calling into regular recvmsg call path. Additionally, because the poll wakeup logic only checked the skb recv queue we need to add a hook in TCP stack (similar to write side) so that we have a way to wake up polling socks when a scatterlist is redirected to that sock. After this all that is needed is for the redirect helper to push the scatterlist into the psock receive queue. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + include/net/sock.h | 1 + kernel/bpf/sockmap.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++- net/core/filter.c | 2 +- net/ipv4/tcp.c | 10 ++- 5 files changed, 207 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index c2f167db8bd5..961cc5d53956 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -521,6 +521,7 @@ struct sk_msg_buff { __u32 key; __u32 flags; struct bpf_map *map; + struct list_head list; }; /* Compute the linear packet data range [data, data_end) which diff --git a/include/net/sock.h b/include/net/sock.h index 709311132d4c..b8ff435fa96e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1085,6 +1085,7 @@ struct proto { #endif bool (*stream_memory_free)(const struct sock *sk); + bool (*stream_memory_read)(const struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); void (*leave_memory_pressure)(struct sock *sk); diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 69c5bccabd22..402e15466e9f 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) @@ -82,6 +84,7 @@ struct smap_psock { int sg_size; int eval; struct sk_msg_buff *cork; + struct list_head ingress; struct strparser strp; struct bpf_prog *bpf_tx_msg; @@ -103,6 +106,8 @@ struct smap_psock { }; static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len); static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); @@ -112,6 +117,21 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } +static bool bpf_tcp_stream_read(const struct sock *sk) +{ + struct smap_psock *psock; + bool empty = true; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; + empty = list_empty(&psock->ingress); +out: + rcu_read_unlock(); + return !empty; +} + static struct proto tcp_bpf_proto; static int bpf_tcp_init(struct sock *sk) { @@ -135,6 +155,8 @@ static int bpf_tcp_init(struct sock *sk) if (psock->bpf_tx_msg) { tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; tcp_bpf_proto.sendpage = bpf_tcp_sendpage; + tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg; + tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read; } sk->sk_prot = &tcp_bpf_proto; @@ -170,6 +192,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); struct smap_psock_map_entry *e, *tmp; + struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; struct sock *osk; @@ -188,6 +211,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout) close_fun = psock->save_close; write_lock_bh(&sk->sk_callback_lock); + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { + list_del(&md->list); + free_start_sg(psock->sock, md); + kfree(md); + } + list_for_each_entry_safe(e, tmp, &psock->maps, list) { osk = cmpxchg(e->entry, sk, NULL); if (osk == sk) { @@ -468,6 +497,72 @@ verdict: return _rc; } +static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, + struct smap_psock *psock, + struct sk_msg_buff *md, int flags) +{ + bool apply = apply_bytes; + size_t size, copied = 0; + struct sk_msg_buff *r; + int err = 0, i; + + r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL); + if (unlikely(!r)) + return -ENOMEM; + + lock_sock(sk); + r->sg_start = md->sg_start; + i = md->sg_start; + + do { + r->sg_data[i] = md->sg_data[i]; + + size = (apply && apply_bytes < md->sg_data[i].length) ? + apply_bytes : md->sg_data[i].length; + + if (!sk_wmem_schedule(sk, size)) { + if (!copied) + err = -ENOMEM; + break; + } + + sk_mem_charge(sk, size); + r->sg_data[i].length = size; + md->sg_data[i].length -= size; + md->sg_data[i].offset += size; + copied += size; + + if (md->sg_data[i].length) { + get_page(sg_page(&r->sg_data[i])); + r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1; + } else { + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + r->sg_end = i; + } + + if (apply) { + apply_bytes -= size; + if (!apply_bytes) + break; + } + } while (i != md->sg_end); + + md->sg_start = i; + + if (!err) { + list_add_tail(&r->list, &psock->ingress); + sk->sk_data_ready(sk); + } else { + free_start_sg(sk, r); + kfree(r); + } + + release_sock(sk); + return err; +} + static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, struct sk_msg_buff *md, int flags) @@ -475,6 +570,7 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, struct smap_psock *psock; struct scatterlist *sg; int i, err, free = 0; + bool ingress = !!(md->flags & BPF_F_INGRESS); sg = md->sg_data; @@ -487,9 +583,14 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, goto out_rcu; rcu_read_unlock(); - lock_sock(sk); - err = bpf_tcp_push(sk, send, md, flags, false); - release_sock(sk); + + if (ingress) { + err = bpf_tcp_ingress(sk, send, psock, md, flags); + } else { + lock_sock(sk); + err = bpf_tcp_push(sk, send, md, flags, false); + release_sock(sk); + } smap_release_sock(psock, sk); if (unlikely(err)) goto out; @@ -623,6 +724,89 @@ out_err: return err; } +static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct iov_iter *iter = &msg->msg_iter; + struct smap_psock *psock; + int copied = 0; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; + + if (unlikely(!refcount_inc_not_zero(&psock->refcnt))) + goto out; + rcu_read_unlock(); + + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + lock_sock(sk); + while (copied != len) { + struct scatterlist *sg; + struct sk_msg_buff *md; + int i; + + md = list_first_entry_or_null(&psock->ingress, + struct sk_msg_buff, list); + if (unlikely(!md)) + break; + i = md->sg_start; + do { + struct page *page; + int n, copy; + + sg = &md->sg_data[i]; + copy = sg->length; + page = sg_page(sg); + + if (copied + copy > len) + copy = len - copied; + + n = copy_page_to_iter(page, sg->offset, copy, iter); + if (n != copy) { + md->sg_start = i; + release_sock(sk); + smap_release_sock(psock, sk); + return -EFAULT; + } + + copied += copy; + sg->offset += copy; + sg->length -= copy; + sk_mem_uncharge(sk, copy); + + if (!sg->length) { + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + put_page(page); + } + if (copied == len) + break; + } while (i != md->sg_end); + md->sg_start = i; + + if (!sg->length && md->sg_start == md->sg_end) { + list_del(&md->list); + kfree(md); + } + } + + release_sock(sk); + smap_release_sock(psock, sk); + return copied; +out: + rcu_read_unlock(); + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); +} + + static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; @@ -1107,6 +1291,7 @@ static void sock_map_remove_complete(struct bpf_stab *stab) static void smap_gc_work(struct work_struct *w) { struct smap_psock_map_entry *e, *tmp; + struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; psock = container_of(w, struct smap_psock, gc_work); @@ -1131,6 +1316,12 @@ static void smap_gc_work(struct work_struct *w) kfree(psock->cork); } + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { + list_del(&md->list); + free_start_sg(psock->sock, md); + kfree(md); + } + list_for_each_entry_safe(e, tmp, &psock->maps, list) { list_del(&e->list); kfree(e); @@ -1160,6 +1351,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, INIT_WORK(&psock->tx_work, smap_tx_work); INIT_WORK(&psock->gc_work, smap_gc_work); INIT_LIST_HEAD(&psock->maps); + INIT_LIST_HEAD(&psock->ingress); refcount_set(&psock->refcnt, 1); rcu_assign_sk_user_data(sock, psock); diff --git a/net/core/filter.c b/net/core/filter.c index afd825534ac4..a5a995e5b380 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1894,7 +1894,7 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, struct bpf_map *, map, u32, key, u64, flags) { /* If user passes invalid input drop the packet. */ - if (unlikely(flags)) + if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; msg->key = key; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0c31be306572..bccc4c270087 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -485,6 +485,14 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) } } +static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, + int target, struct sock *sk) +{ + return (tp->rcv_nxt - tp->copied_seq >= target) || + (sk->sk_prot->stream_memory_read ? + sk->sk_prot->stream_memory_read(sk) : false); +} + /* * Wait for a TCP event. * @@ -554,7 +562,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - if (tp->rcv_nxt - tp->copied_seq >= target) + if (tcp_stream_is_readable(tp, target, sk)) mask |= EPOLLIN | EPOLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { -- cgit v1.2.3-70-g09d2 From fa246693a111fab32bd51d20f07a347e42773ee9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 28 Mar 2018 12:49:25 -0700 Subject: bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT: Add support for the BPF_F_INGRESS flag in skb redirect helper. To do this convert skb into a scatterlist and push into ingress queue. This is the same logic that is used in the sk_msg redirect helper so it should feel familiar. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + kernel/bpf/sockmap.c | 94 ++++++++++++++++++++++++++++++++++++++++---------- net/core/filter.c | 2 +- 3 files changed, 78 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 961cc5d53956..897ff3d95968 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -521,6 +521,7 @@ struct sk_msg_buff { __u32 key; __u32 flags; struct bpf_map *map; + struct sk_buff *skb; struct list_head list; }; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 402e15466e9f..9192fdbcbccc 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -785,7 +785,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, i++; if (i == MAX_SKB_FRAGS) i = 0; - put_page(page); + if (!md->skb) + put_page(page); } if (copied == len) break; @@ -794,6 +795,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (!sg->length && md->sg_start == md->sg_end) { list_del(&md->list); + if (md->skb) + consume_skb(md->skb); kfree(md); } } @@ -1045,27 +1048,72 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) __SK_DROP; } +static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb) +{ + struct sock *sk = psock->sock; + int copied = 0, num_sg; + struct sk_msg_buff *r; + + r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC); + if (unlikely(!r)) + return -EAGAIN; + + if (!sk_rmem_schedule(sk, skb, skb->len)) { + kfree(r); + return -EAGAIN; + } + + sg_init_table(r->sg_data, MAX_SKB_FRAGS); + num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len); + if (unlikely(num_sg < 0)) { + kfree(r); + return num_sg; + } + sk_mem_charge(sk, skb->len); + copied = skb->len; + r->sg_start = 0; + r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg; + r->skb = skb; + list_add_tail(&r->list, &psock->ingress); + sk->sk_data_ready(sk); + return copied; +} + static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) { + struct smap_psock *peer; struct sock *sk; + __u32 in; int rc; rc = smap_verdict_func(psock, skb); switch (rc) { case __SK_REDIRECT: sk = do_sk_redirect_map(skb); - if (likely(sk)) { - struct smap_psock *peer = smap_psock_sk(sk); - - if (likely(peer && - test_bit(SMAP_TX_RUNNING, &peer->state) && - !sock_flag(sk, SOCK_DEAD) && - sock_writeable(sk))) { - skb_set_owner_w(skb, sk); - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } + if (!sk) { + kfree_skb(skb); + break; + } + + peer = smap_psock_sk(sk); + in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; + + if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) || + !test_bit(SMAP_TX_RUNNING, &peer->state))) { + kfree_skb(skb); + break; + } + + if (!in && sock_writeable(sk)) { + skb_set_owner_w(skb, sk); + skb_queue_tail(&peer->rxqueue, skb); + schedule_work(&peer->tx_work); + break; + } else if (in && + atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { + skb_queue_tail(&peer->rxqueue, skb); + schedule_work(&peer->tx_work); + break; } /* Fall through and free skb otherwise */ case __SK_DROP: @@ -1127,15 +1175,23 @@ static void smap_tx_work(struct work_struct *w) } while ((skb = skb_dequeue(&psock->rxqueue))) { + __u32 flags; + rem = skb->len; off = 0; start: + flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; do { - if (likely(psock->sock->sk_socket)) - n = skb_send_sock_locked(psock->sock, - skb, off, rem); - else + if (likely(psock->sock->sk_socket)) { + if (flags) + n = smap_do_ingress(psock, skb); + else + n = skb_send_sock_locked(psock->sock, + skb, off, rem); + } else { n = -EINVAL; + } + if (n <= 0) { if (n == -EAGAIN) { /* Retry when space is available */ @@ -1153,7 +1209,9 @@ start: rem -= n; off += n; } while (rem); - kfree_skb(skb); + + if (!flags) + kfree_skb(skb); } out: release_sock(psock->sock); diff --git a/net/core/filter.c b/net/core/filter.c index a5a995e5b380..e989bf313195 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1855,7 +1855,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); /* If user passes invalid input drop the packet. */ - if (unlikely(flags)) + if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; tcb->bpf.key = key; -- cgit v1.2.3-70-g09d2 From 39c202d228c3da5a5531be847e9b06cc9b787f31 Mon Sep 17 00:00:00 2001 From: Bernie Harris Date: Wed, 21 Mar 2018 15:42:15 +1300 Subject: netfilter: ebtables: Add support for specifying match revision Currently ebtables assumes that the revision number of all match modules is 0, which is an issue when trying to use existing xtables matches with ebtables. The solution is to modify ebtables to allow extensions to specify a revision number, similar to iptables. This gets passed down to the kernel, which is then able to find the match module correctly. To main binary backwards compatibility, the size of the ebt_entry structures is not changed, only the size of the name field is decreased by 1 byte to make room for the revision field. Signed-off-by: Bernie Harris Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_bridge/ebtables.h | 16 +++++++-- net/bridge/netfilter/ebtables.c | 47 ++++++++++++++++---------- 2 files changed, 42 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index 9ff57c0a0199..0c7dc8315013 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -20,6 +20,7 @@ #define EBT_TABLE_MAXNAMELEN 32 #define EBT_CHAIN_MAXNAMELEN EBT_TABLE_MAXNAMELEN #define EBT_FUNCTION_MAXNAMELEN EBT_TABLE_MAXNAMELEN +#define EBT_EXTENSION_MAXNAMELEN 31 /* verdicts >0 are "branches" */ #define EBT_ACCEPT -1 @@ -120,7 +121,10 @@ struct ebt_entries { struct ebt_entry_match { union { - char name[EBT_FUNCTION_MAXNAMELEN]; + struct { + char name[EBT_EXTENSION_MAXNAMELEN]; + uint8_t revision; + }; struct xt_match *match; } u; /* size of data */ @@ -130,7 +134,10 @@ struct ebt_entry_match { struct ebt_entry_watcher { union { - char name[EBT_FUNCTION_MAXNAMELEN]; + struct { + char name[EBT_EXTENSION_MAXNAMELEN]; + uint8_t revision; + }; struct xt_target *watcher; } u; /* size of data */ @@ -140,7 +147,10 @@ struct ebt_entry_watcher { struct ebt_entry_target { union { - char name[EBT_FUNCTION_MAXNAMELEN]; + struct { + char name[EBT_EXTENSION_MAXNAMELEN]; + uint8_t revision; + }; struct xt_target *target; } u; /* size of data */ diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 9a26d2b7420f..a8cb543e3296 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -356,12 +356,12 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par, left - sizeof(struct ebt_entry_match) < m->match_size) return -EINVAL; - match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0); + match = xt_find_match(NFPROTO_BRIDGE, m->u.name, m->u.revision); if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) { if (!IS_ERR(match)) module_put(match->me); request_module("ebt_%s", m->u.name); - match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0); + match = xt_find_match(NFPROTO_BRIDGE, m->u.name, m->u.revision); } if (IS_ERR(match)) return PTR_ERR(match); @@ -1350,16 +1350,17 @@ static int update_counters(struct net *net, const void __user *user, static inline int ebt_obj_to_user(char __user *um, const char *_name, const char *data, int entrysize, - int usersize, int datasize) + int usersize, int datasize, u8 revision) { - char name[EBT_FUNCTION_MAXNAMELEN] = {0}; + char name[EBT_EXTENSION_MAXNAMELEN] = {0}; - /* ebtables expects 32 bytes long names but xt_match names are 29 bytes + /* ebtables expects 31 bytes long names but xt_match names are 29 bytes * long. Copy 29 bytes and fill remaining bytes with zeroes. */ strlcpy(name, _name, sizeof(name)); - if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) || - put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) || + if (copy_to_user(um, name, EBT_EXTENSION_MAXNAMELEN) || + put_user(revision, (u8 __user *)(um + EBT_EXTENSION_MAXNAMELEN)) || + put_user(datasize, (int __user *)(um + EBT_EXTENSION_MAXNAMELEN + 1)) || xt_data_to_user(um + entrysize, data, usersize, datasize, XT_ALIGN(datasize))) return -EFAULT; @@ -1372,7 +1373,8 @@ static inline int ebt_match_to_user(const struct ebt_entry_match *m, { return ebt_obj_to_user(ubase + ((char *)m - base), m->u.match->name, m->data, sizeof(*m), - m->u.match->usersize, m->match_size); + m->u.match->usersize, m->match_size, + m->u.match->revision); } static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w, @@ -1380,7 +1382,8 @@ static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w, { return ebt_obj_to_user(ubase + ((char *)w - base), w->u.watcher->name, w->data, sizeof(*w), - w->u.watcher->usersize, w->watcher_size); + w->u.watcher->usersize, w->watcher_size, + w->u.watcher->revision); } static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base, @@ -1411,7 +1414,8 @@ static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base, if (ret != 0) return ret; ret = ebt_obj_to_user(hlp, t->u.target->name, t->data, sizeof(*t), - t->u.target->usersize, t->target_size); + t->u.target->usersize, t->target_size, + t->u.target->revision); if (ret != 0) return ret; @@ -1599,7 +1603,10 @@ struct compat_ebt_replace { /* struct ebt_entry_match, _target and _watcher have same layout */ struct compat_ebt_entry_mwt { union { - char name[EBT_FUNCTION_MAXNAMELEN]; + struct { + char name[EBT_EXTENSION_MAXNAMELEN]; + u8 revision; + }; compat_uptr_t ptr; } u; compat_uint_t match_size; @@ -1638,8 +1645,9 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr, BUG_ON(off >= m->match_size); - if (copy_to_user(cm->u.name, match->name, - strlen(match->name) + 1) || put_user(msize, &cm->match_size)) + if (copy_to_user(cm->u.name, match->name, strlen(match->name) + 1) || + put_user(match->revision, &cm->u.revision) || + put_user(msize, &cm->match_size)) return -EFAULT; if (match->compat_to_user) { @@ -1668,8 +1676,9 @@ static int compat_target_to_user(struct ebt_entry_target *t, BUG_ON(off >= t->target_size); - if (copy_to_user(cm->u.name, target->name, - strlen(target->name) + 1) || put_user(tsize, &cm->match_size)) + if (copy_to_user(cm->u.name, target->name, strlen(target->name) + 1) || + put_user(target->revision, &cm->u.revision) || + put_user(tsize, &cm->match_size)) return -EFAULT; if (target->compat_to_user) { @@ -1933,7 +1942,7 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, struct ebt_entries_buf_state *state, const unsigned char *base) { - char name[EBT_FUNCTION_MAXNAMELEN]; + char name[EBT_EXTENSION_MAXNAMELEN]; struct xt_match *match; struct xt_target *wt; void *dst = NULL; @@ -1947,7 +1956,8 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, switch (compat_mwt) { case EBT_COMPAT_MATCH: - match = xt_request_find_match(NFPROTO_BRIDGE, name, 0); + match = xt_request_find_match(NFPROTO_BRIDGE, name, + mwt->u.revision); if (IS_ERR(match)) return PTR_ERR(match); @@ -1966,7 +1976,8 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, break; case EBT_COMPAT_WATCHER: /* fallthrough */ case EBT_COMPAT_TARGET: - wt = xt_request_find_target(NFPROTO_BRIDGE, name, 0); + wt = xt_request_find_target(NFPROTO_BRIDGE, name, + mwt->u.revision); if (IS_ERR(wt)) return PTR_ERR(wt); off = xt_compat_target_offset(wt); -- cgit v1.2.3-70-g09d2 From 1be3ac98444066a292de02c2c12e203bdf575e7d Mon Sep 17 00:00:00 2001 From: Bernie Harris Date: Wed, 21 Mar 2018 15:42:16 +1300 Subject: netfilter: ebtables: Add string filter This patch is part of a proposal to add a string filter to ebtables, which would be similar to the string filter in iptables. Like iptables, the ebtables filter uses the xt_string module. Signed-off-by: Bernie Harris Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_string.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 423293ee57c2..be1feddadcf0 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -21,6 +21,7 @@ MODULE_DESCRIPTION("Xtables: string-based matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_string"); MODULE_ALIAS("ip6t_string"); +MODULE_ALIAS("ebt_string"); static bool string_mt(const struct sk_buff *skb, struct xt_action_param *par) -- cgit v1.2.3-70-g09d2 From 9124a20d8794663a396b5d6f91f66903848a042b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 21 Mar 2018 04:03:22 -0700 Subject: netfilter: ebt_stp: Use generic functions for comparisons Instead of unnecessary const declarations, use the generic functions to save a little object space. $ size net/bridge/netfilter/ebt_stp.o* text data bss dec hex filename 1250 144 0 1394 572 net/bridge/netfilter/ebt_stp.o.new 1344 144 0 1488 5d0 net/bridge/netfilter/ebt_stp.o.old Signed-off-by: Joe Perches Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebt_stp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c index 3140eb912d7e..47ba98db145d 100644 --- a/net/bridge/netfilter/ebt_stp.c +++ b/net/bridge/netfilter/ebt_stp.c @@ -153,8 +153,6 @@ ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par) static int ebt_stp_mt_check(const struct xt_mtchk_param *par) { const struct ebt_stp_info *info = par->matchinfo; - const u8 bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00}; - const u8 msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; const struct ebt_entry *e = par->entryinfo; if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK || @@ -162,8 +160,8 @@ static int ebt_stp_mt_check(const struct xt_mtchk_param *par) return -EINVAL; /* Make sure the match only receives stp frames */ if (!par->nft_compat && - (!ether_addr_equal(e->destmac, bridge_ula) || - !ether_addr_equal(e->destmsk, msk) || + (!ether_addr_equal(e->destmac, eth_stp_addr) || + !is_broadcast_ether_addr(e->destmsk) || !(e->bitmask & EBT_DESTMAC))) return -EINVAL; -- cgit v1.2.3-70-g09d2 From 32537e91847a5686d57d3811c075a46b2d9b6434 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 27 Mar 2018 11:53:05 +0200 Subject: netfilter: nf_tables: rename struct nf_chain_type Use nft_ prefix. By when I added chain types, I forgot to use the nftables prefix. Rename enum nft_chain_type to enum nft_chain_types too, otherwise there is an overlap. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 16 ++++++++-------- net/bridge/netfilter/nf_tables_bridge.c | 2 +- net/ipv4/netfilter/nf_tables_arp.c | 2 +- net/ipv4/netfilter/nf_tables_ipv4.c | 2 +- net/ipv4/netfilter/nft_chain_nat_ipv4.c | 2 +- net/ipv4/netfilter/nft_chain_route_ipv4.c | 2 +- net/ipv6/netfilter/nf_tables_ipv6.c | 2 +- net/ipv6/netfilter/nft_chain_nat_ipv6.c | 2 +- net/ipv6/netfilter/nft_chain_route_ipv6.c | 2 +- net/netfilter/nf_tables_api.c | 18 +++++++++--------- net/netfilter/nf_tables_inet.c | 2 +- net/netfilter/nf_tables_netdev.c | 2 +- 12 files changed, 27 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 663b015dace5..4a304997c304 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -868,7 +868,7 @@ struct nft_chain { char *name; }; -enum nft_chain_type { +enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, NFT_CHAIN_T_NAT, @@ -876,7 +876,7 @@ enum nft_chain_type { }; /** - * struct nf_chain_type - nf_tables chain type info + * struct nft_chain_type - nf_tables chain type info * * @name: name of the type * @type: numeric identifier @@ -885,9 +885,9 @@ enum nft_chain_type { * @hook_mask: mask of valid hooks * @hooks: array of hook functions */ -struct nf_chain_type { +struct nft_chain_type { const char *name; - enum nft_chain_type type; + enum nft_chain_types type; int family; struct module *owner; unsigned int hook_mask; @@ -895,7 +895,7 @@ struct nf_chain_type { }; int nft_chain_validate_dependency(const struct nft_chain *chain, - enum nft_chain_type type); + enum nft_chain_types type); int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); @@ -917,7 +917,7 @@ struct nft_stats { */ struct nft_base_chain { struct nf_hook_ops ops; - const struct nf_chain_type *type; + const struct nft_chain_type *type; u8 policy; u8 flags; struct nft_stats __percpu *stats; @@ -970,8 +970,8 @@ struct nft_table { char *name; }; -int nft_register_chain_type(const struct nf_chain_type *); -void nft_unregister_chain_type(const struct nf_chain_type *); +int nft_register_chain_type(const struct nft_chain_type *); +void nft_unregister_chain_type(const struct nft_chain_type *); int nft_register_expr(struct nft_expr_type *); void nft_unregister_expr(struct nft_expr_type *); diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index 5160cf614176..73a1ec556a0a 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -42,7 +42,7 @@ nft_do_chain_bridge(void *priv, return nft_do_chain(&pkt, priv); } -static const struct nf_chain_type filter_bridge = { +static const struct nft_chain_type filter_bridge = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_BRIDGE, diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 036c074736b0..5b0be2a10b69 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -27,7 +27,7 @@ nft_do_chain_arp(void *priv, return nft_do_chain(&pkt, priv); } -static const struct nf_chain_type filter_arp = { +static const struct nft_chain_type filter_arp = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_ARP, diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index 96f955496d5f..13bae5cfa257 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -30,7 +30,7 @@ static unsigned int nft_do_chain_ipv4(void *priv, return nft_do_chain(&pkt, priv); } -static const struct nf_chain_type filter_ipv4 = { +static const struct nft_chain_type filter_ipv4 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_IPV4, diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index f2a490981594..167f377eb1cb 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -67,7 +67,7 @@ static unsigned int nft_nat_ipv4_local_fn(void *priv, return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain); } -static const struct nf_chain_type nft_chain_nat_ipv4 = { +static const struct nft_chain_type nft_chain_nat_ipv4 = { .name = "nat", .type = NFT_CHAIN_T_NAT, .family = NFPROTO_IPV4, diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index d965c225b9f6..48cf1f892314 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -58,7 +58,7 @@ static unsigned int nf_route_table_hook(void *priv, return ret; } -static const struct nf_chain_type nft_chain_route_ipv4 = { +static const struct nft_chain_type nft_chain_route_ipv4 = { .name = "route", .type = NFT_CHAIN_T_ROUTE, .family = NFPROTO_IPV4, diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c index 17e03589331c..d99f9ac6f1b6 100644 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -28,7 +28,7 @@ static unsigned int nft_do_chain_ipv6(void *priv, return nft_do_chain(&pkt, priv); } -static const struct nf_chain_type filter_ipv6 = { +static const struct nft_chain_type filter_ipv6 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_IPV6, diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index 73fe2bd13fcf..c498aaa8056b 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -65,7 +65,7 @@ static unsigned int nft_nat_ipv6_local_fn(void *priv, return nf_nat_ipv6_local_fn(priv, skb, state, nft_nat_do_chain); } -static const struct nf_chain_type nft_chain_nat_ipv6 = { +static const struct nft_chain_type nft_chain_nat_ipv6 = { .name = "nat", .type = NFT_CHAIN_T_NAT, .family = NFPROTO_IPV6, diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index 11d3c3b9aa18..d5c7fdc34256 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -60,7 +60,7 @@ static unsigned int nf_route_table_hook(void *priv, return ret; } -static const struct nf_chain_type nft_chain_route_ipv6 = { +static const struct nft_chain_type nft_chain_route_ipv6 = { .name = "route", .type = NFT_CHAIN_T_ROUTE, .family = NFPROTO_IPV6, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 92f5606b0dea..bf564f491085 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -384,9 +384,9 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table) return ++table->hgenerator; } -static const struct nf_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX]; +static const struct nft_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX]; -static const struct nf_chain_type * +static const struct nft_chain_type * __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) { int i; @@ -399,10 +399,10 @@ __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family) return NULL; } -static const struct nf_chain_type * +static const struct nft_chain_type * nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload) { - const struct nf_chain_type *type; + const struct nft_chain_type *type; type = __nf_tables_chain_type_lookup(nla, family); if (type != NULL) @@ -859,7 +859,7 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) kfree(ctx->table); } -int nft_register_chain_type(const struct nf_chain_type *ctype) +int nft_register_chain_type(const struct nft_chain_type *ctype) { int err = 0; @@ -878,7 +878,7 @@ out: } EXPORT_SYMBOL_GPL(nft_register_chain_type); -void nft_unregister_chain_type(const struct nf_chain_type *ctype) +void nft_unregister_chain_type(const struct nft_chain_type *ctype) { nfnl_lock(NFNL_SUBSYS_NFTABLES); chain_type[ctype->family][ctype->type] = NULL; @@ -1239,7 +1239,7 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) struct nft_chain_hook { u32 num; s32 priority; - const struct nf_chain_type *type; + const struct nft_chain_type *type; struct net_device *dev; }; @@ -1249,7 +1249,7 @@ static int nft_chain_parse_hook(struct net *net, bool create) { struct nlattr *ha[NFTA_HOOK_MAX + 1]; - const struct nf_chain_type *type; + const struct nft_chain_type *type; struct net_device *dev; int err; @@ -6000,7 +6000,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = { }; int nft_chain_validate_dependency(const struct nft_chain *chain, - enum nft_chain_type type) + enum nft_chain_types type) { const struct nft_base_chain *basechain; diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c index e30c7da09d0d..0aefe66ce558 100644 --- a/net/netfilter/nf_tables_inet.c +++ b/net/netfilter/nf_tables_inet.c @@ -38,7 +38,7 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb, return nft_do_chain(&pkt, priv); } -static const struct nf_chain_type filter_inet = { +static const struct nft_chain_type filter_inet = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_INET, diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 4041fafca934..88ea959211ac 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -38,7 +38,7 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb, return nft_do_chain(&pkt, priv); } -static const struct nf_chain_type nft_filter_chain_netdev = { +static const struct nft_chain_type nft_filter_chain_netdev = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_NETDEV, -- cgit v1.2.3-70-g09d2 From cc07eeb0e5ee18895241460bdccf91a4952731f9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 27 Mar 2018 11:53:06 +0200 Subject: netfilter: nf_tables: nft_register_chain_type() returns void Use WARN_ON() instead since it should not happen that neither family goes over NFPROTO_NUMPROTO nor there is already a chain of this type already registered. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- net/bridge/netfilter/nf_tables_bridge.c | 4 +++- net/ipv4/netfilter/nf_tables_arp.c | 4 +++- net/ipv4/netfilter/nf_tables_ipv4.c | 4 +++- net/ipv4/netfilter/nft_chain_nat_ipv4.c | 6 +----- net/ipv4/netfilter/nft_chain_route_ipv4.c | 4 +++- net/ipv6/netfilter/nf_tables_ipv6.c | 4 +++- net/ipv6/netfilter/nft_chain_nat_ipv6.c | 6 +----- net/ipv6/netfilter/nft_chain_route_ipv6.c | 4 +++- net/netfilter/nf_tables_api.c | 14 +++++--------- net/netfilter/nf_tables_inet.c | 4 +++- net/netfilter/nf_tables_netdev.c | 4 +--- 12 files changed, 30 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4a304997c304..1f7148fe0504 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -970,7 +970,7 @@ struct nft_table { char *name; }; -int nft_register_chain_type(const struct nft_chain_type *); +void nft_register_chain_type(const struct nft_chain_type *); void nft_unregister_chain_type(const struct nft_chain_type *); int nft_register_expr(struct nft_expr_type *); diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index 73a1ec556a0a..ffb8580dfdac 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -63,7 +63,9 @@ static const struct nft_chain_type filter_bridge = { static int __init nf_tables_bridge_init(void) { - return nft_register_chain_type(&filter_bridge); + nft_register_chain_type(&filter_bridge); + + return 0; } static void __exit nf_tables_bridge_exit(void) diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 5b0be2a10b69..c2ee64208743 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -42,7 +42,9 @@ static const struct nft_chain_type filter_arp = { static int __init nf_tables_arp_init(void) { - return nft_register_chain_type(&filter_arp); + nft_register_chain_type(&filter_arp); + + return 0; } static void __exit nf_tables_arp_exit(void) diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c index 13bae5cfa257..c09667de0d68 100644 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ b/net/ipv4/netfilter/nf_tables_ipv4.c @@ -51,7 +51,9 @@ static const struct nft_chain_type filter_ipv4 = { static int __init nf_tables_ipv4_init(void) { - return nft_register_chain_type(&filter_ipv4); + nft_register_chain_type(&filter_ipv4); + + return 0; } static void __exit nf_tables_ipv4_exit(void) diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index 167f377eb1cb..9864f5b3279c 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -86,11 +86,7 @@ static const struct nft_chain_type nft_chain_nat_ipv4 = { static int __init nft_chain_nat_init(void) { - int err; - - err = nft_register_chain_type(&nft_chain_nat_ipv4); - if (err < 0) - return err; + nft_register_chain_type(&nft_chain_nat_ipv4); return 0; } diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c index 48cf1f892314..7d82934c46f4 100644 --- a/net/ipv4/netfilter/nft_chain_route_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c @@ -71,7 +71,9 @@ static const struct nft_chain_type nft_chain_route_ipv4 = { static int __init nft_chain_route_init(void) { - return nft_register_chain_type(&nft_chain_route_ipv4); + nft_register_chain_type(&nft_chain_route_ipv4); + + return 0; } static void __exit nft_chain_route_exit(void) diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c index d99f9ac6f1b6..496f69453457 100644 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ b/net/ipv6/netfilter/nf_tables_ipv6.c @@ -49,7 +49,9 @@ static const struct nft_chain_type filter_ipv6 = { static int __init nf_tables_ipv6_init(void) { - return nft_register_chain_type(&filter_ipv6); + nft_register_chain_type(&filter_ipv6); + + return 0; } static void __exit nf_tables_ipv6_exit(void) diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index c498aaa8056b..c95d9a97d425 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -84,11 +84,7 @@ static const struct nft_chain_type nft_chain_nat_ipv6 = { static int __init nft_chain_nat_ipv6_init(void) { - int err; - - err = nft_register_chain_type(&nft_chain_nat_ipv6); - if (err < 0) - return err; + nft_register_chain_type(&nft_chain_nat_ipv6); return 0; } diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c index d5c7fdc34256..da3f1f8cb325 100644 --- a/net/ipv6/netfilter/nft_chain_route_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c @@ -73,7 +73,9 @@ static const struct nft_chain_type nft_chain_route_ipv6 = { static int __init nft_chain_route_init(void) { - return nft_register_chain_type(&nft_chain_route_ipv6); + nft_register_chain_type(&nft_chain_route_ipv6); + + return 0; } static void __exit nft_chain_route_exit(void) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bf564f491085..9e4b1614ee39 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -859,22 +859,18 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) kfree(ctx->table); } -int nft_register_chain_type(const struct nft_chain_type *ctype) +void nft_register_chain_type(const struct nft_chain_type *ctype) { - int err = 0; - if (WARN_ON(ctype->family >= NFPROTO_NUMPROTO)) - return -EINVAL; + return; nfnl_lock(NFNL_SUBSYS_NFTABLES); - if (chain_type[ctype->family][ctype->type] != NULL) { - err = -EBUSY; - goto out; + if (WARN_ON(chain_type[ctype->family][ctype->type] != NULL)) { + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + return; } chain_type[ctype->family][ctype->type] = ctype; -out: nfnl_unlock(NFNL_SUBSYS_NFTABLES); - return err; } EXPORT_SYMBOL_GPL(nft_register_chain_type); diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c index 0aefe66ce558..202c4219969b 100644 --- a/net/netfilter/nf_tables_inet.c +++ b/net/netfilter/nf_tables_inet.c @@ -59,7 +59,9 @@ static const struct nft_chain_type filter_inet = { static int __init nf_tables_inet_init(void) { - return nft_register_chain_type(&filter_inet); + nft_register_chain_type(&filter_inet); + + return 0; } static void __exit nf_tables_inet_exit(void) diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 88ea959211ac..4c3835bca63e 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -112,9 +112,7 @@ static int __init nf_tables_netdev_init(void) { int ret; - ret = nft_register_chain_type(&nft_filter_chain_netdev); - if (ret) - return ret; + nft_register_chain_type(&nft_filter_chain_netdev); ret = register_netdevice_notifier(&nf_tables_netdev_notifier); if (ret) -- cgit v1.2.3-70-g09d2 From 02c7b25e5f54321b9063e18d4f52cce07f8e081d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 27 Mar 2018 11:53:07 +0200 Subject: netfilter: nf_tables: build-in filter chain type One module per supported filter chain family type takes too much memory for very little code - too much modularization - place all chain filter definitions in one single file. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 + net/bridge/netfilter/Kconfig | 2 +- net/bridge/netfilter/Makefile | 1 - net/bridge/netfilter/nf_tables_bridge.c | 81 ------- net/ipv4/netfilter/Kconfig | 4 +- net/ipv4/netfilter/Makefile | 2 - net/ipv4/netfilter/nf_tables_arp.c | 60 ----- net/ipv4/netfilter/nf_tables_ipv4.c | 69 ------ net/ipv6/netfilter/Kconfig | 2 +- net/ipv6/netfilter/Makefile | 1 - net/ipv6/netfilter/nf_tables_ipv6.c | 67 ------ net/netfilter/Kconfig | 4 +- net/netfilter/Makefile | 9 +- net/netfilter/nf_tables_api.c | 3 + net/netfilter/nf_tables_inet.c | 77 ------ net/netfilter/nf_tables_netdev.c | 140 ----------- net/netfilter/nft_chain_filter.c | 398 ++++++++++++++++++++++++++++++++ 17 files changed, 414 insertions(+), 509 deletions(-) delete mode 100644 net/bridge/netfilter/nf_tables_bridge.c delete mode 100644 net/ipv4/netfilter/nf_tables_arp.c delete mode 100644 net/ipv4/netfilter/nf_tables_ipv4.c delete mode 100644 net/ipv6/netfilter/nf_tables_ipv6.c delete mode 100644 net/netfilter/nf_tables_inet.c delete mode 100644 net/netfilter/nf_tables_netdev.c create mode 100644 net/netfilter/nft_chain_filter.c (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 1f7148fe0504..77c3c04c27ac 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1345,4 +1345,7 @@ struct nft_trans_flowtable { #define nft_trans_flowtable(trans) \ (((struct nft_trans_flowtable *)trans->data)->flowtable) +int __init nft_chain_filter_init(void); +void __exit nft_chain_filter_fini(void); + #endif /* _NET_NF_TABLES_H */ diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 225d1668dfdd..f212447794bd 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -5,7 +5,7 @@ menuconfig NF_TABLES_BRIDGE depends on BRIDGE && NETFILTER && NF_TABLES select NETFILTER_FAMILY_BRIDGE - tristate "Ethernet Bridge nf_tables support" + bool "Ethernet Bridge nf_tables support" if NF_TABLES_BRIDGE diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 2f28e16de6c7..4bc758dd4a8c 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -3,7 +3,6 @@ # Makefile for the netfilter modules for Link Layer filtering on a bridge. # -obj-$(CONFIG_NF_TABLES_BRIDGE) += nf_tables_bridge.o obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c deleted file mode 100644 index ffb8580dfdac..000000000000 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy - * Copyright (c) 2013 Pablo Neira Ayuso - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -static unsigned int -nft_do_chain_bridge(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - - switch (eth_hdr(skb)->h_proto) { - case htons(ETH_P_IP): - nft_set_pktinfo_ipv4_validate(&pkt, skb); - break; - case htons(ETH_P_IPV6): - nft_set_pktinfo_ipv6_validate(&pkt, skb); - break; - default: - nft_set_pktinfo_unspec(&pkt, skb); - break; - } - - return nft_do_chain(&pkt, priv); -} - -static const struct nft_chain_type filter_bridge = { - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .family = NFPROTO_BRIDGE, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_BR_PRE_ROUTING) | - (1 << NF_BR_LOCAL_IN) | - (1 << NF_BR_FORWARD) | - (1 << NF_BR_LOCAL_OUT) | - (1 << NF_BR_POST_ROUTING), - .hooks = { - [NF_BR_PRE_ROUTING] = nft_do_chain_bridge, - [NF_BR_LOCAL_IN] = nft_do_chain_bridge, - [NF_BR_FORWARD] = nft_do_chain_bridge, - [NF_BR_LOCAL_OUT] = nft_do_chain_bridge, - [NF_BR_POST_ROUTING] = nft_do_chain_bridge, - }, -}; - -static int __init nf_tables_bridge_init(void) -{ - nft_register_chain_type(&filter_bridge); - - return 0; -} - -static void __exit nf_tables_bridge_exit(void) -{ - nft_unregister_chain_type(&filter_bridge); -} - -module_init(nf_tables_bridge_init); -module_exit(nf_tables_bridge_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy "); -MODULE_ALIAS_NFT_CHAIN(AF_BRIDGE, "filter"); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index dfe6fa4ea554..280048e1e395 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -34,7 +34,7 @@ config NF_SOCKET_IPV4 if NF_TABLES config NF_TABLES_IPV4 - tristate "IPv4 nf_tables support" + bool "IPv4 nf_tables support" help This option enables the IPv4 support for nf_tables. @@ -71,7 +71,7 @@ config NFT_FIB_IPV4 endif # NF_TABLES_IPV4 config NF_TABLES_ARP - tristate "ARP nf_tables support" + bool "ARP nf_tables support" select NETFILTER_FAMILY_ARP help This option enables the ARP support for nf_tables. diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 2dad20eefd26..62ede5e3a3de 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -39,7 +39,6 @@ obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o -obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o @@ -47,7 +46,6 @@ obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o -obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # flow table support obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c deleted file mode 100644 index c2ee64208743..000000000000 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2008-2010 Patrick McHardy - * Copyright (c) 2013 Pablo Neira Ayuso - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include -#include -#include -#include - -static unsigned int -nft_do_chain_arp(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_unspec(&pkt, skb); - - return nft_do_chain(&pkt, priv); -} - -static const struct nft_chain_type filter_arp = { - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .family = NFPROTO_ARP, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_ARP_IN) | - (1 << NF_ARP_OUT), - .hooks = { - [NF_ARP_IN] = nft_do_chain_arp, - [NF_ARP_OUT] = nft_do_chain_arp, - }, -}; - -static int __init nf_tables_arp_init(void) -{ - nft_register_chain_type(&filter_arp); - - return 0; -} - -static void __exit nf_tables_arp_exit(void) -{ - nft_unregister_chain_type(&filter_arp); -} - -module_init(nf_tables_arp_init); -module_exit(nf_tables_arp_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy "); -MODULE_ALIAS_NFT_CHAIN(3, "filter"); /* NFPROTO_ARP */ diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c deleted file mode 100644 index c09667de0d68..000000000000 --- a/net/ipv4/netfilter/nf_tables_ipv4.c +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy - * Copyright (c) 2012-2013 Pablo Neira Ayuso - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -static unsigned int nft_do_chain_ipv4(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv4(&pkt, skb); - - return nft_do_chain(&pkt, priv); -} - -static const struct nft_chain_type filter_ipv4 = { - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .family = NFPROTO_IPV4, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_FORWARD) | - (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_POST_ROUTING), - .hooks = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, - [NF_INET_LOCAL_OUT] = nft_do_chain_ipv4, - [NF_INET_FORWARD] = nft_do_chain_ipv4, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, - }, -}; - -static int __init nf_tables_ipv4_init(void) -{ - nft_register_chain_type(&filter_ipv4); - - return 0; -} - -static void __exit nf_tables_ipv4_exit(void) -{ - nft_unregister_chain_type(&filter_ipv4); -} - -module_init(nf_tables_ipv4_init); -module_exit(nf_tables_ipv4_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy "); -MODULE_ALIAS_NFT_CHAIN(AF_INET, "filter"); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index d395d1590699..ccbfa83e4bb0 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -34,7 +34,7 @@ config NF_SOCKET_IPV6 if NF_TABLES config NF_TABLES_IPV6 - tristate "IPv6 nf_tables support" + bool "IPv6 nf_tables support" help This option enables the IPv6 support for nf_tables. diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index d984057b8395..44273d6f03a5 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -36,7 +36,6 @@ obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o obj-$(CONFIG_NF_DUP_IPV6) += nf_dup_ipv6.o # nf_tables -obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c deleted file mode 100644 index 496f69453457..000000000000 --- a/net/ipv6/netfilter/nf_tables_ipv6.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2008 Patrick McHardy - * Copyright (c) 2012-2013 Pablo Neira Ayuso - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Development of this code funded by Astaro AG (http://www.astaro.com/) - */ - -#include -#include -#include -#include -#include -#include - -static unsigned int nft_do_chain_ipv6(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - nft_set_pktinfo_ipv6(&pkt, skb); - - return nft_do_chain(&pkt, priv); -} - -static const struct nft_chain_type filter_ipv6 = { - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .family = NFPROTO_IPV6, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_FORWARD) | - (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_POST_ROUTING), - .hooks = { - [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, - [NF_INET_LOCAL_OUT] = nft_do_chain_ipv6, - [NF_INET_FORWARD] = nft_do_chain_ipv6, - [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, - [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, - }, -}; - -static int __init nf_tables_ipv6_init(void) -{ - nft_register_chain_type(&filter_ipv6); - - return 0; -} - -static void __exit nf_tables_ipv6_exit(void) -{ - nft_unregister_chain_type(&filter_ipv6); -} - -module_init(nf_tables_ipv6_init); -module_exit(nf_tables_ipv6_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy "); -MODULE_ALIAS_NFT_CHAIN(AF_INET6, "filter"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index d3220b43c832..704b3832dbad 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -465,12 +465,12 @@ config NF_TABLES_INET depends on IPV6 select NF_TABLES_IPV4 select NF_TABLES_IPV6 - tristate "Netfilter nf_tables mixed IPv4/IPv6 tables support" + bool "Netfilter nf_tables mixed IPv4/IPv6 tables support" help This option enables support for a mixed IPv4/IPv6 "inet" table. config NF_TABLES_NETDEV - tristate "Netfilter nf_tables netdev tables support" + bool "Netfilter nf_tables netdev tables support" help This option enables support for the "netdev" table. diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 5d9b8b959e58..fd32bd2c9521 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -73,13 +73,12 @@ obj-$(CONFIG_NETFILTER_CONNCOUNT) += nf_conncount.o obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o # nf_tables -nf_tables-objs := nf_tables_core.o nf_tables_api.o nf_tables_trace.o \ - nft_immediate.o nft_cmp.o nft_range.o nft_bitwise.o \ - nft_byteorder.o nft_payload.o nft_lookup.o nft_dynset.o +nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ + nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ + nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ + nft_dynset.o obj-$(CONFIG_NF_TABLES) += nf_tables.o -obj-$(CONFIG_NF_TABLES_INET) += nf_tables_inet.o -obj-$(CONFIG_NF_TABLES_NETDEV) += nf_tables_netdev.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_EXTHDR) += nft_exthdr.o obj-$(CONFIG_NFT_META) += nft_meta.o diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9e4b1614ee39..97ec1c388bfe 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6584,6 +6584,8 @@ static int __init nf_tables_module_init(void) { int err; + nft_chain_filter_init(); + info = kmalloc(sizeof(struct nft_expr_info) * NFT_RULE_MAXEXPRS, GFP_KERNEL); if (info == NULL) { @@ -6618,6 +6620,7 @@ static void __exit nf_tables_module_exit(void) rcu_barrier(); nf_tables_core_module_exit(); kfree(info); + nft_chain_filter_fini(); } module_init(nf_tables_module_init); diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c deleted file mode 100644 index 202c4219969b..000000000000 --- a/net/netfilter/nf_tables_inet.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2012-2014 Patrick McHardy - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - - switch (state->pf) { - case NFPROTO_IPV4: - nft_set_pktinfo_ipv4(&pkt, skb); - break; - case NFPROTO_IPV6: - nft_set_pktinfo_ipv6(&pkt, skb); - break; - default: - break; - } - - return nft_do_chain(&pkt, priv); -} - -static const struct nft_chain_type filter_inet = { - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .family = NFPROTO_INET, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_FORWARD) | - (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_POST_ROUTING), - .hooks = { - [NF_INET_LOCAL_IN] = nft_do_chain_inet, - [NF_INET_LOCAL_OUT] = nft_do_chain_inet, - [NF_INET_FORWARD] = nft_do_chain_inet, - [NF_INET_PRE_ROUTING] = nft_do_chain_inet, - [NF_INET_POST_ROUTING] = nft_do_chain_inet, - }, -}; - -static int __init nf_tables_inet_init(void) -{ - nft_register_chain_type(&filter_inet); - - return 0; -} - -static void __exit nf_tables_inet_exit(void) -{ - nft_unregister_chain_type(&filter_inet); -} - -module_init(nf_tables_inet_init); -module_exit(nf_tables_inet_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy "); -MODULE_ALIAS_NFT_CHAIN(1, "filter"); diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c deleted file mode 100644 index 4c3835bca63e..000000000000 --- a/net/netfilter/nf_tables_netdev.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2015 Pablo Neira Ayuso - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -static unsigned int -nft_do_chain_netdev(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nft_pktinfo pkt; - - nft_set_pktinfo(&pkt, skb, state); - - switch (skb->protocol) { - case htons(ETH_P_IP): - nft_set_pktinfo_ipv4_validate(&pkt, skb); - break; - case htons(ETH_P_IPV6): - nft_set_pktinfo_ipv6_validate(&pkt, skb); - break; - default: - nft_set_pktinfo_unspec(&pkt, skb); - break; - } - - return nft_do_chain(&pkt, priv); -} - -static const struct nft_chain_type nft_filter_chain_netdev = { - .name = "filter", - .type = NFT_CHAIN_T_DEFAULT, - .family = NFPROTO_NETDEV, - .owner = THIS_MODULE, - .hook_mask = (1 << NF_NETDEV_INGRESS), - .hooks = { - [NF_NETDEV_INGRESS] = nft_do_chain_netdev, - }, -}; - -static void nft_netdev_event(unsigned long event, struct net_device *dev, - struct nft_ctx *ctx) -{ - struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - - switch (event) { - case NETDEV_UNREGISTER: - if (strcmp(basechain->dev_name, dev->name) != 0) - return; - - __nft_release_basechain(ctx); - break; - case NETDEV_CHANGENAME: - if (dev->ifindex != basechain->ops.dev->ifindex) - return; - - strncpy(basechain->dev_name, dev->name, IFNAMSIZ); - break; - } -} - -static int nf_tables_netdev_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct nft_table *table; - struct nft_chain *chain, *nr; - struct nft_ctx ctx = { - .net = dev_net(dev), - }; - - if (event != NETDEV_UNREGISTER && - event != NETDEV_CHANGENAME) - return NOTIFY_DONE; - - nfnl_lock(NFNL_SUBSYS_NFTABLES); - list_for_each_entry(table, &ctx.net->nft.tables, list) { - if (table->family != NFPROTO_NETDEV) - continue; - - ctx.family = table->family; - ctx.table = table; - list_for_each_entry_safe(chain, nr, &table->chains, list) { - if (!nft_is_base_chain(chain)) - continue; - - ctx.chain = chain; - nft_netdev_event(event, dev, &ctx); - } - } - nfnl_unlock(NFNL_SUBSYS_NFTABLES); - - return NOTIFY_DONE; -} - -static struct notifier_block nf_tables_netdev_notifier = { - .notifier_call = nf_tables_netdev_event, -}; - -static int __init nf_tables_netdev_init(void) -{ - int ret; - - nft_register_chain_type(&nft_filter_chain_netdev); - - ret = register_netdevice_notifier(&nf_tables_netdev_notifier); - if (ret) - goto err_register_netdevice_notifier; - - return 0; - -err_register_netdevice_notifier: - nft_unregister_chain_type(&nft_filter_chain_netdev); - - return ret; -} - -static void __exit nf_tables_netdev_exit(void) -{ - unregister_netdevice_notifier(&nf_tables_netdev_notifier); - nft_unregister_chain_type(&nft_filter_chain_netdev); -} - -module_init(nf_tables_netdev_init); -module_exit(nf_tables_netdev_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Pablo Neira Ayuso "); -MODULE_ALIAS_NFT_CHAIN(5, "filter"); /* NFPROTO_NETDEV */ diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c new file mode 100644 index 000000000000..84c902477a91 --- /dev/null +++ b/net/netfilter/nft_chain_filter.c @@ -0,0 +1,398 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_NF_TABLES_IPV4 +static unsigned int nft_do_chain_ipv4(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv4(&pkt, skb); + + return nft_do_chain(&pkt, priv); +} + +static const struct nft_chain_type nft_chain_filter_ipv4 = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_IPV4, + .hook_mask = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, + [NF_INET_LOCAL_OUT] = nft_do_chain_ipv4, + [NF_INET_FORWARD] = nft_do_chain_ipv4, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, + }, +}; + +static void nft_chain_filter_ipv4_init(void) +{ + nft_register_chain_type(&nft_chain_filter_ipv4); +} +static void nft_chain_filter_ipv4_fini(void) +{ + nft_unregister_chain_type(&nft_chain_filter_ipv4); +} + +#else +static inline void nft_chain_filter_ipv4_init(void) {} +static inline void nft_chain_filter_ipv4_fini(void) {} +#endif /* CONFIG_NF_TABLES_IPV4 */ + +#ifdef CONFIG_NF_TABLES_ARP +static unsigned int nft_do_chain_arp(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_unspec(&pkt, skb); + + return nft_do_chain(&pkt, priv); +} + +static const struct nft_chain_type nft_chain_filter_arp = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_ARP, + .owner = THIS_MODULE, + .hook_mask = (1 << NF_ARP_IN) | + (1 << NF_ARP_OUT), + .hooks = { + [NF_ARP_IN] = nft_do_chain_arp, + [NF_ARP_OUT] = nft_do_chain_arp, + }, +}; + +static void nft_chain_filter_arp_init(void) +{ + nft_register_chain_type(&nft_chain_filter_arp); +} + +static void nft_chain_filter_arp_fini(void) +{ + nft_unregister_chain_type(&nft_chain_filter_arp); +} +#else +static inline void nft_chain_filter_arp_init(void) {} +static inline void nft_chain_filter_arp_fini(void) {} +#endif /* CONFIG_NF_TABLES_ARP */ + +#ifdef CONFIG_NF_TABLES_IPV6 +static unsigned int nft_do_chain_ipv6(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, skb, state); + nft_set_pktinfo_ipv6(&pkt, skb); + + return nft_do_chain(&pkt, priv); +} + +static const struct nft_chain_type nft_chain_filter_ipv6 = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_IPV6, + .hook_mask = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, + [NF_INET_LOCAL_OUT] = nft_do_chain_ipv6, + [NF_INET_FORWARD] = nft_do_chain_ipv6, + [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, + [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, + }, +}; + +static void nft_chain_filter_ipv6_init(void) +{ + nft_register_chain_type(&nft_chain_filter_ipv6); +} + +static void nft_chain_filter_ipv6_fini(void) +{ + nft_unregister_chain_type(&nft_chain_filter_ipv6); +} +#else +static inline void nft_chain_filter_ipv6_init(void) {} +static inline void nft_chain_filter_ipv6_fini(void) {} +#endif /* CONFIG_NF_TABLES_IPV6 */ + +#ifdef CONFIG_NF_TABLES_INET +static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, skb, state); + + switch (state->pf) { + case NFPROTO_IPV4: + nft_set_pktinfo_ipv4(&pkt, skb); + break; + case NFPROTO_IPV6: + nft_set_pktinfo_ipv6(&pkt, skb); + break; + default: + break; + } + + return nft_do_chain(&pkt, priv); +} + +static const struct nft_chain_type nft_chain_filter_inet = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_INET, + .hook_mask = (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_POST_ROUTING), + .hooks = { + [NF_INET_LOCAL_IN] = nft_do_chain_inet, + [NF_INET_LOCAL_OUT] = nft_do_chain_inet, + [NF_INET_FORWARD] = nft_do_chain_inet, + [NF_INET_PRE_ROUTING] = nft_do_chain_inet, + [NF_INET_POST_ROUTING] = nft_do_chain_inet, + }, +}; + +static void nft_chain_filter_inet_init(void) +{ + nft_register_chain_type(&nft_chain_filter_inet); +} + +static void nft_chain_filter_inet_fini(void) +{ + nft_unregister_chain_type(&nft_chain_filter_inet); +} +#else +static inline void nft_chain_filter_inet_init(void) {} +static inline void nft_chain_filter_inet_fini(void) {} +#endif /* CONFIG_NF_TABLES_IPV6 */ + +#ifdef CONFIG_NF_TABLES_BRIDGE +static unsigned int +nft_do_chain_bridge(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, skb, state); + + switch (eth_hdr(skb)->h_proto) { + case htons(ETH_P_IP): + nft_set_pktinfo_ipv4_validate(&pkt, skb); + break; + case htons(ETH_P_IPV6): + nft_set_pktinfo_ipv6_validate(&pkt, skb); + break; + default: + nft_set_pktinfo_unspec(&pkt, skb); + break; + } + + return nft_do_chain(&pkt, priv); +} + +static const struct nft_chain_type nft_chain_filter_bridge = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_BRIDGE, + .hook_mask = (1 << NF_BR_PRE_ROUTING) | + (1 << NF_BR_LOCAL_IN) | + (1 << NF_BR_FORWARD) | + (1 << NF_BR_LOCAL_OUT) | + (1 << NF_BR_POST_ROUTING), + .hooks = { + [NF_BR_PRE_ROUTING] = nft_do_chain_bridge, + [NF_BR_LOCAL_IN] = nft_do_chain_bridge, + [NF_BR_FORWARD] = nft_do_chain_bridge, + [NF_BR_LOCAL_OUT] = nft_do_chain_bridge, + [NF_BR_POST_ROUTING] = nft_do_chain_bridge, + }, +}; + +static void nft_chain_filter_bridge_init(void) +{ + nft_register_chain_type(&nft_chain_filter_bridge); +} + +static void nft_chain_filter_bridge_fini(void) +{ + nft_unregister_chain_type(&nft_chain_filter_bridge); +} +#else +static inline void nft_chain_filter_bridge_init(void) {} +static inline void nft_chain_filter_bridge_fini(void) {} +#endif /* CONFIG_NF_TABLES_BRIDGE */ + +#ifdef CONFIG_NF_TABLES_NETDEV +static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) +{ + struct nft_pktinfo pkt; + + nft_set_pktinfo(&pkt, skb, state); + + switch (skb->protocol) { + case htons(ETH_P_IP): + nft_set_pktinfo_ipv4_validate(&pkt, skb); + break; + case htons(ETH_P_IPV6): + nft_set_pktinfo_ipv6_validate(&pkt, skb); + break; + default: + nft_set_pktinfo_unspec(&pkt, skb); + break; + } + + return nft_do_chain(&pkt, priv); +} + +static const struct nft_chain_type nft_chain_filter_netdev = { + .name = "filter", + .type = NFT_CHAIN_T_DEFAULT, + .family = NFPROTO_NETDEV, + .hook_mask = (1 << NF_NETDEV_INGRESS), + .hooks = { + [NF_NETDEV_INGRESS] = nft_do_chain_netdev, + }, +}; + +static void nft_netdev_event(unsigned long event, struct net_device *dev, + struct nft_ctx *ctx) +{ + struct nft_base_chain *basechain = nft_base_chain(ctx->chain); + + switch (event) { + case NETDEV_UNREGISTER: + if (strcmp(basechain->dev_name, dev->name) != 0) + return; + + __nft_release_basechain(ctx); + break; + case NETDEV_CHANGENAME: + if (dev->ifindex != basechain->ops.dev->ifindex) + return; + + strncpy(basechain->dev_name, dev->name, IFNAMSIZ); + break; + } +} + +static int nf_tables_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nft_table *table; + struct nft_chain *chain, *nr; + struct nft_ctx ctx = { + .net = dev_net(dev), + }; + + if (event != NETDEV_UNREGISTER && + event != NETDEV_CHANGENAME) + return NOTIFY_DONE; + + nfnl_lock(NFNL_SUBSYS_NFTABLES); + list_for_each_entry(table, &ctx.net->nft.tables, list) { + if (table->family != NFPROTO_NETDEV) + continue; + + ctx.family = table->family; + ctx.table = table; + list_for_each_entry_safe(chain, nr, &table->chains, list) { + if (!nft_is_base_chain(chain)) + continue; + + ctx.chain = chain; + nft_netdev_event(event, dev, &ctx); + } + } + nfnl_unlock(NFNL_SUBSYS_NFTABLES); + + return NOTIFY_DONE; +} + +static struct notifier_block nf_tables_netdev_notifier = { + .notifier_call = nf_tables_netdev_event, +}; + +static int nft_chain_filter_netdev_init(void) +{ + int err; + + nft_register_chain_type(&nft_chain_filter_netdev); + + err = register_netdevice_notifier(&nf_tables_netdev_notifier); + if (err) + goto err_register_netdevice_notifier; + + return 0; + +err_register_netdevice_notifier: + nft_unregister_chain_type(&nft_chain_filter_netdev); + + return err; +} + +static void nft_chain_filter_netdev_fini(void) +{ + nft_unregister_chain_type(&nft_chain_filter_netdev); + unregister_netdevice_notifier(&nf_tables_netdev_notifier); +} +#else +static inline int nft_chain_filter_netdev_init(void) { return 0; } +static inline void nft_chain_filter_netdev_fini(void) {} +#endif /* CONFIG_NF_TABLES_NETDEV */ + +int __init nft_chain_filter_init(void) +{ + int err; + + err = nft_chain_filter_netdev_init(); + if (err < 0) + return err; + + nft_chain_filter_ipv4_init(); + nft_chain_filter_ipv6_init(); + nft_chain_filter_arp_init(); + nft_chain_filter_inet_init(); + nft_chain_filter_bridge_init(); + + return 0; +} + +void __exit nft_chain_filter_fini(void) +{ + nft_chain_filter_bridge_fini(); + nft_chain_filter_inet_fini(); + nft_chain_filter_arp_fini(); + nft_chain_filter_ipv6_fini(); + nft_chain_filter_ipv4_fini(); + nft_chain_filter_netdev_fini(); +} -- cgit v1.2.3-70-g09d2 From 43a605f2f722b6e08addedae8545b490fca252c4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 27 Mar 2018 11:53:08 +0200 Subject: netfilter: nf_tables: enable conntrack if NAT chain is registered Register conntrack hooks if the user adds NAT chains. Users get confused with the existing behaviour since they will see no packets hitting this chain until they add the first rule that refers to conntrack. This patch adds new ->init() and ->free() indirections to chain types that can be used by NAT chains to invoke the conntrack dependency. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++++ net/ipv4/netfilter/nft_chain_nat_ipv4.c | 12 ++++++++++++ net/ipv6/netfilter/nft_chain_nat_ipv6.c | 12 ++++++++++++ net/netfilter/nf_tables_api.c | 24 +++++++++++++++++------- 4 files changed, 45 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 77c3c04c27ac..e26b94a61a99 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -884,6 +884,8 @@ enum nft_chain_types { * @owner: module owner * @hook_mask: mask of valid hooks * @hooks: array of hook functions + * @init: chain initialization function + * @free: chain release function */ struct nft_chain_type { const char *name; @@ -892,6 +894,8 @@ struct nft_chain_type { struct module *owner; unsigned int hook_mask; nf_hookfn *hooks[NF_MAX_HOOKS]; + int (*init)(struct nft_ctx *ctx); + void (*free)(struct nft_ctx *ctx); }; int nft_chain_validate_dependency(const struct nft_chain *chain, diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c index 9864f5b3279c..b5464a3f253b 100644 --- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c +++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c @@ -67,6 +67,16 @@ static unsigned int nft_nat_ipv4_local_fn(void *priv, return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain); } +static int nft_nat_ipv4_init(struct nft_ctx *ctx) +{ + return nf_ct_netns_get(ctx->net, ctx->family); +} + +static void nft_nat_ipv4_free(struct nft_ctx *ctx) +{ + nf_ct_netns_put(ctx->net, ctx->family); +} + static const struct nft_chain_type nft_chain_nat_ipv4 = { .name = "nat", .type = NFT_CHAIN_T_NAT, @@ -82,6 +92,8 @@ static const struct nft_chain_type nft_chain_nat_ipv4 = { [NF_INET_LOCAL_OUT] = nft_nat_ipv4_local_fn, [NF_INET_LOCAL_IN] = nft_nat_ipv4_fn, }, + .init = nft_nat_ipv4_init, + .free = nft_nat_ipv4_free, }; static int __init nft_chain_nat_init(void) diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c index c95d9a97d425..3557b114446c 100644 --- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c +++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c @@ -65,6 +65,16 @@ static unsigned int nft_nat_ipv6_local_fn(void *priv, return nf_nat_ipv6_local_fn(priv, skb, state, nft_nat_do_chain); } +static int nft_nat_ipv6_init(struct nft_ctx *ctx) +{ + return nf_ct_netns_get(ctx->net, ctx->family); +} + +static void nft_nat_ipv6_free(struct nft_ctx *ctx) +{ + nf_ct_netns_put(ctx->net, ctx->family); +} + static const struct nft_chain_type nft_chain_nat_ipv6 = { .name = "nat", .type = NFT_CHAIN_T_NAT, @@ -80,6 +90,8 @@ static const struct nft_chain_type nft_chain_nat_ipv6 = { [NF_INET_LOCAL_OUT] = nft_nat_ipv6_local_fn, [NF_INET_LOCAL_IN] = nft_nat_ipv6_fn, }, + .init = nft_nat_ipv6_init, + .free = nft_nat_ipv6_free, }; static int __init nft_chain_nat_ipv6_init(void) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 97ec1c388bfe..af8b6a7488bd 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1211,13 +1211,17 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain, rcu_assign_pointer(chain->stats, newstats); } -static void nf_tables_chain_destroy(struct nft_chain *chain) +static void nf_tables_chain_destroy(struct nft_ctx *ctx) { + struct nft_chain *chain = ctx->chain; + BUG_ON(chain->use > 0); if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); + if (basechain->type->free) + basechain->type->free(ctx); module_put(basechain->type->owner); free_percpu(basechain->stats); if (basechain->stats) @@ -1354,6 +1358,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, } basechain->type = hook.type; + if (basechain->type->init) + basechain->type->init(ctx); + chain = &basechain->chain; ops = &basechain->ops; @@ -1374,6 +1381,8 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (chain == NULL) return -ENOMEM; } + ctx->chain = chain; + INIT_LIST_HEAD(&chain->rules); chain->handle = nf_tables_alloc_handle(table); chain->table = table; @@ -1387,7 +1396,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (err < 0) goto err1; - ctx->chain = chain; err = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN); if (err < 0) goto err2; @@ -1399,7 +1407,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, err2: nf_tables_unregister_hook(net, table, chain); err1: - nf_tables_chain_destroy(chain); + nf_tables_chain_destroy(ctx); return err; } @@ -5678,7 +5686,7 @@ static void nf_tables_commit_release(struct nft_trans *trans) nf_tables_table_destroy(&trans->ctx); break; case NFT_MSG_DELCHAIN: - nf_tables_chain_destroy(trans->ctx.chain); + nf_tables_chain_destroy(&trans->ctx); break; case NFT_MSG_DELRULE: nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); @@ -5849,7 +5857,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) nf_tables_table_destroy(&trans->ctx); break; case NFT_MSG_NEWCHAIN: - nf_tables_chain_destroy(trans->ctx.chain); + nf_tables_chain_destroy(&trans->ctx); break; case NFT_MSG_NEWRULE: nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); @@ -6499,7 +6507,7 @@ int __nft_release_basechain(struct nft_ctx *ctx) } list_del(&ctx->chain->list); ctx->table->use--; - nf_tables_chain_destroy(ctx->chain); + nf_tables_chain_destroy(ctx); return 0; } @@ -6515,6 +6523,7 @@ static void __nft_release_tables(struct net *net) struct nft_set *set, *ns; struct nft_ctx ctx = { .net = net, + .family = NFPROTO_NETDEV, }; list_for_each_entry_safe(table, nt, &net->nft.tables, list) { @@ -6551,9 +6560,10 @@ static void __nft_release_tables(struct net *net) nft_obj_destroy(obj); } list_for_each_entry_safe(chain, nc, &table->chains, list) { + ctx.chain = chain; list_del(&chain->list); table->use--; - nf_tables_chain_destroy(chain); + nf_tables_chain_destroy(&ctx); } list_del(&table->list); nf_tables_table_destroy(&ctx); -- cgit v1.2.3-70-g09d2 From 10659cbab72b7bfee1a886018d1915a9549b6378 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 28 Mar 2018 12:06:49 +0200 Subject: netfilter: nf_tables: rename to nft_set_lookup_global() To prepare shorter introduction of shorter function prefix. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 10 +++++----- net/netfilter/nf_tables_api.c | 12 ++++++------ net/netfilter/nft_dynset.c | 5 +++-- net/netfilter/nft_lookup.c | 4 ++-- net/netfilter/nft_objref.c | 5 +++-- 5 files changed, 19 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e26b94a61a99..bd2a18d66189 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -434,11 +434,11 @@ static inline struct nft_set *nft_set_container_of(const void *priv) return (void *)priv - offsetof(struct nft_set, data); } -struct nft_set *nft_set_lookup(const struct net *net, - const struct nft_table *table, - const struct nlattr *nla_set_name, - const struct nlattr *nla_set_id, - u8 genmask); +struct nft_set *nft_set_lookup_global(const struct net *net, + const struct nft_table *table, + const struct nlattr *nla_set_name, + const struct nlattr *nla_set_id, + u8 genmask); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index af8b6a7488bd..769d84015073 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2633,11 +2633,11 @@ static struct nft_set *nf_tables_set_lookup_byid(const struct net *net, return ERR_PTR(-ENOENT); } -struct nft_set *nft_set_lookup(const struct net *net, - const struct nft_table *table, - const struct nlattr *nla_set_name, - const struct nlattr *nla_set_id, - u8 genmask) +struct nft_set *nft_set_lookup_global(const struct net *net, + const struct nft_table *table, + const struct nlattr *nla_set_name, + const struct nlattr *nla_set_id, + u8 genmask) { struct nft_set *set; @@ -2650,7 +2650,7 @@ struct nft_set *nft_set_lookup(const struct net *net, } return set; } -EXPORT_SYMBOL_GPL(nft_set_lookup); +EXPORT_SYMBOL_GPL(nft_set_lookup_global); static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, const char *name) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index fc83e29d6634..04863fad05dd 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -132,8 +132,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, priv->invert = true; } - set = nft_set_lookup(ctx->net, ctx->table, tb[NFTA_DYNSET_SET_NAME], - tb[NFTA_DYNSET_SET_ID], genmask); + set = nft_set_lookup_global(ctx->net, ctx->table, + tb[NFTA_DYNSET_SET_NAME], + tb[NFTA_DYNSET_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 475570e89ede..f52da5e2199f 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -71,8 +71,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx, tb[NFTA_LOOKUP_SREG] == NULL) return -EINVAL; - set = nft_set_lookup(ctx->net, ctx->table, tb[NFTA_LOOKUP_SET], - tb[NFTA_LOOKUP_SET_ID], genmask); + set = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_LOOKUP_SET], + tb[NFTA_LOOKUP_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 7bcdc48f3d73..0b02407773ad 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -117,8 +117,9 @@ static int nft_objref_map_init(const struct nft_ctx *ctx, struct nft_set *set; int err; - set = nft_set_lookup(ctx->net, ctx->table, tb[NFTA_OBJREF_SET_NAME], - tb[NFTA_OBJREF_SET_ID], genmask); + set = nft_set_lookup_global(ctx->net, ctx->table, + tb[NFTA_OBJREF_SET_NAME], + tb[NFTA_OBJREF_SET_ID], genmask); if (IS_ERR(set)) return PTR_ERR(set); -- cgit v1.2.3-70-g09d2 From a3073c17dd8cd041d5cf68f28a80a54e310f2f45 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 28 Mar 2018 12:06:50 +0200 Subject: netfilter: nf_tables: use nft_set_lookup_global from nf_tables_newsetelem() Replace opencoded implementation of nft_set_lookup_global() by call to this function. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 769d84015073..2bd80fa9b070 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4032,17 +4032,10 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, if (err < 0) return err; - set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET], - genmask); - if (IS_ERR(set)) { - if (nla[NFTA_SET_ELEM_LIST_SET_ID]) { - set = nf_tables_set_lookup_byid(net, - nla[NFTA_SET_ELEM_LIST_SET_ID], - genmask); - } - if (IS_ERR(set)) - return PTR_ERR(set); - } + set = nft_set_lookup_global(net, ctx.table, nla[NFTA_SET_ELEM_LIST_SET], + nla[NFTA_SET_ELEM_LIST_SET_ID], genmask); + if (IS_ERR(set)) + return PTR_ERR(set); if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT) return -EBUSY; -- cgit v1.2.3-70-g09d2 From c47d36b3855d804b2e282f9b4eecbbd19b5453f9 Mon Sep 17 00:00:00 2001 From: Arushi Singhal Date: Thu, 29 Mar 2018 00:39:50 +0530 Subject: netfilter: Merge assignment with return Merge assignment with return statement to directly return the value. Signed-off-by: Arushi Singhal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 5 ++--- net/netfilter/xt_hashlimit.c | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 11ef85a57244..b00e84bf4107 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1527,9 +1527,8 @@ ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[]) if (ret < 0) return ret; - ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC, - cda[CTA_NAT_SRC]); - return ret; + return ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC, + cda[CTA_NAT_SRC]); #else if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC]) return 0; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index db2fe0911740..64fc3721d74c 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -534,8 +534,7 @@ static u64 user2rate_bytes(u32 user) u64 r; r = user ? U32_MAX / user : U32_MAX; - r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT; - return r; + return (r - 1) << XT_HASHLIMIT_BYTE_SHIFT; } static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, -- cgit v1.2.3-70-g09d2 From 9ba5c404bf1d6284f0269411b33394362b7ff405 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 29 Mar 2018 15:12:41 +0100 Subject: netfilter: x_tables: Add note about how to free percpu counters Due to the way percpu counters are allocated and freed in blocks, it is not safe to free counters individually. Currently all callers do the right thing, but let's note this restriction. Fixes: ae0ac0ed6fcf ("netfilter: x_tables: pack percpu counter allocations") Signed-off-by: Ben Hutchings Signed-off-by: Pablo Neira Ayuso --- net/netfilter/x_tables.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index bac932f1c582..75cd5196b29b 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1854,7 +1854,9 @@ EXPORT_SYMBOL_GPL(xt_proto_fini); * to fetch the real percpu counter. * * To speed up allocation and improve data locality, a 4kb block is - * allocated. + * allocated. Freeing any counter may free an entire block, so all + * counters allocated using the same state must be freed at the same + * time. * * xt_percpu_counter_alloc_state contains the base address of the * allocated page and the current sub-offset. -- cgit v1.2.3-70-g09d2 From e3b5e1ec75234fb6b27708a316cdf69f9fb176a8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 30 Mar 2018 11:39:12 +0200 Subject: Revert "netfilter: x_tables: ensure last rule in base chain matches underflow/policy" This reverts commit 0d7df906a0e78079a02108b06d32c3ef2238ad25. Valdis Kletnieks reported that xtables is broken in linux-next since 0d7df906a0e78 ("netfilter: x_tables: ensure last rule in base chain matches underflow/policy"), as kernel rejects the (well-formed) ruleset: [ 64.402790] ip6_tables: last base chain position 1136 doesn't match underflow 1344 (hook 1) mark_source_chains is not the correct place for such a check, as it terminates evaluation of a chain once it sees an unconditional verdict (following rules are known to be unreachable). It seems preferrable to fix libiptc instead, so remove this check again. Fixes: 0d7df906a0e78 ("netfilter: x_tables: ensure last rule in base chain matches underflow/policy") Reported-by: Valdis Kletnieks Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 17 +---------------- net/ipv4/netfilter/ip_tables.c | 17 +---------------- net/ipv6/netfilter/ip6_tables.c | 17 +---------------- 3 files changed, 3 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index f366ff1cfc19..aaafdbd15ad3 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -309,13 +309,10 @@ static int mark_source_chains(const struct xt_table_info *newinfo, for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct arpt_entry *e = entry0 + pos; - unsigned int last_pos, depth; if (!(valid_hooks & (1 << hook))) continue; - depth = 0; - last_pos = pos; /* Set initial back pointer. */ e->counters.pcnt = pos; @@ -346,8 +343,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo, pos = e->counters.pcnt; e->counters.pcnt = 0; - if (depth) - --depth; /* We're at the start. */ if (pos == oldpos) goto next; @@ -372,9 +367,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - - if (entry0 + newpos != arpt_next_entry(e)) - ++depth; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -385,15 +377,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo, e->counters.pcnt = pos; pos = newpos; } - if (depth == 0) - last_pos = pos; - } -next: - if (last_pos != newinfo->underflow[hook]) { - pr_err_ratelimited("last base chain position %u doesn't match underflow %u (hook %u)\n", - last_pos, newinfo->underflow[hook], hook); - return 0; } +next: ; } return 1; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 2362ca2c9e0c..f9063513f9d1 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -378,13 +378,10 @@ mark_source_chains(const struct xt_table_info *newinfo, for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ipt_entry *e = entry0 + pos; - unsigned int last_pos, depth; if (!(valid_hooks & (1 << hook))) continue; - depth = 0; - last_pos = pos; /* Set initial back pointer. */ e->counters.pcnt = pos; @@ -413,8 +410,6 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = e->counters.pcnt; e->counters.pcnt = 0; - if (depth) - --depth; /* We're at the start. */ if (pos == oldpos) goto next; @@ -439,9 +434,6 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - - if (entry0 + newpos != ipt_next_entry(e)) - ++depth; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -452,15 +444,8 @@ mark_source_chains(const struct xt_table_info *newinfo, e->counters.pcnt = pos; pos = newpos; } - if (depth == 0) - last_pos = pos; - } -next: - if (last_pos != newinfo->underflow[hook]) { - pr_err_ratelimited("last base chain position %u doesn't match underflow %u (hook %u)\n", - last_pos, newinfo->underflow[hook], hook); - return 0; } +next: ; } return 1; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 004508753abc..3c36a4c77f29 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -396,13 +396,10 @@ mark_source_chains(const struct xt_table_info *newinfo, for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ip6t_entry *e = entry0 + pos; - unsigned int last_pos, depth; if (!(valid_hooks & (1 << hook))) continue; - depth = 0; - last_pos = pos; /* Set initial back pointer. */ e->counters.pcnt = pos; @@ -431,8 +428,6 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = e->counters.pcnt; e->counters.pcnt = 0; - if (depth) - --depth; /* We're at the start. */ if (pos == oldpos) goto next; @@ -457,9 +452,6 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - - if (entry0 + newpos != ip6t_next_entry(e)) - ++depth; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -470,15 +462,8 @@ mark_source_chains(const struct xt_table_info *newinfo, e->counters.pcnt = pos; pos = newpos; } - if (depth == 0) - last_pos = pos; - } -next: - if (last_pos != newinfo->underflow[hook]) { - pr_err_ratelimited("last base chain position %u doesn't match underflow %u (hook %u)\n", - last_pos, newinfo->underflow[hook], hook); - return 0; } +next: ; } return 1; } -- cgit v1.2.3-70-g09d2 From 26c97c5d8dac6bc56d4360561a286f52543ac07e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 20 Mar 2018 10:35:47 -0700 Subject: netfilter: ipset: Use is_zero_ether_addr instead of static and memcmp To make the test a bit clearer and to reduce object size a little. Miscellanea: o remove now unnecessary static const array $ size ip_set_hash_mac.o* text data bss dec hex filename 22822 4619 64 27505 6b71 ip_set_hash_mac.o.allyesconfig.new 22932 4683 64 27679 6c1f ip_set_hash_mac.o.allyesconfig.old 10443 1040 0 11483 2cdb ip_set_hash_mac.o.defconfig.new 10507 1040 0 11547 2d1b ip_set_hash_mac.o.defconfig.old Signed-off-by: Joe Perches Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_mac.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c index 8f004edad396..f9d5a2a1e3d0 100644 --- a/net/netfilter/ipset/ip_set_hash_mac.c +++ b/net/netfilter/ipset/ip_set_hash_mac.c @@ -72,9 +72,6 @@ hash_mac4_data_next(struct hash_mac4_elem *next, #define IP_SET_PROTO_UNDEF #include "ip_set_hash_gen.h" -/* Zero valued element is not supported */ -static const unsigned char invalid_ether[ETH_ALEN] = { 0 }; - static int hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, @@ -93,7 +90,7 @@ hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb, return -EINVAL; ether_addr_copy(e.ether, eth_hdr(skb)->h_source); - if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) + if (is_zero_ether_addr(e.ether)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -118,7 +115,7 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; ether_addr_copy(e.ether, nla_data(tb[IPSET_ATTR_ETHER])); - if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0) + if (is_zero_ether_addr(e.ether)) return -IPSET_ERR_HASH_ELEM; return adtfn(set, &e, &ext, &ext, flags); -- cgit v1.2.3-70-g09d2 From 9daae9bd47cff82a2a06aca23c458d6c79d09d52 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 28 Mar 2018 17:46:54 +0300 Subject: net: Call add/kill vid ndo on vlan filter feature toggling NETIF_F_HW_VLAN_[CS]TAG_FILTER features require more than just a bit flip in dev->features in order to keep the driver in a consistent state. These features notify the driver of each added/removed vlan, but toggling of vlan-filter does not notify the driver accordingly for each of the existing vlans. This patch implements a similar solution to NETIF_F_RX_UDP_TUNNEL_PORT behavior (which notifies the driver about UDP ports in the same manner that vids are reported). Each toggling of the features propagates to the 8021q module, which iterates over the vlans and call add/kill ndo accordingly. Signed-off-by: Gal Pressman Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 24 +++++++++++ include/linux/netdevice.h | 4 ++ net/8021q/vlan.c | 21 ++++++++++ net/8021q/vlan.h | 3 ++ net/8021q/vlan_core.c | 101 ++++++++++++++++++++++++++++++++++------------ net/core/dev.c | 20 +++++++++ 6 files changed, 148 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c4a1cff9c768..24d1976c1e61 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -83,6 +83,30 @@ static inline bool is_vlan_dev(const struct net_device *dev) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #define skb_vlan_tag_get_prio(__skb) ((__skb)->vlan_tci & VLAN_PRIO_MASK) +static inline int vlan_get_rx_ctag_filter_info(struct net_device *dev) +{ + ASSERT_RTNL(); + return notifier_to_errno(call_netdevice_notifiers(NETDEV_CVLAN_FILTER_PUSH_INFO, dev)); +} + +static inline void vlan_drop_rx_ctag_filter_info(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_CVLAN_FILTER_DROP_INFO, dev); +} + +static inline int vlan_get_rx_stag_filter_info(struct net_device *dev) +{ + ASSERT_RTNL(); + return notifier_to_errno(call_netdevice_notifiers(NETDEV_SVLAN_FILTER_PUSH_INFO, dev)); +} + +static inline void vlan_drop_rx_stag_filter_info(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_SVLAN_FILTER_DROP_INFO, dev); +} + /** * struct vlan_pcpu_stats - VLAN percpu rx/tx stats * @rx_packets: number of received packets diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2a2d9cf50aa2..da44dab492e3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2349,6 +2349,10 @@ enum netdev_cmd { NETDEV_UDP_TUNNEL_PUSH_INFO, NETDEV_UDP_TUNNEL_DROP_INFO, NETDEV_CHANGE_TX_QUEUE_LEN, + NETDEV_CVLAN_FILTER_PUSH_INFO, + NETDEV_CVLAN_FILTER_DROP_INFO, + NETDEV_SVLAN_FILTER_PUSH_INFO, + NETDEV_SVLAN_FILTER_DROP_INFO, }; const char *netdev_cmd_to_name(enum netdev_cmd cmd); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index bad01b14a4ad..5505ee6ebdbe 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -360,6 +360,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, struct vlan_dev_priv *vlan; bool last = false; LIST_HEAD(list); + int err; if (is_vlan_dev(dev)) { int err = __vlan_device_event(dev, event); @@ -489,6 +490,26 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, vlan_group_for_each_dev(grp, i, vlandev) call_netdevice_notifiers(event, vlandev); break; + + case NETDEV_CVLAN_FILTER_PUSH_INFO: + err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q)); + if (err) + return notifier_from_errno(err); + break; + + case NETDEV_CVLAN_FILTER_DROP_INFO: + vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q)); + break; + + case NETDEV_SVLAN_FILTER_PUSH_INFO: + err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD)); + if (err) + return notifier_from_errno(err); + break; + + case NETDEV_SVLAN_FILTER_DROP_INFO: + vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD)); + break; } out: diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index a8ba51030b75..e23aac3e4d37 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -97,6 +97,9 @@ static inline struct net_device *vlan_find_dev(struct net_device *real_dev, if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \ (i) % VLAN_N_VID))) +int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto); +void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto); + /* found in vlan_dev.c */ void vlan_dev_set_ingress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio); diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 45c9bf5ff3a0..c8d7abdc0463 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -165,13 +165,12 @@ struct vlan_vid_info { int refcount; }; -static bool vlan_hw_filter_capable(const struct net_device *dev, - const struct vlan_vid_info *vid_info) +bool vlan_hw_filter_capable(const struct net_device *dev, __be16 proto) { - if (vid_info->proto == htons(ETH_P_8021Q) && + if (proto == htons(ETH_P_8021Q) && dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) return true; - if (vid_info->proto == htons(ETH_P_8021AD) && + if (proto == htons(ETH_P_8021AD) && dev->features & NETIF_F_HW_VLAN_STAG_FILTER) return true; return false; @@ -202,11 +201,73 @@ static struct vlan_vid_info *vlan_vid_info_alloc(__be16 proto, u16 vid) return vid_info; } +static int vlan_add_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid) +{ + if (!vlan_hw_filter_capable(dev, proto)) + return 0; + + if (netif_device_present(dev)) + return dev->netdev_ops->ndo_vlan_rx_add_vid(dev, proto, vid); + else + return -ENODEV; +} + +static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid) +{ + if (!vlan_hw_filter_capable(dev, proto)) + return 0; + + if (netif_device_present(dev)) + return dev->netdev_ops->ndo_vlan_rx_kill_vid(dev, proto, vid); + else + return -ENODEV; +} + +int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto) +{ + struct net_device *real_dev = vlan_info->real_dev; + struct vlan_vid_info *vlan_vid_info; + int err; + + list_for_each_entry(vlan_vid_info, &vlan_info->vid_list, list) { + if (vlan_vid_info->proto == proto) { + err = vlan_add_rx_filter_info(real_dev, proto, + vlan_vid_info->vid); + if (err) + goto unwind; + } + } + + return 0; + +unwind: + list_for_each_entry_continue_reverse(vlan_vid_info, + &vlan_info->vid_list, list) { + if (vlan_vid_info->proto == proto) + vlan_kill_rx_filter_info(real_dev, proto, + vlan_vid_info->vid); + } + + return err; +} +EXPORT_SYMBOL(vlan_filter_push_vids); + +void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto) +{ + struct vlan_vid_info *vlan_vid_info; + + list_for_each_entry(vlan_vid_info, &vlan_info->vid_list, list) + if (vlan_vid_info->proto == proto) + vlan_kill_rx_filter_info(vlan_info->real_dev, + vlan_vid_info->proto, + vlan_vid_info->vid); +} +EXPORT_SYMBOL(vlan_filter_drop_vids); + static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid, struct vlan_vid_info **pvid_info) { struct net_device *dev = vlan_info->real_dev; - const struct net_device_ops *ops = dev->netdev_ops; struct vlan_vid_info *vid_info; int err; @@ -214,16 +275,12 @@ static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid, if (!vid_info) return -ENOMEM; - if (vlan_hw_filter_capable(dev, vid_info)) { - if (netif_device_present(dev)) - err = ops->ndo_vlan_rx_add_vid(dev, proto, vid); - else - err = -ENODEV; - if (err) { - kfree(vid_info); - return err; - } + err = vlan_add_rx_filter_info(dev, proto, vid); + if (err) { + kfree(vid_info); + return err; } + list_add(&vid_info->list, &vlan_info->vid_list); vlan_info->nr_vids++; *pvid_info = vid_info; @@ -270,21 +327,15 @@ static void __vlan_vid_del(struct vlan_info *vlan_info, struct vlan_vid_info *vid_info) { struct net_device *dev = vlan_info->real_dev; - const struct net_device_ops *ops = dev->netdev_ops; __be16 proto = vid_info->proto; u16 vid = vid_info->vid; int err; - if (vlan_hw_filter_capable(dev, vid_info)) { - if (netif_device_present(dev)) - err = ops->ndo_vlan_rx_kill_vid(dev, proto, vid); - else - err = -ENODEV; - if (err) { - pr_warn("failed to kill vid %04x/%d for device %s\n", - proto, vid, dev->name); - } - } + err = vlan_kill_rx_filter_info(dev, proto, vid); + if (err) + pr_warn("failed to kill vid %04x/%d for device %s\n", + proto, vid, dev->name); + list_del(&vid_info->list); kfree(vid_info); vlan_info->nr_vids--; diff --git a/net/core/dev.c b/net/core/dev.c index eca5458b2753..1ddb6b9c58a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1584,6 +1584,8 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) + N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) + N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) }; #undef N return "UNKNOWN_NETDEV_EVENT"; @@ -7665,6 +7667,24 @@ sync_lower: } } + if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { + if (features & NETIF_F_HW_VLAN_CTAG_FILTER) { + dev->features = features; + err |= vlan_get_rx_ctag_filter_info(dev); + } else { + vlan_drop_rx_ctag_filter_info(dev); + } + } + + if (diff & NETIF_F_HW_VLAN_STAG_FILTER) { + if (features & NETIF_F_HW_VLAN_STAG_FILTER) { + dev->features = features; + err |= vlan_get_rx_stag_filter_info(dev); + } else { + vlan_drop_rx_stag_filter_info(dev); + } + } + dev->features = features; } -- cgit v1.2.3-70-g09d2 From e679c9c1dbfdba07b2a979a076cca74b773be8ce Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 28 Mar 2018 15:44:16 -0700 Subject: sfp/phylink: move module EEPROM ethtool access into netdev core ethtool Provide a pointer to the SFP bus in struct net_device, so that the ethtool module EEPROM methods can access the SFP directly, rather than needing every user to provide a hook for it. Reviewed-by: Andrew Lunn Signed-off-by: Russell King Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: Russell King Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 18 ------------------ drivers/net/phy/phylink.c | 28 ---------------------------- drivers/net/phy/sfp-bus.c | 6 ++---- include/linux/netdevice.h | 3 +++ include/linux/phylink.h | 3 --- net/core/ethtool.c | 7 +++++++ 6 files changed, 12 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index cd09bde55596..25ced96750bf 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -4075,22 +4075,6 @@ static int mvneta_ethtool_set_wol(struct net_device *dev, return ret; } -static int mvneta_ethtool_get_module_info(struct net_device *dev, - struct ethtool_modinfo *modinfo) -{ - struct mvneta_port *pp = netdev_priv(dev); - - return phylink_ethtool_get_module_info(pp->phylink, modinfo); -} - -static int mvneta_ethtool_get_module_eeprom(struct net_device *dev, - struct ethtool_eeprom *ee, u8 *buf) -{ - struct mvneta_port *pp = netdev_priv(dev); - - return phylink_ethtool_get_module_eeprom(pp->phylink, ee, buf); -} - static int mvneta_ethtool_get_eee(struct net_device *dev, struct ethtool_eee *eee) { @@ -4165,8 +4149,6 @@ static const struct ethtool_ops mvneta_eth_tool_ops = { .set_link_ksettings = mvneta_ethtool_set_link_ksettings, .get_wol = mvneta_ethtool_get_wol, .set_wol = mvneta_ethtool_set_wol, - .get_module_info = mvneta_ethtool_get_module_info, - .get_module_eeprom = mvneta_ethtool_get_module_eeprom, .get_eee = mvneta_ethtool_get_eee, .set_eee = mvneta_ethtool_set_eee, }; diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 9b1e4721ea3a..c582b2d7546c 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1250,34 +1250,6 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl, } EXPORT_SYMBOL_GPL(phylink_ethtool_set_pauseparam); -int phylink_ethtool_get_module_info(struct phylink *pl, - struct ethtool_modinfo *modinfo) -{ - int ret = -EOPNOTSUPP; - - WARN_ON(!lockdep_rtnl_is_held()); - - if (pl->sfp_bus) - ret = sfp_get_module_info(pl->sfp_bus, modinfo); - - return ret; -} -EXPORT_SYMBOL_GPL(phylink_ethtool_get_module_info); - -int phylink_ethtool_get_module_eeprom(struct phylink *pl, - struct ethtool_eeprom *ee, u8 *buf) -{ - int ret = -EOPNOTSUPP; - - WARN_ON(!lockdep_rtnl_is_held()); - - if (pl->sfp_bus) - ret = sfp_get_module_eeprom(pl->sfp_bus, ee, buf); - - return ret; -} -EXPORT_SYMBOL_GPL(phylink_ethtool_get_module_eeprom); - /** * phylink_ethtool_get_eee_err() - read the energy efficient ethernet error * counter diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index 3d4ff5d0d2a6..0381da78d228 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -342,6 +342,7 @@ static int sfp_register_bus(struct sfp_bus *bus) } if (bus->started) bus->socket_ops->start(bus->sfp); + bus->netdev->sfp_bus = bus; bus->registered = true; return 0; } @@ -356,6 +357,7 @@ static void sfp_unregister_bus(struct sfp_bus *bus) if (bus->phydev && ops && ops->disconnect_phy) ops->disconnect_phy(bus->upstream); } + bus->netdev->sfp_bus = NULL; bus->registered = false; } @@ -371,8 +373,6 @@ static void sfp_unregister_bus(struct sfp_bus *bus) */ int sfp_get_module_info(struct sfp_bus *bus, struct ethtool_modinfo *modinfo) { - if (!bus->registered) - return -ENOIOCTLCMD; return bus->socket_ops->module_info(bus->sfp, modinfo); } EXPORT_SYMBOL_GPL(sfp_get_module_info); @@ -391,8 +391,6 @@ EXPORT_SYMBOL_GPL(sfp_get_module_info); int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee, u8 *data) { - if (!bus->registered) - return -ENOIOCTLCMD; return bus->socket_ops->module_eeprom(bus->sfp, ee, data); } EXPORT_SYMBOL_GPL(sfp_get_module_eeprom); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index da44dab492e3..cf44503ea81a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -58,6 +58,7 @@ struct device; struct phy_device; struct dsa_port; +struct sfp_bus; /* 802.11 specific */ struct wireless_dev; /* 802.15.4 specific */ @@ -1662,6 +1663,7 @@ enum netdev_priv_flags { * @priomap: XXX: need comments on this one * @phydev: Physical device may attach itself * for hardware timestamping + * @sfp_bus: attached &struct sfp_bus structure. * * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount @@ -1945,6 +1947,7 @@ struct net_device { struct netprio_map __rcu *priomap; #endif struct phy_device *phydev; + struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; struct lock_class_key *qdisc_running_key; bool proto_down; diff --git a/include/linux/phylink.h b/include/linux/phylink.h index e95cc12030fa..50eeae025f1e 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -219,9 +219,6 @@ void phylink_ethtool_get_pauseparam(struct phylink *, struct ethtool_pauseparam *); int phylink_ethtool_set_pauseparam(struct phylink *, struct ethtool_pauseparam *); -int phylink_ethtool_get_module_info(struct phylink *, struct ethtool_modinfo *); -int phylink_ethtool_get_module_eeprom(struct phylink *, - struct ethtool_eeprom *, u8 *); int phylink_get_eee_err(struct phylink *); int phylink_ethtool_get_eee(struct phylink *, struct ethtool_eee *); int phylink_ethtool_set_eee(struct phylink *, struct ethtool_eee *); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index bb6e498c6e3d..eb55252ca1fb 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -2245,6 +2246,9 @@ static int __ethtool_get_module_info(struct net_device *dev, const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; + if (dev->sfp_bus) + return sfp_get_module_info(dev->sfp_bus, modinfo); + if (phydev && phydev->drv && phydev->drv->module_info) return phydev->drv->module_info(phydev, modinfo); @@ -2279,6 +2283,9 @@ static int __ethtool_get_module_eeprom(struct net_device *dev, const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; + if (dev->sfp_bus) + return sfp_get_module_eeprom(dev->sfp_bus, ee, data); + if (phydev && phydev->drv && phydev->drv->module_eeprom) return phydev->drv->module_eeprom(phydev, ee, data); -- cgit v1.2.3-70-g09d2 From e9a441b6e729e16092fcc18e3962b952a01d1e3c Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 17:03:25 +0300 Subject: xfrm: Register xfrm_dev_notifier in appropriate place Currently, driver registers it from pernet_operations::init method, and this breaks modularity, because initialization of net namespace and netdevice notifiers are orthogonal actions. We don't have per-namespace netdevice notifiers; all of them are global for all devices in all namespaces. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/net/xfrm.h | 2 +- net/xfrm/xfrm_device.c | 2 +- net/xfrm/xfrm_policy.c | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index aa027ba1d032..a872379b69da 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1894,7 +1894,7 @@ static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) #endif } -void __net_init xfrm_dev_init(void); +void __init xfrm_dev_init(void); #ifdef CONFIG_XFRM_OFFLOAD void xfrm_dev_resume(struct sk_buff *skb); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index e87d6c4dd5b6..175941e15a6e 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -350,7 +350,7 @@ static struct notifier_block xfrm_dev_notifier = { .notifier_call = xfrm_dev_event, }; -void __net_init xfrm_dev_init(void) +void __init xfrm_dev_init(void) { register_netdevice_notifier(&xfrm_dev_notifier); } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 0e065db6c7c0..40b54cc64243 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2895,8 +2895,6 @@ static int __net_init xfrm_policy_init(struct net *net) INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); - if (net_eq(net, &init_net)) - xfrm_dev_init(); return 0; out_bydst: @@ -2999,6 +2997,7 @@ void __init xfrm_init(void) INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn); register_pernet_subsys(&xfrm_net_ops); + xfrm_dev_init(); seqcount_init(&xfrm_policy_hash_generation); xfrm_input_init(); } -- cgit v1.2.3-70-g09d2 From 9e2f6c5d78db6647eb9d7bfeb20b9e0f9ff2c56c Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 17:03:35 +0300 Subject: netfilter: Rework xt_TEE netdevice notifier Register netdevice notifier for every iptable entry is not good, since this breaks modularity, and the hidden synchronization is based on rtnl_lock(). This patch reworks the synchronization via new lock, while the rest of logic remains as it was before. This is required for the next patch. Tested via: while :; do unshare -n iptables -t mangle -A OUTPUT -j TEE --gateway 1.1.1.2 --oif lo; done Signed-off-by: Kirill Tkhai Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netfilter/xt_TEE.c | 73 +++++++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 86b0580b2216..475957cfcf50 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -20,7 +20,7 @@ #include struct xt_tee_priv { - struct notifier_block notifier; + struct list_head list; struct xt_tee_tginfo *tginfo; int oif; }; @@ -51,29 +51,35 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) } #endif +static DEFINE_MUTEX(priv_list_mutex); +static LIST_HEAD(priv_list); + static int tee_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct xt_tee_priv *priv; - priv = container_of(this, struct xt_tee_priv, notifier); - switch (event) { - case NETDEV_REGISTER: - if (!strcmp(dev->name, priv->tginfo->oif)) - priv->oif = dev->ifindex; - break; - case NETDEV_UNREGISTER: - if (dev->ifindex == priv->oif) - priv->oif = -1; - break; - case NETDEV_CHANGENAME: - if (!strcmp(dev->name, priv->tginfo->oif)) - priv->oif = dev->ifindex; - else if (dev->ifindex == priv->oif) - priv->oif = -1; - break; + mutex_lock(&priv_list_mutex); + list_for_each_entry(priv, &priv_list, list) { + switch (event) { + case NETDEV_REGISTER: + if (!strcmp(dev->name, priv->tginfo->oif)) + priv->oif = dev->ifindex; + break; + case NETDEV_UNREGISTER: + if (dev->ifindex == priv->oif) + priv->oif = -1; + break; + case NETDEV_CHANGENAME: + if (!strcmp(dev->name, priv->tginfo->oif)) + priv->oif = dev->ifindex; + else if (dev->ifindex == priv->oif) + priv->oif = -1; + break; + } } + mutex_unlock(&priv_list_mutex); return NOTIFY_DONE; } @@ -89,8 +95,6 @@ static int tee_tg_check(const struct xt_tgchk_param *par) return -EINVAL; if (info->oif[0]) { - int ret; - if (info->oif[sizeof(info->oif)-1] != '\0') return -EINVAL; @@ -100,14 +104,11 @@ static int tee_tg_check(const struct xt_tgchk_param *par) priv->tginfo = info; priv->oif = -1; - priv->notifier.notifier_call = tee_netdev_event; info->priv = priv; - ret = register_netdevice_notifier(&priv->notifier); - if (ret) { - kfree(priv); - return ret; - } + mutex_lock(&priv_list_mutex); + list_add(&priv->list, &priv_list); + mutex_unlock(&priv_list_mutex); } else info->priv = NULL; @@ -120,7 +121,9 @@ static void tee_tg_destroy(const struct xt_tgdtor_param *par) struct xt_tee_tginfo *info = par->targinfo; if (info->priv) { - unregister_netdevice_notifier(&info->priv->notifier); + mutex_lock(&priv_list_mutex); + list_del(&info->priv->list); + mutex_unlock(&priv_list_mutex); kfree(info->priv); } static_key_slow_dec(&xt_tee_enabled); @@ -153,13 +156,29 @@ static struct xt_target tee_tg_reg[] __read_mostly = { #endif }; +static struct notifier_block tee_netdev_notifier = { + .notifier_call = tee_netdev_event, +}; + static int __init tee_tg_init(void) { - return xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); + int ret; + + ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); + if (ret) + return ret; + ret = register_netdevice_notifier(&tee_netdev_notifier); + if (ret) { + xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); + return ret; + } + + return 0; } static void __exit tee_tg_exit(void) { + unregister_netdevice_notifier(&tee_netdev_notifier); xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg)); } -- cgit v1.2.3-70-g09d2 From 328fbe747ad4622f0dfa98d2e19e836f03f80c04 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 17:03:45 +0300 Subject: net: Close race between {un, }register_netdevice_notifier() and setup_net()/cleanup_net() {un,}register_netdevice_notifier() iterate over all net namespaces hashed to net_namespace_list. But pernet_operations register and unregister netdevices in unhashed net namespace, and they are not seen for netdevice notifiers. This results in asymmetry: 1)Race with register_netdevice_notifier() pernet_operations::init(net) ... register_netdevice() ... call_netdevice_notifiers() ... ... nb is not called ... ... register_netdevice_notifier(nb) -> net skipped ... ... list_add_tail(&net->list, ..) ... Then, userspace stops using net, and it's destructed: pernet_operations::exit(net) unregister_netdevice() call_netdevice_notifiers() ... nb is called ... This always happens with net::loopback_dev, but it may be not the only device. 2)Race with unregister_netdevice_notifier() pernet_operations::init(net) register_netdevice() call_netdevice_notifiers() ... nb is called ... Then, userspace stops using net, and it's destructed: list_del_rcu(&net->list) ... pernet_operations::exit(net) unregister_netdevice_notifier(nb) -> net skipped dev_change_net_namespace() ... call_netdevice_notifiers() ... nb is not called ... unregister_netdevice() call_netdevice_notifiers() ... nb is not called ... This race is more danger, since dev_change_net_namespace() moves real network devices, which use not trivial netdevice notifiers, and if this will happen, the system will be left in unpredictable state. The patch closes the race. During the testing I found two places, where register_netdevice_notifier() is called from pernet init/exit methods (which led to deadlock) and fixed them (see previous patches). The review moved me to one more unusual registration place: raw_init() (can driver). It may be a reason of problems, if someone creates in-kernel CAN_RAW sockets, since they will be destroyed in exit method and raw_release() will call unregister_netdevice_notifier(). But grep over kernel tree does not show, someone creates such sockets from kernel space. Theoretically, there can be more places like this, and which are hidden from review, but we found them on the first bumping there (since there is no a race, it will be 100% reproducible). Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 1ddb6b9c58a8..07da7add4845 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1625,6 +1625,8 @@ int register_netdevice_notifier(struct notifier_block *nb) struct net *net; int err; + /* Close race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_register(&netdev_chain, nb); if (err) @@ -1649,6 +1651,7 @@ int register_netdevice_notifier(struct notifier_block *nb) unlock: rtnl_unlock(); + up_write(&pernet_ops_rwsem); return err; rollback: @@ -1694,6 +1697,8 @@ int unregister_netdevice_notifier(struct notifier_block *nb) struct net *net; int err; + /* Close race with setup_net() and cleanup_net() */ + down_write(&pernet_ops_rwsem); rtnl_lock(); err = raw_notifier_chain_unregister(&netdev_chain, nb); if (err) @@ -1713,6 +1718,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb) up_read(&net_rwsem); unlock: rtnl_unlock(); + up_write(&pernet_ops_rwsem); return err; } EXPORT_SYMBOL(unregister_netdevice_notifier); -- cgit v1.2.3-70-g09d2 From 428604fb118facce1309670779a35baf27ad044c Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 29 Mar 2018 11:02:24 +0200 Subject: ipv6: do not set routes if disable_ipv6 has been enabled Do not allow setting ipv6 routes from userspace if disable_ipv6 has been enabled. The issue can be triggered using the following reproducer: - sysctl net.ipv6.conf.all.disable_ipv6=1 - ip -6 route add a:b:c:d::/64 dev em1 - ip -6 route show a:b:c:d::/64 dev em1 metric 1024 pref medium Fix it checking disable_ipv6 value in ip6_route_info_create routine Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ba8d5df50ebe..e461ef1158b6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2917,6 +2917,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, if (!dev) goto out; + if (idev->cnf.disable_ipv6) { + NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); + err = -EACCES; + goto out; + } + if (!(dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; -- cgit v1.2.3-70-g09d2 From ace45bec6d77bc061c3c3d8ad99e298ea9800c2b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Mar 2018 21:04:43 +0100 Subject: rxrpc: Fix firewall route keepalive Fix the firewall route keepalive part of AF_RXRPC which is currently function incorrectly by replying to VERSION REPLY packets from the server with VERSION REQUEST packets. Instead, send VERSION REPLY packets to the peers of service connections to act as keep-alives 20s after the latest packet was transmitted to that peer. Also, just discard VERSION REPLY packets rather than replying to them. Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 4 +++ net/rxrpc/ar-internal.h | 14 +++++++- net/rxrpc/conn_event.c | 3 ++ net/rxrpc/input.c | 2 ++ net/rxrpc/net_ns.c | 21 ++++++++++- net/rxrpc/output.c | 59 +++++++++++++++++++++++++++++- net/rxrpc/peer_event.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/peer_object.c | 7 +++- net/rxrpc/rxkad.c | 2 ++ 9 files changed, 204 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index ec5ec68be1aa..0b3026b8fa40 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -762,6 +762,7 @@ static __poll_t rxrpc_poll(struct file *file, struct socket *sock, static int rxrpc_create(struct net *net, struct socket *sock, int protocol, int kern) { + struct rxrpc_net *rxnet; struct rxrpc_sock *rx; struct sock *sk; @@ -801,6 +802,9 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, rwlock_init(&rx->call_lock); memset(&rx->srx, 0, sizeof(rx->srx)); + rxnet = rxrpc_net(sock_net(&rx->sk)); + timer_reduce(&rxnet->peer_keepalive_timer, jiffies + 1); + _leave(" = 0 [%p]", rx); return 0; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 21cf164b6d85..8a348e0a9d95 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -97,8 +97,16 @@ struct rxrpc_net { struct list_head local_endpoints; struct mutex local_mutex; /* Lock for ->local_endpoints */ - spinlock_t peer_hash_lock; /* Lock for ->peer_hash */ DECLARE_HASHTABLE (peer_hash, 10); + spinlock_t peer_hash_lock; /* Lock for ->peer_hash */ + +#define RXRPC_KEEPALIVE_TIME 20 /* NAT keepalive time in seconds */ + u8 peer_keepalive_cursor; + ktime_t peer_keepalive_base; + struct hlist_head peer_keepalive[RXRPC_KEEPALIVE_TIME + 1]; + struct hlist_head peer_keepalive_new; + struct timer_list peer_keepalive_timer; + struct work_struct peer_keepalive_work; }; /* @@ -285,6 +293,8 @@ struct rxrpc_peer { struct hlist_head error_targets; /* targets for net error distribution */ struct work_struct error_distributor; struct rb_root service_conns; /* Service connections */ + struct hlist_node keepalive_link; /* Link in net->peer_keepalive[] */ + time64_t last_tx_at; /* Last time packet sent here */ seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ unsigned int if_mtu; /* interface MTU for this peer */ @@ -1026,6 +1036,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *, bool, rxrpc_serial_t *); int rxrpc_send_abort_packet(struct rxrpc_call *); int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool); void rxrpc_reject_packets(struct rxrpc_local *); +void rxrpc_send_keepalive(struct rxrpc_peer *); /* * peer_event.c @@ -1034,6 +1045,7 @@ void rxrpc_error_report(struct sock *); void rxrpc_peer_error_distributor(struct work_struct *); void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); +void rxrpc_peer_keepalive_worker(struct work_struct *); /* * peer_object.c diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index d2ec3fd593e8..c717152070df 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -136,6 +136,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, } kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len); + conn->params.peer->last_tx_at = ktime_get_real(); _leave(""); return; } @@ -239,6 +240,8 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, return -EAGAIN; } + conn->params.peer->last_tx_at = ktime_get_real(); + _leave(" = 0"); return 0; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 2a868fdab0ae..d4f2509e018b 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1183,6 +1183,8 @@ void rxrpc_data_ready(struct sock *udp_sk) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: + if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED)) + goto discard; rxrpc_post_packet_to_local(local, skb); goto out; diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index f18c9248e0d4..66baf2b80b6c 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -32,13 +32,22 @@ static void rxrpc_service_conn_reap_timeout(struct timer_list *timer) rxrpc_queue_work(&rxnet->service_conn_reaper); } +static void rxrpc_peer_keepalive_timeout(struct timer_list *timer) +{ + struct rxrpc_net *rxnet = + container_of(timer, struct rxrpc_net, peer_keepalive_timer); + + if (rxnet->live) + rxrpc_queue_work(&rxnet->peer_keepalive_work); +} + /* * Initialise a per-network namespace record. */ static __net_init int rxrpc_init_net(struct net *net) { struct rxrpc_net *rxnet = rxrpc_net(net); - int ret; + int ret, i; rxnet->live = true; get_random_bytes(&rxnet->epoch, sizeof(rxnet->epoch)); @@ -70,8 +79,16 @@ static __net_init int rxrpc_init_net(struct net *net) INIT_LIST_HEAD(&rxnet->local_endpoints); mutex_init(&rxnet->local_mutex); + hash_init(rxnet->peer_hash); spin_lock_init(&rxnet->peer_hash_lock); + for (i = 0; i < ARRAY_SIZE(rxnet->peer_keepalive); i++) + INIT_HLIST_HEAD(&rxnet->peer_keepalive[i]); + INIT_HLIST_HEAD(&rxnet->peer_keepalive_new); + timer_setup(&rxnet->peer_keepalive_timer, + rxrpc_peer_keepalive_timeout, 0); + INIT_WORK(&rxnet->peer_keepalive_work, rxrpc_peer_keepalive_worker); + rxnet->peer_keepalive_base = ktime_add(ktime_get_real(), NSEC_PER_SEC); ret = -ENOMEM; rxnet->proc_net = proc_net_mkdir(net, "rxrpc", net->proc_net); @@ -95,6 +112,8 @@ static __net_exit void rxrpc_exit_net(struct net *net) struct rxrpc_net *rxnet = rxrpc_net(net); rxnet->live = false; + del_timer_sync(&rxnet->peer_keepalive_timer); + cancel_work_sync(&rxnet->peer_keepalive_work); rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); rxrpc_destroy_all_locals(rxnet); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index cf73dc006c3b..7f1fc04775b3 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -32,6 +32,8 @@ struct rxrpc_abort_buffer { __be32 abort_code; }; +static const char rxrpc_keepalive_string[] = ""; + /* * Arrange for a keepalive ping a certain time after we last transmitted. This * lets the far side know we're still interested in this call and helps keep @@ -122,6 +124,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, struct kvec iov[2]; rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; + ktime_t now; size_t len, n; int ret; u8 reason; @@ -203,8 +206,10 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, } ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); + now = ktime_get_real(); if (ping) - call->ping_time = ktime_get_real(); + call->ping_time = now; + conn->params.peer->last_tx_at = ktime_get_real(); if (call->state < RXRPC_CALL_COMPLETE) { if (ret < 0) { @@ -288,6 +293,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, sizeof(pkt)); + conn->params.peer->last_tx_at = ktime_get_real(); rxrpc_put_connection(conn); return ret; @@ -378,6 +384,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, * message and update the peer record */ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); + conn->params.peer->last_tx_at = ktime_get_real(); up_read(&conn->params.local->defrag_sem); if (ret == -EMSGSIZE) @@ -429,6 +436,7 @@ send_fragmentable: if (ret == 0) { ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); + conn->params.peer->last_tx_at = ktime_get_real(); opt = IP_PMTUDISC_DO; kernel_setsockopt(conn->params.local->socket, SOL_IP, @@ -446,6 +454,7 @@ send_fragmentable: if (ret == 0) { ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); + conn->params.peer->last_tx_at = ktime_get_real(); opt = IPV6_PMTUDISC_DO; kernel_setsockopt(conn->params.local->socket, @@ -515,3 +524,51 @@ void rxrpc_reject_packets(struct rxrpc_local *local) _leave(""); } + +/* + * Send a VERSION reply to a peer as a keepalive. + */ +void rxrpc_send_keepalive(struct rxrpc_peer *peer) +{ + struct rxrpc_wire_header whdr; + struct msghdr msg; + struct kvec iov[2]; + size_t len; + int ret; + + _enter(""); + + msg.msg_name = &peer->srx.transport; + msg.msg_namelen = peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + whdr.epoch = htonl(peer->local->rxnet->epoch); + whdr.cid = 0; + whdr.callNumber = 0; + whdr.seq = 0; + whdr.serial = 0; + whdr.type = RXRPC_PACKET_TYPE_VERSION; /* Not client-initiated */ + whdr.flags = RXRPC_LAST_PACKET; + whdr.userStatus = 0; + whdr.securityIndex = 0; + whdr._rsvd = 0; + whdr.serviceId = 0; + + iov[0].iov_base = &whdr; + iov[0].iov_len = sizeof(whdr); + iov[1].iov_base = (char *)rxrpc_keepalive_string; + iov[1].iov_len = sizeof(rxrpc_keepalive_string); + + len = iov[0].iov_len + iov[1].iov_len; + + _proto("Tx VERSION (keepalive)"); + + ret = kernel_sendmsg(peer->local->socket, &msg, iov, 2, len); + if (ret < 0) + _debug("sendmsg failed: %d", ret); + + peer->last_tx_at = ktime_get_real(); + _leave(""); +} diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 7f749505e699..d01eb9a06448 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -348,3 +348,99 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt, usage, avg); } + +/* + * Perform keep-alive pings with VERSION packets to keep any NAT alive. + */ +void rxrpc_peer_keepalive_worker(struct work_struct *work) +{ + struct rxrpc_net *rxnet = + container_of(work, struct rxrpc_net, peer_keepalive_work); + struct rxrpc_peer *peer; + unsigned long delay; + ktime_t base, now = ktime_get_real(); + s64 diff; + u8 cursor, slot; + + base = rxnet->peer_keepalive_base; + cursor = rxnet->peer_keepalive_cursor; + + _enter("%u,%lld", cursor, ktime_sub(now, base)); + +next_bucket: + diff = ktime_to_ns(ktime_sub(now, base)); + if (diff < 0) + goto resched; + + _debug("at %u", cursor); + spin_lock_bh(&rxnet->peer_hash_lock); +next_peer: + if (!rxnet->live) { + spin_unlock_bh(&rxnet->peer_hash_lock); + goto out; + } + + /* Everything in the bucket at the cursor is processed this second; the + * bucket at cursor + 1 goes now + 1s and so on... + */ + if (hlist_empty(&rxnet->peer_keepalive[cursor])) { + if (hlist_empty(&rxnet->peer_keepalive_new)) { + spin_unlock_bh(&rxnet->peer_hash_lock); + goto emptied_bucket; + } + + hlist_move_list(&rxnet->peer_keepalive_new, + &rxnet->peer_keepalive[cursor]); + } + + peer = hlist_entry(rxnet->peer_keepalive[cursor].first, + struct rxrpc_peer, keepalive_link); + hlist_del_init(&peer->keepalive_link); + if (!rxrpc_get_peer_maybe(peer)) + goto next_peer; + + spin_unlock_bh(&rxnet->peer_hash_lock); + + _debug("peer %u {%pISp}", peer->debug_id, &peer->srx.transport); + +recalc: + diff = ktime_divns(ktime_sub(peer->last_tx_at, base), NSEC_PER_SEC); + if (diff < -30 || diff > 30) + goto send; /* LSW of 64-bit time probably wrapped on 32-bit */ + diff += RXRPC_KEEPALIVE_TIME - 1; + if (diff < 0) + goto send; + + slot = (diff > RXRPC_KEEPALIVE_TIME - 1) ? RXRPC_KEEPALIVE_TIME - 1 : diff; + if (slot == 0) + goto send; + + /* A transmission to this peer occurred since last we examined it so + * put it into the appropriate future bucket. + */ + slot = (slot + cursor) % ARRAY_SIZE(rxnet->peer_keepalive); + spin_lock_bh(&rxnet->peer_hash_lock); + hlist_add_head(&peer->keepalive_link, &rxnet->peer_keepalive[slot]); + rxrpc_put_peer(peer); + goto next_peer; + +send: + rxrpc_send_keepalive(peer); + now = ktime_get_real(); + goto recalc; + +emptied_bucket: + cursor++; + if (cursor >= ARRAY_SIZE(rxnet->peer_keepalive)) + cursor = 0; + base = ktime_add_ns(base, NSEC_PER_SEC); + goto next_bucket; + +resched: + rxnet->peer_keepalive_base = base; + rxnet->peer_keepalive_cursor = cursor; + delay = nsecs_to_jiffies(-diff) + 1; + timer_reduce(&rxnet->peer_keepalive_timer, jiffies + delay); +out: + _leave(""); +} diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index d02a99f37f5f..94a6dbfcf129 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -322,6 +322,7 @@ struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local, if (!peer) { peer = prealloc; hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); + hlist_add_head(&peer->keepalive_link, &rxnet->peer_keepalive_new); } spin_unlock(&rxnet->peer_hash_lock); @@ -363,9 +364,12 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); if (peer && !rxrpc_get_peer_maybe(peer)) peer = NULL; - if (!peer) + if (!peer) { hash_add_rcu(rxnet->peer_hash, &candidate->hash_link, hash_key); + hlist_add_head(&candidate->keepalive_link, + &rxnet->peer_keepalive_new); + } spin_unlock_bh(&rxnet->peer_hash_lock); @@ -392,6 +396,7 @@ void __rxrpc_put_peer(struct rxrpc_peer *peer) spin_lock_bh(&rxnet->peer_hash_lock); hash_del_rcu(&peer->hash_link); + hlist_del_init(&peer->keepalive_link); spin_unlock_bh(&rxnet->peer_hash_lock); kfree_rcu(peer, rcu); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 77cb23c7bd0a..588fea0dd362 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -668,6 +668,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) return -EAGAIN; } + conn->params.peer->last_tx_at = ktime_get_real(); _leave(" = 0"); return 0; } @@ -722,6 +723,7 @@ static int rxkad_send_response(struct rxrpc_connection *conn, return -EAGAIN; } + conn->params.peer->last_tx_at = ktime_get_real(); _leave(" = 0"); return 0; } -- cgit v1.2.3-70-g09d2 From f82eb88b0fa6943f58760fd7c3d1b12c1f9cf492 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Mar 2018 21:04:43 +0100 Subject: rxrpc: Fix a bit of time confusion The rxrpc_reduce_call_timer() function should be passed the 'current time' in jiffies, not the current ktime time. It's confusing in rxrpc_resend because that has to deal with both. Pass the correct current time in. Note that this only affects the trace produced and not the functioning of the code. Fixes: a158bdd3247b ("rxrpc: Fix call timeouts") Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 6a62e42e1d8d..3dee89c7a06e 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -238,7 +238,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) * retransmitting data. */ if (!retrans) { - rxrpc_reduce_call_timer(call, resend_at, now, + rxrpc_reduce_call_timer(call, resend_at, now_j, rxrpc_timer_set_for_resend); spin_unlock_bh(&call->lock); ack_ts = ktime_sub(now, call->acks_latest_ts); -- cgit v1.2.3-70-g09d2 From 03877bf6a30cca7d4bc3ffabd3c3e9464a7a1a19 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Mar 2018 21:04:43 +0100 Subject: rxrpc: Fix Tx ring annotation after initial Tx failure rxrpc calls have a ring of packets that are awaiting ACK or retransmission and a parallel ring of annotations that tracks the state of those packets. If the initial transmission of a packet on the underlying UDP socket fails then the packet annotation is marked for resend - but the setting of this mark accidentally erases the last-packet mark also stored in the same annotation slot. If this happens, a call won't switch out of the Tx phase when all the packets have been transmitted. Fix this by retaining the last-packet mark and only altering the packet state. Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code") Signed-off-by: David Howells --- net/rxrpc/sendmsg.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 8503f279b467..783c777fc6e7 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -130,7 +130,9 @@ static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix) spin_lock_bh(&call->lock); if (call->state < RXRPC_CALL_COMPLETE) { - call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS; + call->rxtx_annotations[ix] = + (call->rxtx_annotations[ix] & RXRPC_TX_ANNO_LAST) | + RXRPC_TX_ANNO_RETRANS; if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) rxrpc_queue_call(call); } -- cgit v1.2.3-70-g09d2 From 57b0c9d49b94bbeb53649b7fbd264603c1ebd585 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Mar 2018 21:04:44 +0100 Subject: rxrpc: Don't treat call aborts as conn aborts If a call-level abort is received for the previous call to complete on a connection channel, then that abort is queued for the connection processor to handle. Unfortunately, the connection processor then assumes without checking that the abort is connection-level (ie. callNumber is 0) and distributes it over all active calls on that connection, thereby incorrectly aborting them. Fix this by discarding aborts aimed at a completed call. Further, discard all packets aimed at a call that's complete if there's currently an active call on a channel, since the DATA packets associated with the new call automatically terminate the old call. Fixes: 18bfeba50dfd ("rxrpc: Perform terminal call ACK/ABORT retransmission from conn processor") Reported-by: Marc Dionne Signed-off-by: David Howells --- net/rxrpc/input.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index d4f2509e018b..21800e6f5019 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1242,16 +1242,19 @@ void rxrpc_data_ready(struct sock *udp_sk) goto discard_unlock; if (sp->hdr.callNumber == chan->last_call) { - /* For the previous service call, if completed successfully, we - * discard all further packets. + if (chan->call || + sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) + goto discard_unlock; + + /* For the previous service call, if completed + * successfully, we discard all further packets. */ if (rxrpc_conn_is_service(conn) && - (chan->last_type == RXRPC_PACKET_TYPE_ACK || - sp->hdr.type == RXRPC_PACKET_TYPE_ABORT)) + chan->last_type == RXRPC_PACKET_TYPE_ACK) goto discard_unlock; - /* But otherwise we need to retransmit the final packet from - * data cached in the connection record. + /* But otherwise we need to retransmit the final packet + * from data cached in the connection record. */ rxrpc_post_packet_to_conn(conn, skb); goto out_unlock; -- cgit v1.2.3-70-g09d2 From 59299aa1028fce051adbd25aaff7c387b865cd6d Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Fri, 30 Mar 2018 21:04:44 +0100 Subject: rxrpc: Fix resend event time calculation Commit a158bdd3 ("rxrpc: Fix call timeouts") reworked the time calculation for the next resend event. For this calculation, "oldest" will be before "now", so ktime_sub(oldest, now) will yield a negative value. When passed to nsecs_to_jiffies which expects an unsigned value, the end result will be a very large value, and a resend event scheduled far into the future. This could cause calls to stall if some packets were lost. Fix by ordering the arguments to ktime_sub correctly. Fixes: a158bdd3247b ("rxrpc: Fix call timeouts") Signed-off-by: Marc Dionne Signed-off-by: David Howells --- net/rxrpc/call_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 3dee89c7a06e..6e0d788b4dc4 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -226,7 +226,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) ktime_to_ns(ktime_sub(skb->tstamp, max_age))); } - resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(oldest, now))); + resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest))); resend_at += jiffies + rxrpc_resend_timeout; WRITE_ONCE(call->resend_at, resend_at); -- cgit v1.2.3-70-g09d2 From edb63e2b271752a9424a3d33cfcd4f434a020f9b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 30 Mar 2018 21:04:44 +0100 Subject: rxrpc: remove unused static variables The rxrpc_security_methods and rxrpc_security_sem user has been removed in 648af7fca159 ("rxrpc: Absorb the rxkad security module"). This was noticed by kbuild test robot for the -RT tree but is also true for !RT. Reported-by: kbuild test robot Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: David Howells --- net/rxrpc/security.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index e9f428351293..c4479afe8ae7 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -19,9 +19,6 @@ #include #include "ar-internal.h" -static LIST_HEAD(rxrpc_security_methods); -static DECLARE_RWSEM(rxrpc_security_sem); - static const struct rxrpc_security *rxrpc_security_types[] = { [RXRPC_SECURITY_NONE] = &rxrpc_no_security, #ifdef CONFIG_RXKAD -- cgit v1.2.3-70-g09d2 From 88f2a8257c9aa7df957b1a79a104f348d60d8027 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Mar 2018 21:05:17 +0100 Subject: rxrpc: Fix checker warnings and errors Fix various issues detected by checker. Errors: (*) rxrpc_discard_prealloc() should be using rcu_assign_pointer to set call->socket. Warnings: (*) rxrpc_service_connection_reaper() should be passing NULL rather than 0 to trace_rxrpc_conn() as the where argument. (*) rxrpc_disconnect_client_call() should get its net pointer via the call->conn rather than call->sock to avoid a warning about accessing an RCU pointer without protection. (*) Proc seq start/stop functions need annotation as they pass locks between the functions. False positives: (*) Checker doesn't correctly handle of seq-retry lock context balance in rxrpc_find_service_conn_rcu(). (*) Checker thinks execution may proceed past the BUG() in rxrpc_publish_service_conn(). (*) Variable length array warnings from SKCIPHER_REQUEST_ON_STACK() in rxkad.c. Signed-off-by: David Howells --- net/rxrpc/call_accept.c | 3 ++- net/rxrpc/call_object.c | 1 + net/rxrpc/conn_client.c | 2 +- net/rxrpc/conn_object.c | 2 +- net/rxrpc/proc.c | 6 ++++++ net/rxrpc/sendmsg.c | 2 ++ 6 files changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 92ebd1d7e0bb..4ce24c000653 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -225,7 +225,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) tail = b->call_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_call *call = b->call_backlog[tail]; - call->socket = rx; + rcu_assign_pointer(call->socket, rx); if (rx->discard_new_call) { _debug("discard %lx", call->user_call_ID); rx->discard_new_call(call, call->user_call_ID); @@ -456,6 +456,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx, unsigned long user_call_ID, rxrpc_notify_rx_t notify_rx) __releases(&rx->sk.sk_lock.slock) + __acquires(call->user_mutex) { struct rxrpc_call *call; struct rb_node *parent, **pp; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 147657dfe757..85b12c472522 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -219,6 +219,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, gfp_t gfp, unsigned int debug_id) __releases(&rx->sk.sk_lock.slock) + __acquires(&call->user_mutex) { struct rxrpc_call *call, *xcall; struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 064175068059..041da40dbf93 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -776,7 +776,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call) unsigned int channel = call->cid & RXRPC_CHANNELMASK; struct rxrpc_connection *conn = call->conn; struct rxrpc_channel *chan = &conn->channels[channel]; - struct rxrpc_net *rxnet = rxrpc_net(sock_net(&call->socket->sk)); + struct rxrpc_net *rxnet = conn->params.local->rxnet; trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect); call->conn = NULL; diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index ccbac190add1..bfc46fd69a62 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -418,7 +418,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) */ if (atomic_cmpxchg(&conn->usage, 1, 0) != 1) continue; - trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, 0); + trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, NULL); if (rxrpc_conn_is_client(conn)) BUG(); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index f79f260c6ddc..7e45db058823 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -29,6 +29,8 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = { * generate a list of extant and dead calls in /proc/net/rxrpc_calls */ static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos) + __acquires(rcu) + __acquires(rxnet->call_lock) { struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); @@ -45,6 +47,8 @@ static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) + __releases(rxnet->call_lock) + __releases(rcu) { struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); @@ -135,6 +139,7 @@ const struct file_operations rxrpc_call_seq_fops = { * generate a list of extant virtual connections in /proc/net/rxrpc_conns */ static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos) + __acquires(rxnet->conn_lock) { struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); @@ -151,6 +156,7 @@ static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v, } static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v) + __releases(rxnet->conn_lock) { struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 783c777fc6e7..a62980a80151 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -556,6 +556,7 @@ static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, struct rxrpc_send_params *p) __releases(&rx->sk.sk_lock.slock) + __acquires(&call->user_mutex) { struct rxrpc_conn_parameters cp; struct rxrpc_call *call; @@ -596,6 +597,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) __releases(&rx->sk.sk_lock.slock) + __releases(&call->user_mutex) { enum rxrpc_call_state state; struct rxrpc_call *call; -- cgit v1.2.3-70-g09d2 From d3be4d244330f7ef53242d8dc1b7f77d105e767f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Mar 2018 21:05:23 +0100 Subject: rxrpc: Fix potential call vs socket/net destruction race rxrpc_call structs don't pin sockets or network namespaces, but may attempt to access both after their refcount reaches 0 so that they can detach themselves from the network namespace. However, there's no guarantee that the socket still exists at this point (so sock_net(&call->socket->sk) may be invalid) and the namespace may have gone away if the call isn't pinning a peer. Fix this by (a) carrying a net pointer in the rxrpc_call struct and (b) waiting for all calls to be destroyed when the network namespace goes away. This was detected by checker: net/rxrpc/call_object.c:634:57: warning: incorrect type in argument 1 (different address spaces) net/rxrpc/call_object.c:634:57: expected struct sock const *sk net/rxrpc/call_object.c:634:57: got struct sock [noderef] * Fixes: 2baec2c3f854 ("rxrpc: Support network namespacing") Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 2 ++ net/rxrpc/call_accept.c | 1 + net/rxrpc/call_object.c | 16 +++++++++++++--- net/rxrpc/net_ns.c | 1 + 4 files changed, 17 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 8a348e0a9d95..2a2b0fdfb157 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -75,6 +75,7 @@ struct rxrpc_net { u32 epoch; /* Local epoch for detecting local-end reset */ struct list_head calls; /* List of calls active in this namespace */ rwlock_t call_lock; /* Lock for ->calls */ + atomic_t nr_calls; /* Count of allocated calls */ struct list_head conn_proc_list; /* List of conns in this namespace for proc */ struct list_head service_conns; /* Service conns in this namespace */ @@ -528,6 +529,7 @@ struct rxrpc_call { struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_peer *peer; /* Peer record for remote address */ struct rxrpc_sock __rcu *socket; /* socket responsible */ + struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ struct mutex user_mutex; /* User access mutex */ unsigned long ack_at; /* When deferred ACK needs to happen */ unsigned long ack_lost_at; /* When ACK is figured as lost */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 4ce24c000653..493545033e42 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -138,6 +138,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); + rxnet = call->rxnet; write_lock(&rxnet->call_lock); list_add_tail(&call->link, &rxnet->calls); write_unlock(&rxnet->call_lock); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 85b12c472522..f721c2b7e234 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -103,6 +103,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, unsigned int debug_id) { struct rxrpc_call *call; + struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); call = kmem_cache_zalloc(rxrpc_call_jar, gfp); if (!call) @@ -153,6 +154,9 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, call->cong_cwnd = 2; call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1; + + call->rxnet = rxnet; + atomic_inc(&rxnet->nr_calls); return call; nomem_2: @@ -222,7 +226,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, __acquires(&call->user_mutex) { struct rxrpc_call *call, *xcall; - struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); + struct rxrpc_net *rxnet; struct rb_node *parent, **pp; const void *here = __builtin_return_address(0); int ret; @@ -272,6 +276,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); + rxnet = call->rxnet; write_lock(&rxnet->call_lock); list_add_tail(&call->link, &rxnet->calls); write_unlock(&rxnet->call_lock); @@ -617,7 +622,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) */ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) { - struct rxrpc_net *rxnet; + struct rxrpc_net *rxnet = call->rxnet; const void *here = __builtin_return_address(0); int n; @@ -631,7 +636,6 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); if (!list_empty(&call->link)) { - rxnet = rxrpc_net(sock_net(&call->socket->sk)); write_lock(&rxnet->call_lock); list_del_init(&call->link); write_unlock(&rxnet->call_lock); @@ -647,11 +651,14 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) { struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); + struct rxrpc_net *rxnet = call->rxnet; rxrpc_put_peer(call->peer); kfree(call->rxtx_buffer); kfree(call->rxtx_annotations); kmem_cache_free(rxrpc_call_jar, call); + if (atomic_dec_and_test(&rxnet->nr_calls)) + wake_up_atomic_t(&rxnet->nr_calls); } /* @@ -716,4 +723,7 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) } write_unlock(&rxnet->call_lock); + + atomic_dec(&rxnet->nr_calls); + wait_on_atomic_t(&rxnet->nr_calls, atomic_t_wait, TASK_UNINTERRUPTIBLE); } diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 66baf2b80b6c..101019b0be34 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -55,6 +55,7 @@ static __net_init int rxrpc_init_net(struct net *net) INIT_LIST_HEAD(&rxnet->calls); rwlock_init(&rxnet->call_lock); + atomic_set(&rxnet->nr_calls, 1); INIT_LIST_HEAD(&rxnet->conn_proc_list); INIT_LIST_HEAD(&rxnet->service_conns); -- cgit v1.2.3-70-g09d2 From 09d2bf595db4b4075ea721acd61e180d6bb18f88 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Mar 2018 21:05:28 +0100 Subject: rxrpc: Add a tracepoint to track rxrpc_local refcounting Add a tracepoint to track reference counting on the rxrpc_local struct. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 43 +++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 27 +++--------------- net/rxrpc/call_accept.c | 3 +- net/rxrpc/local_object.c | 65 ++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 111 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 2ea788f6f95d..0410dfeb79c6 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -42,6 +42,14 @@ enum rxrpc_skb_trace { rxrpc_skb_tx_seen, }; +enum rxrpc_local_trace { + rxrpc_local_got, + rxrpc_local_new, + rxrpc_local_processing, + rxrpc_local_put, + rxrpc_local_queued, +}; + enum rxrpc_conn_trace { rxrpc_conn_got, rxrpc_conn_new_client, @@ -215,6 +223,13 @@ enum rxrpc_congest_change { EM(rxrpc_skb_tx_rotated, "Tx ROT") \ E_(rxrpc_skb_tx_seen, "Tx SEE") +#define rxrpc_local_traces \ + EM(rxrpc_local_got, "GOT") \ + EM(rxrpc_local_new, "NEW") \ + EM(rxrpc_local_processing, "PRO") \ + EM(rxrpc_local_put, "PUT") \ + E_(rxrpc_local_queued, "QUE") + #define rxrpc_conn_traces \ EM(rxrpc_conn_got, "GOT") \ EM(rxrpc_conn_new_client, "NWc") \ @@ -416,6 +431,7 @@ enum rxrpc_congest_change { #define E_(a, b) TRACE_DEFINE_ENUM(a); rxrpc_skb_traces; +rxrpc_local_traces; rxrpc_conn_traces; rxrpc_client_traces; rxrpc_call_traces; @@ -439,6 +455,33 @@ rxrpc_congest_changes; #define EM(a, b) { a, b }, #define E_(a, b) { a, b } +TRACE_EVENT(rxrpc_local, + TP_PROTO(struct rxrpc_local *local, enum rxrpc_local_trace op, + int usage, const void *where), + + TP_ARGS(local, op, usage, where), + + TP_STRUCT__entry( + __field(unsigned int, local ) + __field(int, op ) + __field(int, usage ) + __field(const void *, where ) + ), + + TP_fast_assign( + __entry->local = local->debug_id; + __entry->op = op; + __entry->usage = usage; + __entry->where = where; + ), + + TP_printk("L=%08x %s u=%d sp=%pSR", + __entry->local, + __print_symbolic(__entry->op, rxrpc_local_traces), + __entry->usage, + __entry->where) + ); + TRACE_EVENT(rxrpc_conn, TP_PROTO(struct rxrpc_connection *conn, enum rxrpc_conn_trace op, int usage, const void *where), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 2a2b0fdfb157..cc51d3eb0548 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -981,31 +981,12 @@ extern void rxrpc_process_local_events(struct rxrpc_local *); * local_object.c */ struct rxrpc_local *rxrpc_lookup_local(struct net *, const struct sockaddr_rxrpc *); -void __rxrpc_put_local(struct rxrpc_local *); +struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *); +struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *); +void rxrpc_put_local(struct rxrpc_local *); +void rxrpc_queue_local(struct rxrpc_local *); void rxrpc_destroy_all_locals(struct rxrpc_net *); -static inline void rxrpc_get_local(struct rxrpc_local *local) -{ - atomic_inc(&local->usage); -} - -static inline -struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) -{ - return atomic_inc_not_zero(&local->usage) ? local : NULL; -} - -static inline void rxrpc_put_local(struct rxrpc_local *local) -{ - if (local && atomic_dec_and_test(&local->usage)) - __rxrpc_put_local(local); -} - -static inline void rxrpc_queue_local(struct rxrpc_local *local) -{ - rxrpc_queue_work(&local->processor); -} - /* * misc.c */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 493545033e42..5a9b1d916124 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -296,8 +296,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, b->conn_backlog[conn_tail] = NULL; smp_store_release(&b->conn_backlog_tail, (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); - rxrpc_get_local(local); - conn->params.local = local; + conn->params.local = rxrpc_get_local(local); conn->params.peer = peer; rxrpc_see_connection(conn); rxrpc_new_incoming_connection(rx, conn, skb); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 38b99db30e54..8b54e9531d52 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -95,6 +95,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, local->debug_id = atomic_inc_return(&rxrpc_debug_id); memcpy(&local->srx, srx, sizeof(*srx)); local->srx.srx_service = 0; + trace_rxrpc_local(local, rxrpc_local_new, 1, NULL); } _leave(" = %p", local); @@ -256,15 +257,74 @@ addr_in_use: return ERR_PTR(-EADDRINUSE); } +/* + * Get a ref on a local endpoint. + */ +struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *local) +{ + const void *here = __builtin_return_address(0); + int n; + + n = atomic_inc_return(&local->usage); + trace_rxrpc_local(local, rxrpc_local_got, n, here); + return local; +} + +/* + * Get a ref on a local endpoint unless its usage has already reached 0. + */ +struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) +{ + const void *here = __builtin_return_address(0); + + if (local) { + int n = __atomic_add_unless(&local->usage, 1, 0); + if (n > 0) + trace_rxrpc_local(local, rxrpc_local_got, n + 1, here); + else + local = NULL; + } + return local; +} + +/* + * Queue a local endpoint. + */ +void rxrpc_queue_local(struct rxrpc_local *local) +{ + const void *here = __builtin_return_address(0); + + if (rxrpc_queue_work(&local->processor)) + trace_rxrpc_local(local, rxrpc_local_queued, + atomic_read(&local->usage), here); +} + /* * A local endpoint reached its end of life. */ -void __rxrpc_put_local(struct rxrpc_local *local) +static void __rxrpc_put_local(struct rxrpc_local *local) { _enter("%d", local->debug_id); rxrpc_queue_work(&local->processor); } +/* + * Drop a ref on a local endpoint. + */ +void rxrpc_put_local(struct rxrpc_local *local) +{ + const void *here = __builtin_return_address(0); + int n; + + if (local) { + n = atomic_dec_return(&local->usage); + trace_rxrpc_local(local, rxrpc_local_put, n, here); + + if (n == 0) + __rxrpc_put_local(local); + } +} + /* * Destroy a local endpoint's socket and then hand the record to RCU to dispose * of. @@ -322,7 +382,8 @@ static void rxrpc_local_processor(struct work_struct *work) container_of(work, struct rxrpc_local, processor); bool again; - _enter("%d", local->debug_id); + trace_rxrpc_local(local, rxrpc_local_processing, + atomic_read(&local->usage), NULL); do { again = false; -- cgit v1.2.3-70-g09d2 From 31f5f9a1691ebef2113c8bdb3edcb8859f30f702 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Mar 2018 21:05:33 +0100 Subject: rxrpc: Fix apparent leak of rxrpc_local objects rxrpc_local objects cannot be disposed of until all the connections that point to them have been RCU'd as a connection object holds refcount on the local endpoint it is communicating through. Currently, this can cause an assertion failure to occur when a network namespace is destroyed as there's no check that the RCU destructors for the connections have been run before we start trying to destroy local endpoints. The kernel reports: rxrpc: AF_RXRPC: Leaked local 0000000036a41bc1 {5} ------------[ cut here ]------------ kernel BUG at ../net/rxrpc/local_object.c:439! Fix this by keeping a count of the live connections and waiting for it to go to zero at the end of rxrpc_destroy_all_connections(). Fixes: dee46364ce6f ("rxrpc: Add RCU destruction for connections and calls") Signed-off-by: David Howells --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_accept.c | 2 ++ net/rxrpc/conn_client.c | 1 + net/rxrpc/conn_object.c | 8 ++++++++ net/rxrpc/conn_service.c | 1 + net/rxrpc/net_ns.c | 1 + 6 files changed, 14 insertions(+) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index cc51d3eb0548..d40d54b78567 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -77,6 +77,7 @@ struct rxrpc_net { rwlock_t call_lock; /* Lock for ->calls */ atomic_t nr_calls; /* Count of allocated calls */ + atomic_t nr_conns; struct list_head conn_proc_list; /* List of conns in this namespace for proc */ struct list_head service_conns; /* Service conns in this namespace */ rwlock_t conn_lock; /* Lock for ->conn_proc_list, ->service_conns */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 5a9b1d916124..f67017dcb25e 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -219,6 +219,8 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) list_del(&conn->proc_link); write_unlock(&rxnet->conn_lock); kfree(conn); + if (atomic_dec_and_test(&rxnet->nr_conns)) + wake_up_atomic_t(&rxnet->nr_conns); tail = (tail + 1) & (size - 1); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 041da40dbf93..5736f643c516 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -207,6 +207,7 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp) if (ret < 0) goto error_2; + atomic_inc(&rxnet->nr_conns); write_lock(&rxnet->conn_lock); list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); write_unlock(&rxnet->conn_lock); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index bfc46fd69a62..0950ee3d26f5 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -365,6 +365,9 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) key_put(conn->params.key); key_put(conn->server_key); rxrpc_put_peer(conn->params.peer); + + if (atomic_dec_and_test(&conn->params.local->rxnet->nr_conns)) + wake_up_atomic_t(&conn->params.local->rxnet->nr_conns); rxrpc_put_local(conn->params.local); kfree(conn); @@ -458,6 +461,7 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet) _enter(""); + atomic_dec(&rxnet->nr_conns); rxrpc_destroy_all_client_connections(rxnet); del_timer_sync(&rxnet->service_conn_reap_timer); @@ -475,5 +479,9 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet) ASSERT(list_empty(&rxnet->conn_proc_list)); + /* We need to wait for the connections to be destroyed by RCU as they + * pin things that we still need to get rid of. + */ + wait_on_atomic_t(&rxnet->nr_conns, atomic_t_wait, TASK_UNINTERRUPTIBLE); _leave(""); } diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index f6fcdb3130a1..80773a50c755 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -132,6 +132,7 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn conn->state = RXRPC_CONN_SERVICE_PREALLOC; atomic_set(&conn->usage, 2); + atomic_inc(&rxnet->nr_conns); write_lock(&rxnet->conn_lock); list_add_tail(&conn->link, &rxnet->service_conns); list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 101019b0be34..fa9ce60e7bfa 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -57,6 +57,7 @@ static __net_init int rxrpc_init_net(struct net *net) rwlock_init(&rxnet->call_lock); atomic_set(&rxnet->nr_calls, 1); + atomic_set(&rxnet->nr_conns, 1); INIT_LIST_HEAD(&rxnet->conn_proc_list); INIT_LIST_HEAD(&rxnet->service_conns); rwlock_init(&rxnet->conn_lock); -- cgit v1.2.3-70-g09d2 From 1159d4b496f57d5b8ee27c8b90b9d01c332e2e11 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Mar 2018 21:05:38 +0100 Subject: rxrpc: Add a tracepoint to track rxrpc_peer refcounting Add a tracepoint to track reference counting on the rxrpc_peer struct. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 42 ++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 23 +++------------- net/rxrpc/peer_event.c | 2 +- net/rxrpc/peer_object.c | 65 ++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 110 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 0410dfeb79c6..9e96c2fe2793 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -50,6 +50,14 @@ enum rxrpc_local_trace { rxrpc_local_queued, }; +enum rxrpc_peer_trace { + rxrpc_peer_got, + rxrpc_peer_new, + rxrpc_peer_processing, + rxrpc_peer_put, + rxrpc_peer_queued_error, +}; + enum rxrpc_conn_trace { rxrpc_conn_got, rxrpc_conn_new_client, @@ -230,6 +238,13 @@ enum rxrpc_congest_change { EM(rxrpc_local_put, "PUT") \ E_(rxrpc_local_queued, "QUE") +#define rxrpc_peer_traces \ + EM(rxrpc_peer_got, "GOT") \ + EM(rxrpc_peer_new, "NEW") \ + EM(rxrpc_peer_processing, "PRO") \ + EM(rxrpc_peer_put, "PUT") \ + E_(rxrpc_peer_queued_error, "QER") + #define rxrpc_conn_traces \ EM(rxrpc_conn_got, "GOT") \ EM(rxrpc_conn_new_client, "NWc") \ @@ -482,6 +497,33 @@ TRACE_EVENT(rxrpc_local, __entry->where) ); +TRACE_EVENT(rxrpc_peer, + TP_PROTO(struct rxrpc_peer *peer, enum rxrpc_peer_trace op, + int usage, const void *where), + + TP_ARGS(peer, op, usage, where), + + TP_STRUCT__entry( + __field(unsigned int, peer ) + __field(int, op ) + __field(int, usage ) + __field(const void *, where ) + ), + + TP_fast_assign( + __entry->peer = peer->debug_id; + __entry->op = op; + __entry->usage = usage; + __entry->where = where; + ), + + TP_printk("P=%08x %s u=%d sp=%pSR", + __entry->peer, + __print_symbolic(__entry->op, rxrpc_peer_traces), + __entry->usage, + __entry->where) + ); + TRACE_EVENT(rxrpc_conn, TP_PROTO(struct rxrpc_connection *conn, enum rxrpc_conn_trace op, int usage, const void *where), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index d40d54b78567..c46583bc255d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1041,25 +1041,10 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *, struct rxrpc_peer *); - -static inline struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) -{ - atomic_inc(&peer->usage); - return peer; -} - -static inline -struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) -{ - return atomic_inc_not_zero(&peer->usage) ? peer : NULL; -} - -extern void __rxrpc_put_peer(struct rxrpc_peer *peer); -static inline void rxrpc_put_peer(struct rxrpc_peer *peer) -{ - if (peer && atomic_dec_and_test(&peer->usage)) - __rxrpc_put_peer(peer); -} +struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *); +struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *); +void rxrpc_put_peer(struct rxrpc_peer *); +void __rxrpc_queue_peer_error(struct rxrpc_peer *); /* * proc.c diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index d01eb9a06448..78c2f95d1f22 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -192,7 +192,7 @@ void rxrpc_error_report(struct sock *sk) rxrpc_free_skb(skb, rxrpc_skb_rx_freed); /* The ref we obtained is passed off to the work item */ - rxrpc_queue_work(&peer->error_distributor); + __rxrpc_queue_peer_error(peer); _leave(""); } diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 94a6dbfcf129..a4a750aea1e5 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -386,9 +386,54 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, } /* - * Discard a ref on a remote peer record. + * Get a ref on a peer record. */ -void __rxrpc_put_peer(struct rxrpc_peer *peer) +struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) +{ + const void *here = __builtin_return_address(0); + int n; + + n = atomic_inc_return(&peer->usage); + trace_rxrpc_peer(peer, rxrpc_peer_got, n, here); + return peer; +} + +/* + * Get a ref on a peer record unless its usage has already reached 0. + */ +struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) +{ + const void *here = __builtin_return_address(0); + + if (peer) { + int n = __atomic_add_unless(&peer->usage, 1, 0); + if (n > 0) + trace_rxrpc_peer(peer, rxrpc_peer_got, n + 1, here); + else + peer = NULL; + } + return peer; +} + +/* + * Queue a peer record. This passes the caller's ref to the workqueue. + */ +void __rxrpc_queue_peer_error(struct rxrpc_peer *peer) +{ + const void *here = __builtin_return_address(0); + int n; + + n = atomic_read(&peer->usage); + if (rxrpc_queue_work(&peer->error_distributor)) + trace_rxrpc_peer(peer, rxrpc_peer_queued_error, n, here); + else + rxrpc_put_peer(peer); +} + +/* + * Discard a peer record. + */ +static void __rxrpc_put_peer(struct rxrpc_peer *peer) { struct rxrpc_net *rxnet = peer->local->rxnet; @@ -402,6 +447,22 @@ void __rxrpc_put_peer(struct rxrpc_peer *peer) kfree_rcu(peer, rcu); } +/* + * Drop a ref on a peer record. + */ +void rxrpc_put_peer(struct rxrpc_peer *peer) +{ + const void *here = __builtin_return_address(0); + int n; + + if (peer) { + n = atomic_dec_return(&peer->usage); + trace_rxrpc_peer(peer, rxrpc_peer_put, n, here); + if (n == 0) + __rxrpc_put_peer(peer); + } +} + /** * rxrpc_kernel_get_peer - Get the peer address of a call * @sock: The socket on which the call is in progress. -- cgit v1.2.3-70-g09d2 From 17226f1240381812c3a4927dc9da2814fb71c8ac Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Mar 2018 21:05:44 +0100 Subject: rxrpc: Fix leak of rxrpc_peer objects When a new client call is requested, an rxrpc_conn_parameters struct object is passed in with a bunch of parameters set, such as the local endpoint to use. A pointer to the target peer record is also placed in there by rxrpc_get_client_conn() - and this is removed if and only if a new connection object is allocated. Thus it leaks if a new connection object isn't allocated. Fix this by putting any peer object attached to the rxrpc_conn_parameters object in the function that allocated it. Fixes: 19ffa01c9c45 ("rxrpc: Use structs to hold connection params and protocol info") Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 2 ++ net/rxrpc/ar-internal.h | 1 + net/rxrpc/net_ns.c | 1 + net/rxrpc/peer_object.c | 21 +++++++++++++++++++++ net/rxrpc/sendmsg.c | 1 + 5 files changed, 26 insertions(+) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 0b3026b8fa40..9a2c8e7c000e 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -324,6 +324,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, mutex_unlock(&call->user_mutex); } + rxrpc_put_peer(cp.peer); _leave(" = %p", call); return call; } @@ -447,6 +448,7 @@ int rxrpc_kernel_retry_call(struct socket *sock, struct rxrpc_call *call, ret = rxrpc_retry_client_call(rx, call, &cp, srx, GFP_KERNEL); mutex_unlock(&call->user_mutex); + rxrpc_put_peer(cp.peer); _leave(" = %d", ret); return ret; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c46583bc255d..90d7079e0aa9 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1041,6 +1041,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *, struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *, struct rxrpc_peer *); +void rxrpc_destroy_all_peers(struct rxrpc_net *); struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *); struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *); void rxrpc_put_peer(struct rxrpc_peer *); diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index fa9ce60e7bfa..c7a023fb22d0 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -118,6 +118,7 @@ static __net_exit void rxrpc_exit_net(struct net *net) cancel_work_sync(&rxnet->peer_keepalive_work); rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); + rxrpc_destroy_all_peers(rxnet); rxrpc_destroy_all_locals(rxnet); proc_remove(rxnet->proc_net); } diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index a4a750aea1e5..1b7e8107b3ae 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -463,6 +463,27 @@ void rxrpc_put_peer(struct rxrpc_peer *peer) } } +/* + * Make sure all peer records have been discarded. + */ +void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) +{ + struct rxrpc_peer *peer; + int i; + + for (i = 0; i < HASH_SIZE(rxnet->peer_hash); i++) { + if (hlist_empty(&rxnet->peer_hash[i])) + continue; + + hlist_for_each_entry(peer, &rxnet->peer_hash[i], hash_link) { + pr_err("Leaked peer %u {%u} %pISp\n", + peer->debug_id, + atomic_read(&peer->usage), + &peer->srx.transport); + } + } +} + /** * rxrpc_kernel_get_peer - Get the peer address of a call * @sock: The socket on which the call is in progress. diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index a62980a80151..206e802ccbdc 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -586,6 +586,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, atomic_inc_return(&rxrpc_debug_id)); /* The socket is now unlocked */ + rxrpc_put_peer(cp.peer); _leave(" = %p\n", call); return call; } -- cgit v1.2.3-70-g09d2 From 5e43f899b03a3492ce5fc44e8900becb04dae9c0 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:00 -0700 Subject: bpf: Check attach type at prog load time == The problem == There are use-cases when a program of some type can be attached to multiple attach points and those attach points must have different permissions to access context or to call helpers. E.g. context structure may have fields for both IPv4 and IPv6 but it doesn't make sense to read from / write to IPv6 field when attach point is somewhere in IPv4 stack. Same applies to BPF-helpers: it may make sense to call some helper from some attach point, but not from other for same prog type. == The solution == Introduce `expected_attach_type` field in in `struct bpf_attr` for `BPF_PROG_LOAD` command. If scenario described in "The problem" section is the case for some prog type, the field will be checked twice: 1) At load time prog type is checked to see if attach type for it must be known to validate program permissions correctly. Prog will be rejected with EINVAL if it's the case and `expected_attach_type` is not specified or has invalid value. 2) At attach time `attach_type` is compared with `expected_attach_type`, if prog type requires to have one, and, if they differ, attach will be rejected with EINVAL. The `expected_attach_type` is now available as part of `struct bpf_prog` in both `bpf_verifier_ops->is_valid_access()` and `bpf_verifier_ops->get_func_proto()` () and can be used to check context accesses and calls to helpers correspondingly. Initially the idea was discussed by Alexei Starovoitov and Daniel Borkmann here: https://marc.info/?l=linux-netdev&m=152107378717201&w=2 Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 5 ++++- include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 5 +++++ kernel/bpf/cgroup.c | 3 ++- kernel/bpf/syscall.c | 31 ++++++++++++++++++++++++++++++- kernel/bpf/verifier.c | 6 +++--- kernel/trace/bpf_trace.c | 27 ++++++++++++++++++--------- net/core/filter.c | 39 +++++++++++++++++++++++++-------------- 8 files changed, 88 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 819229c80eca..95a7abd0ee92 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -208,12 +208,15 @@ struct bpf_prog_ops { struct bpf_verifier_ops { /* return eBPF function prototype for verification */ - const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); + const struct bpf_func_proto * + (*get_func_proto)(enum bpf_func_id func_id, + const struct bpf_prog *prog); /* return true if 'size' wide access at offset 'off' within bpf_context * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); diff --git a/include/linux/filter.h b/include/linux/filter.h index 897ff3d95968..13c044e4832d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -469,6 +469,7 @@ struct bpf_prog { is_func:1, /* program is a bpf function */ kprobe_override:1; /* Do we override a kprobe? */ enum bpf_prog_type type; /* Type of BPF program */ + enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ u8 tag[BPF_TAG_SIZE]; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1878201c2d77..102718624d1e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -296,6 +296,11 @@ union bpf_attr { __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc). + */ + __u32 expected_attach_type; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c1c0b60d3f2f..8730b24ed540 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -545,7 +545,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id) +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -566,6 +566,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id) static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 95ca2523fa6e..9d3b572d4dec 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1171,8 +1171,27 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); +static int +bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, + enum bpf_attach_type expected_attach_type) +{ + /* There are currently no prog types that require specifying + * attach_type at load time. + */ + return 0; +} + +static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, + enum bpf_attach_type attach_type) +{ + /* There are currently no prog types that require specifying + * attach_type at load time. + */ + return 0; +} + /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex +#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type static int bpf_prog_load(union bpf_attr *attr) { @@ -1209,11 +1228,16 @@ static int bpf_prog_load(union bpf_attr *attr) !capable(CAP_SYS_ADMIN)) return -EPERM; + if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) + return -EINVAL; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) return -ENOMEM; + prog->expected_attach_type = attr->expected_attach_type; + prog->aux->offload_requested = !!attr->prog_ifindex; err = security_bpf_prog_alloc(prog->aux); @@ -1474,6 +1498,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); + if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { + bpf_prog_put(prog); + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) { bpf_prog_put(prog); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8acd2207e412..10024323031d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1323,7 +1323,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, }; if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, &info)) { + env->ops->is_valid_access(off, size, t, env->prog, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -2349,7 +2349,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } if (env->ops->get_func_proto) - fn = env->ops->get_func_proto(func_id); + fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), func_id); @@ -5572,7 +5572,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = env->ops->get_func_proto(insn->imm); + fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 463e72d18c4c..d88e96d4e12c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -524,7 +524,8 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -568,7 +569,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) } } -static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -582,12 +584,13 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_override_return_proto; #endif default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) @@ -661,7 +664,8 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -669,11 +673,12 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) @@ -721,7 +726,8 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { .arg3_type = ARG_CONST_SIZE, }; -static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -731,7 +737,7 @@ static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } @@ -781,7 +787,8 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *raw_tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -789,12 +796,13 @@ static const struct bpf_func_proto *raw_tp_prog_func_proto(enum bpf_func_id func case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_raw_tp; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } static bool raw_tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { /* largest tracepoint in the kernel has 12 args */ @@ -816,6 +824,7 @@ const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_u64 = sizeof(u64); diff --git a/net/core/filter.c b/net/core/filter.c index e989bf313195..7790fd128614 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3685,7 +3685,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -sock_filter_func_proto(enum bpf_func_id func_id) +sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { /* inet and inet6 sockets are created in a process @@ -3699,7 +3699,7 @@ sock_filter_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -sk_filter_func_proto(enum bpf_func_id func_id) +sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: @@ -3714,7 +3714,7 @@ sk_filter_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -tc_cls_act_func_proto(enum bpf_func_id func_id) +tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: @@ -3781,7 +3781,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -xdp_func_proto(enum bpf_func_id func_id) +xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -3804,7 +3804,7 @@ xdp_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -lwt_inout_func_proto(enum bpf_func_id func_id) +lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: @@ -3831,7 +3831,7 @@ lwt_inout_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * - sock_ops_func_proto(enum bpf_func_id func_id) +sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: @@ -3847,7 +3847,8 @@ static const struct bpf_func_proto * } } -static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_msg_redirect_map: @@ -3863,7 +3864,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) } } -static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: @@ -3888,7 +3890,7 @@ static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -lwt_xmit_func_proto(enum bpf_func_id func_id) +lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_get_tunnel_key: @@ -3918,11 +3920,12 @@ lwt_xmit_func_proto(enum bpf_func_id func_id) case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; default: - return lwt_inout_func_proto(func_id); + return lwt_inout_func_proto(func_id, prog); } } static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); @@ -3966,6 +3969,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -3986,11 +3990,12 @@ static bool sk_filter_is_valid_access(int off, int size, } } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -4020,11 +4025,12 @@ static bool lwt_is_valid_access(int off, int size, break; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { @@ -4096,6 +4102,7 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { @@ -4125,7 +4132,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, return false; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool __is_valid_xdp_access(int off, int size) @@ -4142,6 +4149,7 @@ static bool __is_valid_xdp_access(int off, int size) static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) @@ -4174,6 +4182,7 @@ EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); @@ -4220,6 +4229,7 @@ static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -4249,11 +4259,12 @@ static bool sk_skb_is_valid_access(int off, int size, break; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sk_msg_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) -- cgit v1.2.3-70-g09d2 From 4fbac77d2d092b475dda9eea66da674369665427 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:02 -0700 Subject: bpf: Hooks for sys_bind == The problem == There is a use-case when all processes inside a cgroup should use one single IP address on a host that has multiple IP configured. Those processes should use the IP for both ingress and egress, for TCP and UDP traffic. So TCP/UDP servers should be bound to that IP to accept incoming connections on it, and TCP/UDP clients should make outgoing connections from that IP. It should not require changing application code since it's often not possible. Currently it's solved by intercepting glibc wrappers around syscalls such as `bind(2)` and `connect(2)`. It's done by a shared library that is preloaded for every process in a cgroup so that whenever TCP/UDP server calls `bind(2)`, the library replaces IP in sockaddr before passing arguments to syscall. When application calls `connect(2)` the library transparently binds the local end of connection to that IP (`bind(2)` with `IP_BIND_ADDRESS_NO_PORT` to avoid performance penalty). Shared library approach is fragile though, e.g.: * some applications clear env vars (incl. `LD_PRELOAD`); * `/etc/ld.so.preload` doesn't help since some applications are linked with option `-z nodefaultlib`; * other applications don't use glibc and there is nothing to intercept. == The solution == The patch provides much more reliable in-kernel solution for the 1st part of the problem: binding TCP/UDP servers on desired IP. It does not depend on application environment and implementation details (whether glibc is used or not). It adds new eBPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` and attach types `BPF_CGROUP_INET4_BIND` and `BPF_CGROUP_INET6_BIND` (similar to already existing `BPF_CGROUP_INET_SOCK_CREATE`). The new program type is intended to be used with sockets (`struct sock`) in a cgroup and provided by user `struct sockaddr`. Pointers to both of them are parts of the context passed to programs of newly added types. The new attach types provides hooks in `bind(2)` system call for both IPv4 and IPv6 so that one can write a program to override IP addresses and ports user program tries to bind to and apply such a program for whole cgroup. == Implementation notes == [1] Separate attach types for `AF_INET` and `AF_INET6` are added intentionally to prevent reading/writing to offsets that don't make sense for corresponding socket family. E.g. if user passes `sockaddr_in` it doesn't make sense to read from / write to `user_ip6[]` context fields. [2] The write access to `struct bpf_sock_addr_kern` is implemented using special field as an additional "register". There are just two registers in `sock_addr_convert_ctx_access`: `src` with value to write and `dst` with pointer to context that can't be changed not to break later instructions. But the fields, allowed to write to, are not available directly and to access them address of corresponding pointer has to be loaded first. To get additional register the 1st not used by `src` and `dst` one is taken, its content is saved to `bpf_sock_addr_kern.tmp_reg`, then the register is used to load address of pointer field, and finally the register's content is restored from the temporary field after writing `src` value. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 21 ++++ include/linux/bpf_types.h | 1 + include/linux/filter.h | 10 ++ include/uapi/linux/bpf.h | 23 +++++ kernel/bpf/cgroup.c | 36 +++++++ kernel/bpf/syscall.c | 36 +++++-- kernel/bpf/verifier.c | 1 + net/core/filter.c | 232 +++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 7 ++ net/ipv6/af_inet6.c | 7 ++ 10 files changed, 366 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 8a4566691c8f..67dc4a6471ad 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -6,6 +6,7 @@ #include struct sock; +struct sockaddr; struct cgroup; struct sk_buff; struct bpf_sock_ops_kern; @@ -63,6 +64,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + struct sockaddr *uaddr, + enum bpf_attach_type type); + int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum bpf_attach_type type); @@ -103,6 +108,20 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) +#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) + +#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ @@ -135,6 +154,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 6d7243bfb0ff..2b28fcf6f6ae 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -8,6 +8,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act) BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) diff --git a/include/linux/filter.h b/include/linux/filter.h index 13c044e4832d..fc4e8f91b03d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1021,6 +1021,16 @@ static inline int bpf_tell_extensions(void) return SKF_AD_MAX; } +struct bpf_sock_addr_kern { + struct sock *sk; + struct sockaddr *uaddr; + /* Temporary "register" to make indirect stores to nested structures + * defined above. We need three registers to make such a store, but + * only two (src and dst) are available at convert_ctx_access time + */ + u64 tmp_reg; +}; + struct bpf_sock_ops_kern { struct sock *sk; u32 op; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 102718624d1e..ce3e69e3c793 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -136,6 +136,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_DEVICE, BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_RAW_TRACEPOINT, + BPF_PROG_TYPE_CGROUP_SOCK_ADDR, }; enum bpf_attach_type { @@ -147,6 +148,8 @@ enum bpf_attach_type { BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_DEVICE, BPF_SK_MSG_VERDICT, + BPF_CGROUP_INET4_BIND, + BPF_CGROUP_INET6_BIND, __MAX_BPF_ATTACH_TYPE }; @@ -1010,6 +1013,26 @@ struct bpf_map_info { __u64 netns_ino; } __attribute__((aligned(8))); +/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed + * by user and intended to be used by socket (e.g. to bind to, depends on + * attach attach type). + */ +struct bpf_sock_addr { + __u32 user_family; /* Allows 4-byte read, but no write. */ + __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. + * Stored in network byte order. + */ + __u32 user_port; /* Allows 4-byte read and write. + * Stored in network byte order + */ + __u32 family; /* Allows 4-byte read, but no write */ + __u32 type; /* Allows 4-byte read, but no write */ + __u32 protocol; /* Allows 4-byte read, but no write */ +}; + /* User bpf_sock_ops struct to access socket values and specify request ops * and their replies. * Some of this fields are in network (bigendian) byte order and may need diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 8730b24ed540..43171a0bb02b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -494,6 +494,42 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); +/** + * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and + * provided by user sockaddr + * @sk: sock struct that will use sockaddr + * @uaddr: sockaddr struct provided by user + * @type: The type of program to be exectuted + * + * socket is expected to be of type INET or INET6. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + struct sockaddr *uaddr, + enum bpf_attach_type type) +{ + struct bpf_sock_addr_kern ctx = { + .sk = sk, + .uaddr = uaddr, + }; + struct cgroup *cgrp; + int ret; + + /* Check socket family since not all sockets represent network + * endpoint (e.g. AF_UNIX). + */ + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + + return ret == 1 ? 0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); + /** * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock * @sk: socket to get cgroup from diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9d3b572d4dec..2cad66a4cacb 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1175,19 +1175,29 @@ static int bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type) { - /* There are currently no prog types that require specifying - * attach_type at load time. - */ - return 0; + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + switch (expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + return 0; + default: + return -EINVAL; + } + default: + return 0; + } } static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { - /* There are currently no prog types that require specifying - * attach_type at load time. - */ - return 0; + switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + default: + return 0; + } } /* last field in 'union bpf_attr' used by this command */ @@ -1479,6 +1489,10 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET_SOCK_CREATE: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; @@ -1541,6 +1555,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET_SOCK_CREATE: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; @@ -1590,6 +1608,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 10024323031d..5dd1dcb902bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3887,6 +3887,7 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: break; diff --git a/net/core/filter.c b/net/core/filter.c index 7790fd128614..c08e5b121558 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3698,6 +3698,20 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + /* inet and inet6 sockets are created in a process + * context so there is always a valid uid/gid + */ + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -4180,6 +4194,69 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static bool sock_addr_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sock_addr)) + return false; + if (off % size != 0) + return false; + + /* Disallow access to IPv6 fields from IPv4 contex and vise + * versa. + */ + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + break; + default: + return false; + } + break; + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET6_BIND: + break; + default: + return false; + } + break; + } + + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + /* Only narrow read access allowed for now. */ + if (type == BPF_READ) { + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + } else { + if (size != size_default) + return false; + } + break; + case bpf_ctx_range(struct bpf_sock_addr, user_port): + if (size != size_default) + return false; + break; + default: + if (type == BPF_READ) { + if (size != size_default) + return false; + } else { + return false; + } + } + + return true; +} + static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -4724,6 +4801,152 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of + * context Structure, F is Field in context structure that contains a pointer + * to Nested Structure of type NS that has the field NF. + * + * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make + * sure that SIZE is not greater than actual size of S.F.NF. + * + * If offset OFF is provided, the load happens from that offset relative to + * offset of NF. + */ +#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ + do { \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ + si->src_reg, offsetof(S, F)); \ + *insn++ = BPF_LDX_MEM( \ + SIZE, si->dst_reg, si->dst_reg, \ + bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ + target_size) \ + + OFF); \ + } while (0) + +#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ + BPF_FIELD_SIZEOF(NS, NF), 0) + +/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to + * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. + * + * It doesn't support SIZE argument though since narrow stores are not + * supported for now. + * + * In addition it uses Temporary Field TF (member of struct S) as the 3rd + * "register" since two registers available in convert_ctx_access are not + * enough: we can't override neither SRC, since it contains value to store, nor + * DST since it contains pointer to context that may be used by later + * instructions. But we need a temporary place to save pointer to nested + * structure whose field we want to store to. + */ +#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \ + do { \ + int tmp_reg = BPF_REG_9; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ + offsetof(S, TF)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ + si->dst_reg, offsetof(S, F)); \ + *insn++ = BPF_STX_MEM( \ + BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \ + bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ + target_size) \ + + OFF); \ + *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ + offsetof(S, TF)); \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ + TF) \ + do { \ + if (type == BPF_WRITE) { \ + SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \ + TF); \ + } else { \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, SIZE, OFF); \ + } \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) + +static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct bpf_sock_addr, user_family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sockaddr, uaddr, sa_family); + break; + + case offsetof(struct bpf_sock_addr, user_ip4): + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, + sin_addr, BPF_SIZE(si->code), 0, tmp_reg); + break; + + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + off = si->off; + off -= offsetof(struct bpf_sock_addr, user_ip6[0]); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, + tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, user_port): + /* To get port we need to know sa_family first and then treat + * sockaddr as either sockaddr_in or sockaddr_in6. + * Though we can simplify since port field has same offset and + * size in both structures. + * Here we check this invariant and use just one of the + * structures if it's true. + */ + BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != + offsetof(struct sockaddr_in6, sin6_port)); + BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) != + FIELD_SIZEOF(struct sockaddr_in6, sin6_port)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sockaddr_in6, uaddr, + sin6_port, tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_family); + break; + + case offsetof(struct bpf_sock_addr, type): + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sock, sk, + __sk_flags_offset, BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + break; + + case offsetof(struct bpf_sock_addr, protocol): + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sock, sk, + __sk_flags_offset, BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, + SK_FL_PROTO_SHIFT); + break; + } + + return insn - insn_buf; +} + static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -5181,6 +5404,15 @@ const struct bpf_verifier_ops cg_sock_verifier_ops = { const struct bpf_prog_ops cg_sock_prog_ops = { }; +const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { + .get_func_proto = sock_addr_func_proto, + .is_valid_access = sock_addr_is_valid_access, + .convert_ctx_access = sock_addr_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sock_addr_prog_ops = { +}; + const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e8c7fad8c329..2dec266507dc 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -450,6 +450,13 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < sizeof(struct sockaddr_in)) goto out; + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ + err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); + if (err) + goto out; + if (addr->sin_family != AF_INET) { /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) * only if s_addr is INADDR_ANY. diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index dbbe04018813..fa24e3f06ac6 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -295,6 +295,13 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ + err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); + if (err) + return err; + if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; -- cgit v1.2.3-70-g09d2 From 3679d585bbc07a1ac4448d5b478b492cad3587ce Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:04 -0700 Subject: net: Introduce __inet_bind() and __inet6_bind Refactor `bind()` code to make it ready to be called from BPF helper function `bpf_bind()` (will be added soon). Implementation of `inet_bind()` and `inet6_bind()` is separated into `__inet_bind()` and `__inet6_bind()` correspondingly. These function can be used from both `sk_prot->bind` and `bpf_bind()` contexts. New functions have two additional arguments. `force_bind_address_no_port` forces binding to IP only w/o checking `inet_sock.bind_address_no_port` field. It'll allow to bind local end of a connection to desired IP in `bpf_bind()` w/o changing `bind_address_no_port` field of a socket. It's useful since `bpf_bind()` can return an error and we'd need to restore original value of `bind_address_no_port` in that case if we changed this before calling to the helper. `with_lock` specifies whether to lock socket when working with `struct sk` or not. The argument is set to `true` for `sk_prot->bind`, i.e. old behavior is preserved. But it will be set to `false` for `bpf_bind()` use-case. The reason is all call-sites, where `bpf_bind()` will be called, already hold that socket lock. Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/net/inet_common.h | 2 ++ include/net/ipv6.h | 2 ++ net/ipv4/af_inet.c | 39 ++++++++++++++++++++++++--------------- net/ipv6/af_inet6.c | 37 ++++++++++++++++++++++++------------- 4 files changed, 52 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 500f81375200..384b90c62c0b 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -32,6 +32,8 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 50a6f0ddb878..2e5fedc56e59 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1066,6 +1066,8 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); int inet6_release(struct socket *sock); +int __inet6_bind(struct sock *sock, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 2dec266507dc..e203a39d6988 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -432,30 +432,37 @@ EXPORT_SYMBOL(inet_release); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock *sk = sock->sk; - struct inet_sock *inet = inet_sk(sk); - struct net *net = sock_net(sk); - unsigned short snum; - int chk_addr_ret; - u32 tb_id = RT_TABLE_LOCAL; int err; /* If the socket has its own bind function then use it. (RAW) */ if (sk->sk_prot->bind) { - err = sk->sk_prot->bind(sk, uaddr, addr_len); - goto out; + return sk->sk_prot->bind(sk, uaddr, addr_len); } - err = -EINVAL; if (addr_len < sizeof(struct sockaddr_in)) - goto out; + return -EINVAL; /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); if (err) - goto out; + return err; + + return __inet_bind(sk, uaddr, addr_len, false, true); +} +EXPORT_SYMBOL(inet_bind); + +int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); + unsigned short snum; + int chk_addr_ret; + u32 tb_id = RT_TABLE_LOCAL; + int err; if (addr->sin_family != AF_INET) { /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) @@ -499,7 +506,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ - lock_sock(sk); + if (with_lock) + lock_sock(sk); /* Check these errors (active socket, double bind). */ err = -EINVAL; @@ -511,7 +519,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ - if ((snum || !inet->bind_address_no_port) && + if ((snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) && sk->sk_prot->get_port(sk, snum)) { inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; @@ -528,11 +537,11 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sk_dst_reset(sk); err = 0; out_release_sock: - release_sock(sk); + if (with_lock) + release_sock(sk); out: return err; } -EXPORT_SYMBOL(inet_bind); int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index fa24e3f06ac6..13110bee5c14 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -277,15 +277,7 @@ out_rcu_unlock: /* bind for INET6 API */ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct net *net = sock_net(sk); - __be32 v4addr = 0; - unsigned short snum; - bool saved_ipv6only; - int addr_type = 0; int err = 0; /* If the socket has its own bind function then use it. */ @@ -302,11 +294,28 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; + return __inet6_bind(sk, uaddr, addr_len, false, true); +} +EXPORT_SYMBOL(inet6_bind); + +int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock) +{ + struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + __be32 v4addr = 0; + unsigned short snum; + bool saved_ipv6only; + int addr_type = 0; + int err = 0; + if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; addr_type = ipv6_addr_type(&addr->sin6_addr); - if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) + if ((addr_type & IPV6_ADDR_MULTICAST) && sk->sk_type == SOCK_STREAM) return -EINVAL; snum = ntohs(addr->sin6_port); @@ -314,7 +323,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; - lock_sock(sk); + if (with_lock) + lock_sock(sk); /* Check these errors (active socket, double bind). */ if (sk->sk_state != TCP_CLOSE || inet->inet_num) { @@ -402,7 +412,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sk->sk_ipv6only = 1; /* Make sure we are allowed to bind here. */ - if ((snum || !inet->bind_address_no_port) && + if ((snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) && sk->sk_prot->get_port(sk, snum)) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); @@ -418,13 +429,13 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_dport = 0; inet->inet_daddr = 0; out: - release_sock(sk); + if (with_lock) + release_sock(sk); return err; out_unlock: rcu_read_unlock(); goto out; } -EXPORT_SYMBOL(inet6_bind); int inet6_release(struct socket *sock) { -- cgit v1.2.3-70-g09d2 From d74bad4e74ee373787a9ae24197c17b7cdc428d5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:05 -0700 Subject: bpf: Hooks for sys_connect == The problem == See description of the problem in the initial patch of this patch set. == The solution == The patch provides much more reliable in-kernel solution for the 2nd part of the problem: making outgoing connecttion from desired IP. It adds new attach types `BPF_CGROUP_INET4_CONNECT` and `BPF_CGROUP_INET6_CONNECT` for program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that can be used to override both source and destination of a connection at connect(2) time. Local end of connection can be bound to desired IP using newly introduced BPF-helper `bpf_bind()`. It allows to bind to only IP though, and doesn't support binding to port, i.e. leverages `IP_BIND_ADDRESS_NO_PORT` socket option. There are two reasons for this: * looking for a free port is expensive and can affect performance significantly; * there is no use-case for port. As for remote end (`struct sockaddr *` passed by user), both parts of it can be overridden, remote IP and remote port. It's useful if an application inside cgroup wants to connect to another application inside same cgroup or to itself, but knows nothing about IP assigned to the cgroup. Support is added for IPv4 and IPv6, for TCP and UDP. IPv4 and IPv6 have separate attach types for same reason as sys_bind hooks, i.e. to prevent reading from / writing to e.g. user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound. == Implementation notes == The patch introduces new field in `struct proto`: `pre_connect` that is a pointer to a function with same signature as `connect` but is called before it. The reason is in some cases BPF hooks should be called way before control is passed to `sk->sk_prot->connect`. Specifically `inet_dgram_connect` autobinds socket before calling `sk->sk_prot->connect` and there is no way to call `bpf_bind()` from hooks from e.g. `ip4_datagram_connect` or `ip6_datagram_connect` since it'd cause double-bind. On the other hand `proto.pre_connect` provides a flexible way to add BPF hooks for connect only for necessary `proto` and call them at desired time before `connect`. Since `bpf_bind()` is allowed to bind only to IP and autobind in `inet_dgram_connect` binds only port there is no chance of double-bind. bpf_bind() sets `force_bind_address_no_port` to bind to only IP despite of value of `bind_address_no_port` socket field. bpf_bind() sets `with_lock` to `false` when calling to __inet_bind() and __inet6_bind() since all call-sites, where bpf_bind() is called, already hold socket lock. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 31 +++++++++++++++++++++++++ include/net/addrconf.h | 7 ++++++ include/net/sock.h | 3 +++ include/net/udp.h | 1 + include/uapi/linux/bpf.h | 12 +++++++++- kernel/bpf/syscall.c | 8 +++++++ net/core/filter.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/af_inet.c | 13 +++++++++++ net/ipv4/tcp_ipv4.c | 16 +++++++++++++ net/ipv4/udp.c | 14 ++++++++++++ net/ipv6/af_inet6.c | 5 ++++ net/ipv6/tcp_ipv6.c | 16 +++++++++++++ net/ipv6/udp.c | 20 ++++++++++++++++ 13 files changed, 202 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 67dc4a6471ad..c6ab295e6dcb 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -116,12 +116,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) { \ + lock_sock(sk); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + release_sock(sk); \ + } \ + __ret; \ +}) + #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ + sk->sk_prot->pre_connect) + +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ @@ -151,11 +177,16 @@ struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 132e5b95167a..378d601258be 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -231,6 +231,13 @@ struct ipv6_stub { }; extern const struct ipv6_stub *ipv6_stub __read_mostly; +/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ +struct ipv6_bpf_stub { + int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock); +}; +extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; + /* * identify MLD packets for MLD filter exceptions */ diff --git a/include/net/sock.h b/include/net/sock.h index b8ff435fa96e..49bd2c1796b0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1026,6 +1026,9 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size) struct proto { void (*close)(struct sock *sk, long timeout); + int (*pre_connect)(struct sock *sk, + struct sockaddr *uaddr, + int addr_len); int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); diff --git a/include/net/udp.h b/include/net/udp.h index 850a8e581cce..0676b272f6ac 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -273,6 +273,7 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); int udp_init_sock(struct sock *sk); +int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ce3e69e3c793..77afaf1ba556 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -150,6 +150,8 @@ enum bpf_attach_type { BPF_SK_MSG_VERDICT, BPF_CGROUP_INET4_BIND, BPF_CGROUP_INET6_BIND, + BPF_CGROUP_INET4_CONNECT, + BPF_CGROUP_INET6_CONNECT, __MAX_BPF_ATTACH_TYPE }; @@ -744,6 +746,13 @@ union bpf_attr { * @flags: reserved for future use * Return: SK_PASS * + * int bpf_bind(ctx, addr, addr_len) + * Bind socket to address. Only binding to IP is supported, no port can be + * set in addr. + * @ctx: pointer to context of type bpf_sock_addr + * @addr: pointer to struct sockaddr to bind socket to + * @addr_len: length of sockaddr structure + * Return: 0 on success or negative error code */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -809,7 +818,8 @@ union bpf_attr { FN(msg_redirect_map), \ FN(msg_apply_bytes), \ FN(msg_cork_bytes), \ - FN(msg_pull_data), + FN(msg_pull_data), \ + FN(bind), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2cad66a4cacb..cf1b29bc0ab8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1180,6 +1180,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, switch (expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: return 0; default: return -EINVAL; @@ -1491,6 +1493,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) break; case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1557,6 +1561,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) break; case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1610,6 +1616,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; diff --git a/net/core/filter.c b/net/core/filter.c index c08e5b121558..bdb9cadd4d27 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -3656,6 +3657,52 @@ static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { .arg2_type = ARG_ANYTHING, }; +const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; +EXPORT_SYMBOL_GPL(ipv6_bpf_stub); + +BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, + int, addr_len) +{ +#ifdef CONFIG_INET + struct sock *sk = ctx->sk; + int err; + + /* Binding to port can be expensive so it's prohibited in the helper. + * Only binding to IP is supported. + */ + err = -EINVAL; + if (addr->sa_family == AF_INET) { + if (addr_len < sizeof(struct sockaddr_in)) + return err; + if (((struct sockaddr_in *)addr)->sin_port != htons(0)) + return err; + return __inet_bind(sk, addr, addr_len, true, false); +#if IS_ENABLED(CONFIG_IPV6) + } else if (addr->sa_family == AF_INET6) { + if (addr_len < SIN6_LEN_RFC2133) + return err; + if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) + return err; + /* ipv6_bpf_stub cannot be NULL, since it's called from + * bpf_cgroup_inet6_connect hook and ipv6 is already loaded + */ + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); +#endif /* CONFIG_IPV6 */ + } +#endif /* CONFIG_INET */ + + return -EAFNOSUPPORT; +} + +static const struct bpf_func_proto bpf_bind_proto = { + .func = bpf_bind, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3707,6 +3754,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) */ case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_bind: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_bind_proto; + default: + return NULL; + } default: return bpf_base_func_proto(func_id); } @@ -4213,6 +4268,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, user_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET4_CONNECT: break; default: return false; @@ -4221,6 +4277,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET6_CONNECT: break; default: return false; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e203a39d6988..488fe26ac8e5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -547,12 +547,19 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; + int err; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return sk->sk_prot->disconnect(sk, flags); + if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + if (err) + return err; + } + if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; return sk->sk_prot->connect(sk, uaddr, addr_len); @@ -633,6 +640,12 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state != TCP_CLOSE) goto out; + if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + if (err) + goto out; + } + err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2c6aec2643e8..3c11d992d784 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -140,6 +140,21 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) } EXPORT_SYMBOL_GPL(tcp_twsk_unique); +static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from tcp_v4_connect() and intended to + * prevent BPF program called below from accessing bytes that are out + * of the bound specified by user in addr_len. + */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + sock_owned_by_me(sk); + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); +} + /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -2409,6 +2424,7 @@ struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, + .pre_connect = tcp_v4_pre_connect, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 908fc02fb4f8..9c6c77fec963 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1658,6 +1658,19 @@ csum_copy_err: goto try_again; } +int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + /* This check is replicated from __ip4_datagram_connect() and + * intended to prevent BPF program called below from accessing bytes + * that are out of the bound specified by user in addr_len. + */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr); +} +EXPORT_SYMBOL(udp_pre_connect); + int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2530,6 +2543,7 @@ struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, .close = udp_lib_close, + .pre_connect = udp_pre_connect, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 13110bee5c14..566cec0e0a44 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -887,6 +887,10 @@ static const struct ipv6_stub ipv6_stub_impl = { .nd_tbl = &nd_tbl, }; +static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { + .inet6_bind = __inet6_bind, +}; + static int __init inet6_init(void) { struct list_head *r; @@ -1043,6 +1047,7 @@ static int __init inet6_init(void) /* ensure that ipv6 stubs are visible only after ipv6 is ready */ wmb(); ipv6_stub = &ipv6_stub_impl; + ipv6_bpf_stub = &ipv6_bpf_stub_impl; out: return err; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5425d7b100ee..6469b741cf5a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -117,6 +117,21 @@ static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) ipv6_hdr(skb)->saddr.s6_addr32); } +static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from tcp_v6_connect() and intended to + * prevent BPF program called below from accessing bytes that are out + * of the bound specified by user in addr_len. + */ + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + sock_owned_by_me(sk); + + return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr); +} + static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -1925,6 +1940,7 @@ struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, .close = tcp_close, + .pre_connect = tcp_v6_pre_connect, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ad30f5e31969..6861ed479469 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -957,6 +957,25 @@ static void udp_v6_flush_pending_frames(struct sock *sk) } } +static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* The following checks are replicated from __ip6_datagram_connect() + * and intended to prevent BPF program called below from accessing + * bytes that are out of the bound specified by user in addr_len. + */ + if (uaddr->sa_family == AF_INET) { + if (__ipv6_only_sock(sk)) + return -EAFNOSUPPORT; + return udp_pre_connect(sk, uaddr, addr_len); + } + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr); +} + /** * udp6_hwcsum_outgoing - handle outgoing HW checksumming * @sk: socket we are sending on @@ -1512,6 +1531,7 @@ struct proto udpv6_prot = { .name = "UDPv6", .owner = THIS_MODULE, .close = udp_lib_close, + .pre_connect = udpv6_pre_connect, .connect = ip6_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, -- cgit v1.2.3-70-g09d2 From aac3fc320d9404f2665a8b1249dc3170d5fa3caf Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:07 -0700 Subject: bpf: Post-hooks for sys_bind "Post-hooks" are hooks that are called right before returning from sys_bind. At this time IP and port are already allocated and no further changes to `struct sock` can happen before returning from sys_bind but BPF program has a chance to inspect the socket and change sys_bind result. Specifically it can e.g. inspect what port was allocated and if it doesn't satisfy some policy, BPF program can force sys_bind to fail and return EPERM to user. Another example of usage is recording the IP:port pair to some map to use it in later calls to sys_connect. E.g. if some TCP server inside cgroup was bound to some IP:port_n, it can be recorded to a map. And later when some TCP client inside same cgroup is trying to connect to 127.0.0.1:port_n, BPF hook for sys_connect can override the destination and connect application to IP:port_n instead of 127.0.0.1:port_n. That helps forcing all applications inside a cgroup to use desired IP and not break those applications if they e.g. use localhost to communicate between each other. == Implementation details == Post-hooks are implemented as two new attach types `BPF_CGROUP_INET4_POST_BIND` and `BPF_CGROUP_INET6_POST_BIND` for existing prog type `BPF_PROG_TYPE_CGROUP_SOCK`. Separate attach types for IPv4 and IPv6 are introduced to avoid access to IPv6 field in `struct sock` from `inet_bind()` and to IPv4 field from `inet6_bind()` since those fields might not make sense in such cases. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 16 +++++-- include/uapi/linux/bpf.h | 11 +++++ kernel/bpf/syscall.c | 43 +++++++++++++++++ net/core/filter.c | 116 +++++++++++++++++++++++++++++++++++++++------ net/ipv4/af_inet.c | 18 ++++--- net/ipv6/af_inet6.c | 21 +++++--- 6 files changed, 195 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c6ab295e6dcb..30d15e64b993 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -98,16 +98,24 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ +#define BPF_CGROUP_RUN_SK_PROG(sk, type) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) { \ - __ret = __cgroup_bpf_run_filter_sk(sk, \ - BPF_CGROUP_INET_SOCK_CREATE); \ + __ret = __cgroup_bpf_run_filter_sk(sk, type); \ } \ __ret; \ }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) + +#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) + +#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) + #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ ({ \ int __ret = 0; \ @@ -183,6 +191,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 77afaf1ba556..c5ec89732a8d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -152,6 +152,8 @@ enum bpf_attach_type { BPF_CGROUP_INET6_BIND, BPF_CGROUP_INET4_CONNECT, BPF_CGROUP_INET6_CONNECT, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, __MAX_BPF_ATTACH_TYPE }; @@ -948,6 +950,15 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; + __u32 src_ip4; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_ip6[4]; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_port; /* Allows 4-byte read. + * Stored in host byte order + */ }; #define XDP_PACKET_HEADROOM 256 diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf1b29bc0ab8..0244973ee544 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1171,11 +1171,46 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); +/* Initially all BPF programs could be loaded w/o specifying + * expected_attach_type. Later for some of them specifying expected_attach_type + * at load time became required so that program could be validated properly. + * Programs of types that are allowed to be loaded both w/ and w/o (for + * backward compatibility) expected_attach_type, should have the default attach + * type assigned to expected_attach_type for the latter case, so that it can be + * validated later at attach time. + * + * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if + * prog type requires it but has some attach types that have to be backward + * compatible. + */ +static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) +{ + switch (attr->prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't + * exist so checking for non-zero is the way to go here. + */ + if (!attr->expected_attach_type) + attr->expected_attach_type = + BPF_CGROUP_INET_SOCK_CREATE; + break; + } +} + static int bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type) { switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + switch (expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + return 0; + default: + return -EINVAL; + } case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: switch (expected_attach_type) { case BPF_CGROUP_INET4_BIND: @@ -1195,6 +1230,7 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, enum bpf_attach_type attach_type) { switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; default: @@ -1240,6 +1276,7 @@ static int bpf_prog_load(union bpf_attr *attr) !capable(CAP_SYS_ADMIN)) return -EPERM; + bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) return -EINVAL; @@ -1489,6 +1526,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; case BPF_CGROUP_INET4_BIND: @@ -1557,6 +1596,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; case BPF_CGROUP_INET4_BIND: @@ -1616,6 +1657,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_SOCK_CREATE: case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_SOCK_OPS: diff --git a/net/core/filter.c b/net/core/filter.c index bdb9cadd4d27..d31aff93270d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4097,30 +4097,80 @@ static bool lwt_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } -static bool sock_filter_is_valid_access(int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) + +/* Attach type specific accesses */ +static bool __sock_filter_check_attach_type(int off, + enum bpf_access_type access_type, + enum bpf_attach_type attach_type) { - if (type == BPF_WRITE) { - switch (off) { - case offsetof(struct bpf_sock, bound_dev_if): - case offsetof(struct bpf_sock, mark): - case offsetof(struct bpf_sock, priority): - break; + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + case offsetof(struct bpf_sock, mark): + case offsetof(struct bpf_sock, priority): + switch (attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + goto full_access; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_ip4): + switch (attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + switch (attach_type) { + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_port): + switch (attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; default: return false; } } +read_only: + return access_type == BPF_READ; +full_access: + return true; +} + +static bool __sock_filter_check_size(int off, int size, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); - if (off < 0 || off + size > sizeof(struct bpf_sock)) + switch (off) { + case bpf_ctx_range(struct bpf_sock, src_ip4): + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } + + return size == size_default; +} + +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sock)) return false; - /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) + if (!__sock_filter_check_attach_type(off, type, + prog->expected_attach_type)) + return false; + if (!__sock_filter_check_size(off, size, info)) return false; - return true; } @@ -4728,6 +4778,7 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): @@ -4783,6 +4834,43 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; + + case offsetof(struct bpf_sock, src_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_rcv_saddr, + FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr), + target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + off = si->off; + off -= offsetof(struct bpf_sock, src_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off( + struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0], + FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]), + target_size) + off); +#else + (void)off; + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock, src_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_num), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_num, + FIELD_SIZEOF(struct sock_common, + skc_num), + target_size)); + break; } return insn - insn_buf; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 488fe26ac8e5..142d4c35b493 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -519,12 +519,18 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ - if ((snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) && - sk->sk_prot->get_port(sk, snum)) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - err = -EADDRINUSE; - goto out_release_sock; + if (snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) { + if (sk->sk_prot->get_port(sk, snum)) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + err = -EADDRINUSE; + goto out_release_sock; + } + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); + if (err) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + goto out_release_sock; + } } if (inet->inet_rcv_saddr) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 566cec0e0a44..41f50472679d 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -412,13 +412,20 @@ int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, sk->sk_ipv6only = 1; /* Make sure we are allowed to bind here. */ - if ((snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) && - sk->sk_prot->get_port(sk, snum)) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - err = -EADDRINUSE; - goto out; + if (snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) { + if (sk->sk_prot->get_port(sk, snum)) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + err = -EADDRINUSE; + goto out; + } + err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); + if (err) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + goto out; + } } if (addr_type != IPV6_ADDR_ANY) -- cgit v1.2.3-70-g09d2 From f40aa23339e2d06b1a8daaece5a511bb1c4f704c Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 30 Mar 2018 13:46:18 +0300 Subject: net: bridge: set min MTU on port events and allow user to set max Recently the bridge was changed to automatically set maximum MTU on port events (add/del/changemtu) when vlan filtering is enabled, but that actually changes behaviour in a way which breaks some setups and can lead to packet drops. In order to still allow that maximum to be set while being compatible, we add the ability for the user to tune the bridge MTU up to the maximum when vlan filtering is enabled, but that has to be done explicitly and all port events (add/del/changemtu) lead to resetting that MTU to the minimum as before. Suggested-by: Roopa Prabhu Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br.c | 2 +- net/bridge/br_device.c | 3 ++- net/bridge/br_if.c | 43 ++++++++++++++----------------------------- net/bridge/br_private.h | 2 +- 4 files changed, 18 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index 26e1616b2c90..565ff055813b 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -52,7 +52,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v switch (event) { case NETDEV_CHANGEMTU: - dev_set_mtu(br->dev, br_mtu(br)); + dev_set_mtu(br->dev, br_mtu(br, false)); break; case NETDEV_CHANGEADDR: diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 278fc999d355..edb9967eb165 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -224,7 +224,8 @@ static void br_get_stats64(struct net_device *dev, static int br_change_mtu(struct net_device *dev, int new_mtu) { struct net_bridge *br = netdev_priv(dev); - if (new_mtu > br_mtu(br)) + + if (new_mtu > br_mtu(br, br_vlan_enabled(dev))) return -EINVAL; dev->mtu = new_mtu; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 87b2afd455c7..7d5dc5a91084 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -424,41 +424,26 @@ int br_del_bridge(struct net *net, const char *name) return ret; } -static bool min_mtu(int a, int b) -{ - return a < b ? 1 : 0; -} - -static bool max_mtu(int a, int b) -{ - return a > b ? 1 : 0; -} - -/* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */ -static int __br_mtu(const struct net_bridge *br, bool (compare_fn)(int, int)) +/* MTU of the bridge pseudo-device: ETH_DATA_LEN if there are no ports, the + * minimum of the ports if @max is false or the maximum if it's true + */ +int br_mtu(const struct net_bridge *br, bool max) { const struct net_bridge_port *p; - int mtu = 0; + int ret_mtu = 0; ASSERT_RTNL(); - if (list_empty(&br->port_list)) - mtu = ETH_DATA_LEN; - else { - list_for_each_entry(p, &br->port_list, list) { - if (!mtu || compare_fn(p->dev->mtu, mtu)) - mtu = p->dev->mtu; + list_for_each_entry(p, &br->port_list, list) { + if (!max) { + if (!ret_mtu || ret_mtu > p->dev->mtu) + ret_mtu = p->dev->mtu; + } else if (p->dev->mtu > ret_mtu) { + ret_mtu = p->dev->mtu; } } - return mtu; -} -int br_mtu(const struct net_bridge *br) -{ - if (br_vlan_enabled(br->dev)) - return __br_mtu(br, max_mtu); - else - return __br_mtu(br, min_mtu); + return ret_mtu ? ret_mtu : ETH_DATA_LEN; } static void br_set_gso_limits(struct net_bridge *br) @@ -612,7 +597,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); - dev_set_mtu(br->dev, br_mtu(br)); + dev_set_mtu(br->dev, br_mtu(br, false)); br_set_gso_limits(br); kobject_uevent(&p->kobj, KOBJ_ADD); @@ -659,7 +644,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) */ del_nbp(p); - dev_set_mtu(br->dev, br_mtu(br)); + dev_set_mtu(br->dev, br_mtu(br, false)); br_set_gso_limits(br); spin_lock_bh(&br->lock); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 048d5b51813b..586f84b9670d 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -578,7 +578,7 @@ int br_del_bridge(struct net *net, const char *name); int br_add_if(struct net_bridge *br, struct net_device *dev, struct netlink_ext_ack *extack); int br_del_if(struct net_bridge *br, struct net_device *dev); -int br_mtu(const struct net_bridge *br); +int br_mtu(const struct net_bridge *br, bool max); netdev_features_t br_features_recompute(struct net_bridge *br, netdev_features_t features); void br_port_flags_change(struct net_bridge_port *port, unsigned long mask); -- cgit v1.2.3-70-g09d2 From 804b854d374e39f5f8bff9638fd274b9a9ca7d33 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 30 Mar 2018 13:46:19 +0300 Subject: net: bridge: disable bridge MTU auto tuning if it was set manually As Roopa noted today the biggest source of problems when configuring bridge and ports is that the bridge MTU keeps changing automatically on port events (add/del/changemtu). That leads to inconsistent behaviour and network config software needs to chase the MTU and fix it on each such event. Let's improve on that situation and allow for the user to set any MTU within ETH_MIN/MAX limits, but once manually configured it is the user's responsibility to keep it correct afterwards. In case the MTU isn't manually set - the behaviour reverts to the previous and the bridge follows the minimum MTU. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br.c | 2 +- net/bridge/br_device.c | 5 ++--- net/bridge/br_if.c | 36 +++++++++++++++++++++--------------- net/bridge/br_private.h | 3 ++- 4 files changed, 26 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index 565ff055813b..671d13c10f6f 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -52,7 +52,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v switch (event) { case NETDEV_CHANGEMTU: - dev_set_mtu(br->dev, br_mtu(br, false)); + br_mtu_auto_adjust(br); break; case NETDEV_CHANGEADDR: diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index edb9967eb165..e682a668ce57 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -225,11 +225,10 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) { struct net_bridge *br = netdev_priv(dev); - if (new_mtu > br_mtu(br, br_vlan_enabled(dev))) - return -EINVAL; - dev->mtu = new_mtu; + /* this flag will be cleared if the MTU was automatically adjusted */ + br->mtu_set_by_user = true; #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* remember the MTU in the rtable for PMTU */ dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 7d5dc5a91084..82c1a6f430b3 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -424,28 +424,34 @@ int br_del_bridge(struct net *net, const char *name) return ret; } -/* MTU of the bridge pseudo-device: ETH_DATA_LEN if there are no ports, the - * minimum of the ports if @max is false or the maximum if it's true - */ -int br_mtu(const struct net_bridge *br, bool max) +/* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */ +static int br_mtu_min(const struct net_bridge *br) { const struct net_bridge_port *p; int ret_mtu = 0; - ASSERT_RTNL(); - - list_for_each_entry(p, &br->port_list, list) { - if (!max) { - if (!ret_mtu || ret_mtu > p->dev->mtu) - ret_mtu = p->dev->mtu; - } else if (p->dev->mtu > ret_mtu) { + list_for_each_entry(p, &br->port_list, list) + if (!ret_mtu || ret_mtu > p->dev->mtu) ret_mtu = p->dev->mtu; - } - } return ret_mtu ? ret_mtu : ETH_DATA_LEN; } +void br_mtu_auto_adjust(struct net_bridge *br) +{ + ASSERT_RTNL(); + + /* if the bridge MTU was manually configured don't mess with it */ + if (br->mtu_set_by_user) + return; + + /* change to the minimum MTU and clear the flag which was set by + * the bridge ndo_change_mtu callback + */ + dev_set_mtu(br->dev, br_mtu_min(br)); + br->mtu_set_by_user = false; +} + static void br_set_gso_limits(struct net_bridge *br) { unsigned int gso_max_size = GSO_MAX_SIZE; @@ -597,7 +603,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); - dev_set_mtu(br->dev, br_mtu(br, false)); + br_mtu_auto_adjust(br); br_set_gso_limits(br); kobject_uevent(&p->kobj, KOBJ_ADD); @@ -644,7 +650,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) */ del_nbp(p); - dev_set_mtu(br->dev, br_mtu(br, false)); + br_mtu_auto_adjust(br); br_set_gso_limits(br); spin_lock_bh(&br->lock); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 586f84b9670d..a7cb3ece5031 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -410,6 +410,7 @@ struct net_bridge { int offload_fwd_mark; #endif bool neigh_suppress_enabled; + bool mtu_set_by_user; struct hlist_head fdb_list; }; @@ -578,7 +579,7 @@ int br_del_bridge(struct net *net, const char *name); int br_add_if(struct net_bridge *br, struct net_device *dev, struct netlink_ext_ack *extack); int br_del_if(struct net_bridge *br, struct net_device *dev); -int br_mtu(const struct net_bridge *br, bool max); +void br_mtu_auto_adjust(struct net_bridge *br); netdev_features_t br_features_recompute(struct net_bridge *br, netdev_features_t features); void br_port_flags_change(struct net_bridge_port *port, unsigned long mask); -- cgit v1.2.3-70-g09d2 From 218527fe27adaebeb81eb770459eb335517e90ee Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 29 Mar 2018 23:20:41 +0200 Subject: tipc: replace name table service range array with rb tree The current design of the binding table has an unnecessary memory consuming and complex data structure. It aggregates the service range items into an array, which is expanded by a factor two every time it becomes too small to hold a new item. Furthermore, the arrays never shrink when the number of ranges diminishes. We now replace this array with an RB tree that is holding the range items as tree nodes, each range directly holding a list of bindings. This, along with a few name changes, improves both readability and volume of the code, as well as reducing memory consumption and hopefully improving cache hit rate. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/core.h | 1 + net/tipc/link.c | 2 +- net/tipc/name_table.c | 1032 ++++++++++++++++++++++--------------------------- net/tipc/name_table.h | 2 +- net/tipc/node.c | 4 +- net/tipc/subscr.h | 4 +- 6 files changed, 477 insertions(+), 568 deletions(-) (limited to 'net') diff --git a/net/tipc/core.h b/net/tipc/core.h index d0f64ca62d02..8020a6c360ff 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -58,6 +58,7 @@ #include #include #include +#include struct tipc_node; struct tipc_bearer; diff --git a/net/tipc/link.c b/net/tipc/link.c index 1289b4ba404f..8f2a9496439b 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1810,7 +1810,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) { - int max_bulk = TIPC_MAX_PUBLICATIONS / (l->mtu / ITEM_SIZE); + int max_bulk = TIPC_MAX_PUBL / (l->mtu / ITEM_SIZE); l->window = win; l->backlog[TIPC_LOW_IMPORTANCE].limit = max_t(u16, 50, win); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 4359605b1bec..e06c7a8907aa 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -44,52 +44,40 @@ #include "addr.h" #include "node.h" #include "group.h" -#include - -#define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ /** - * struct name_info - name sequence publication info - * @node_list: list of publications on own node of this - * @all_publ: list of all publications of this + * struct service_range - container for all bindings of a service range + * @lower: service range lower bound + * @upper: service range upper bound + * @tree_node: member of service range RB tree + * @local_publ: list of identical publications made from this node + * Used by closest_first lookup and multicast lookup algorithm + * @all_publ: all publications identical to this one, whatever node and scope + * Used by round-robin lookup algorithm */ -struct name_info { - struct list_head local_publ; - struct list_head all_publ; -}; - -/** - * struct sub_seq - container for all published instances of a name sequence - * @lower: name sequence lower bound - * @upper: name sequence upper bound - * @info: pointer to name sequence publication info - */ -struct sub_seq { +struct service_range { u32 lower; u32 upper; - struct name_info *info; + struct rb_node tree_node; + struct list_head local_publ; + struct list_head all_publ; }; /** - * struct name_seq - container for all published instances of a name type - * @type: 32 bit 'type' value for name sequence - * @sseq: pointer to dynamically-sized array of sub-sequences of this 'type'; - * sub-sequences are sorted in ascending order - * @alloc: number of sub-sequences currently in array - * @first_free: array index of first unused sub-sequence entry - * @ns_list: links to adjacent name sequences in hash chain - * @subscriptions: list of subscriptions for this 'type' - * @lock: spinlock controlling access to publication lists of all sub-sequences + * struct tipc_service - container for all published instances of a service type + * @type: 32 bit 'type' value for service + * @ranges: rb tree containing all service ranges for this service + * @service_list: links to adjacent name ranges in hash chain + * @subscriptions: list of subscriptions for this service type + * @lock: spinlock controlling access to pertaining service ranges/publications * @rcu: RCU callback head used for deferred freeing */ -struct name_seq { +struct tipc_service { u32 type; - struct sub_seq *sseqs; - u32 alloc; - u32 first_free; - struct hlist_node ns_list; + struct rb_root ranges; + struct hlist_node service_list; struct list_head subscriptions; - spinlock_t lock; + spinlock_t lock; /* Covers service range list */ struct rcu_head rcu; }; @@ -99,17 +87,16 @@ static int hash(int x) } /** - * publ_create - create a publication structure + * tipc_publ_create - create a publication structure */ -static struct publication *publ_create(u32 type, u32 lower, u32 upper, - u32 scope, u32 node, u32 port, - u32 key) +static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper, + u32 scope, u32 node, u32 port, + u32 key) { struct publication *publ = kzalloc(sizeof(*publ), GFP_ATOMIC); - if (publ == NULL) { - pr_warn("Publication creation failure, no memory\n"); + + if (!publ) return NULL; - } publ->type = type; publ->lower = lower; @@ -119,372 +106,298 @@ static struct publication *publ_create(u32 type, u32 lower, u32 upper, publ->port = port; publ->key = key; INIT_LIST_HEAD(&publ->binding_sock); + INIT_LIST_HEAD(&publ->binding_node); + INIT_LIST_HEAD(&publ->local_publ); + INIT_LIST_HEAD(&publ->all_publ); return publ; } /** - * tipc_subseq_alloc - allocate a specified number of sub-sequence structures - */ -static struct sub_seq *tipc_subseq_alloc(u32 cnt) -{ - return kcalloc(cnt, sizeof(struct sub_seq), GFP_ATOMIC); -} - -/** - * tipc_nameseq_create - create a name sequence structure for the specified 'type' + * tipc_service_create - create a service structure for the specified 'type' * - * Allocates a single sub-sequence structure and sets it to all 0's. + * Allocates a single range structure and sets it to all 0's. */ -static struct name_seq *tipc_nameseq_create(u32 type, struct hlist_head *seq_head) +static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd) { - struct name_seq *nseq = kzalloc(sizeof(*nseq), GFP_ATOMIC); - struct sub_seq *sseq = tipc_subseq_alloc(1); + struct tipc_service *service = kzalloc(sizeof(*service), GFP_ATOMIC); - if (!nseq || !sseq) { - pr_warn("Name sequence creation failed, no memory\n"); - kfree(nseq); - kfree(sseq); + if (!service) { + pr_warn("Service creation failed, no memory\n"); return NULL; } - spin_lock_init(&nseq->lock); - nseq->type = type; - nseq->sseqs = sseq; - nseq->alloc = 1; - INIT_HLIST_NODE(&nseq->ns_list); - INIT_LIST_HEAD(&nseq->subscriptions); - hlist_add_head_rcu(&nseq->ns_list, seq_head); - return nseq; + spin_lock_init(&service->lock); + service->type = type; + service->ranges = RB_ROOT; + INIT_HLIST_NODE(&service->service_list); + INIT_LIST_HEAD(&service->subscriptions); + hlist_add_head_rcu(&service->service_list, hd); + return service; } /** - * nameseq_find_subseq - find sub-sequence (if any) matching a name instance + * tipc_service_find_range - find service range matching a service instance * - * Very time-critical, so binary searches through sub-sequence array. + * Very time-critical, so binary search through range rb tree */ -static struct sub_seq *nameseq_find_subseq(struct name_seq *nseq, - u32 instance) +static struct service_range *tipc_service_find_range(struct tipc_service *sc, + u32 instance) { - struct sub_seq *sseqs = nseq->sseqs; - int low = 0; - int high = nseq->first_free - 1; - int mid; - - while (low <= high) { - mid = (low + high) / 2; - if (instance < sseqs[mid].lower) - high = mid - 1; - else if (instance > sseqs[mid].upper) - low = mid + 1; + struct rb_node *n = sc->ranges.rb_node; + struct service_range *sr; + + while (n) { + sr = container_of(n, struct service_range, tree_node); + if (sr->lower > instance) + n = n->rb_left; + else if (sr->upper < instance) + n = n->rb_right; else - return &sseqs[mid]; + return sr; } return NULL; } -/** - * nameseq_locate_subseq - determine position of name instance in sub-sequence - * - * Returns index in sub-sequence array of the entry that contains the specified - * instance value; if no entry contains that value, returns the position - * where a new entry for it would be inserted in the array. - * - * Note: Similar to binary search code for locating a sub-sequence. - */ -static u32 nameseq_locate_subseq(struct name_seq *nseq, u32 instance) +static struct service_range *tipc_service_create_range(struct tipc_service *sc, + u32 lower, u32 upper) { - struct sub_seq *sseqs = nseq->sseqs; - int low = 0; - int high = nseq->first_free - 1; - int mid; - - while (low <= high) { - mid = (low + high) / 2; - if (instance < sseqs[mid].lower) - high = mid - 1; - else if (instance > sseqs[mid].upper) - low = mid + 1; + struct rb_node **n, *parent = NULL; + struct service_range *sr, *tmp; + + n = &sc->ranges.rb_node; + while (*n) { + tmp = container_of(*n, struct service_range, tree_node); + parent = *n; + tmp = container_of(parent, struct service_range, tree_node); + if (lower < tmp->lower) + n = &(*n)->rb_left; + else if (upper > tmp->upper) + n = &(*n)->rb_right; else - return mid; + return NULL; } - return low; + sr = kzalloc(sizeof(*sr), GFP_ATOMIC); + if (!sr) + return NULL; + sr->lower = lower; + sr->upper = upper; + INIT_LIST_HEAD(&sr->local_publ); + INIT_LIST_HEAD(&sr->all_publ); + rb_link_node(&sr->tree_node, parent, n); + rb_insert_color(&sr->tree_node, &sc->ranges); + return sr; } -/** - * tipc_nameseq_insert_publ - */ -static struct publication *tipc_nameseq_insert_publ(struct net *net, - struct name_seq *nseq, +static struct publication *tipc_service_insert_publ(struct net *net, + struct tipc_service *sc, u32 type, u32 lower, u32 upper, u32 scope, - u32 node, u32 port, u32 key) + u32 node, u32 port, + u32 key) { - struct tipc_subscription *s; - struct tipc_subscription *st; - struct publication *publ; - struct sub_seq *sseq; - struct name_info *info; - int created_subseq = 0; - - sseq = nameseq_find_subseq(nseq, lower); - if (sseq) { - - /* Lower end overlaps existing entry => need an exact match */ - if ((sseq->lower != lower) || (sseq->upper != upper)) { - return NULL; - } - - info = sseq->info; - - /* Check if an identical publication already exists */ - list_for_each_entry(publ, &info->all_publ, all_publ) { - if (publ->port == port && publ->key == key && - (!publ->node || publ->node == node)) - return NULL; - } - } else { - u32 inspos; - struct sub_seq *freesseq; - - /* Find where lower end should be inserted */ - inspos = nameseq_locate_subseq(nseq, lower); - - /* Fail if upper end overlaps into an existing entry */ - if ((inspos < nseq->first_free) && - (upper >= nseq->sseqs[inspos].lower)) { - return NULL; - } - - /* Ensure there is space for new sub-sequence */ - if (nseq->first_free == nseq->alloc) { - struct sub_seq *sseqs = tipc_subseq_alloc(nseq->alloc * 2); - - if (!sseqs) { - pr_warn("Cannot publish {%u,%u,%u}, no memory\n", - type, lower, upper); - return NULL; - } - memcpy(sseqs, nseq->sseqs, - nseq->alloc * sizeof(struct sub_seq)); - kfree(nseq->sseqs); - nseq->sseqs = sseqs; - nseq->alloc *= 2; - } - - info = kzalloc(sizeof(*info), GFP_ATOMIC); - if (!info) { - pr_warn("Cannot publish {%u,%u,%u}, no memory\n", - type, lower, upper); - return NULL; - } - - INIT_LIST_HEAD(&info->local_publ); - INIT_LIST_HEAD(&info->all_publ); - - /* Insert new sub-sequence */ - sseq = &nseq->sseqs[inspos]; - freesseq = &nseq->sseqs[nseq->first_free]; - memmove(sseq + 1, sseq, (freesseq - sseq) * sizeof(*sseq)); - memset(sseq, 0, sizeof(*sseq)); - nseq->first_free++; - sseq->lower = lower; - sseq->upper = upper; - sseq->info = info; - created_subseq = 1; + struct tipc_subscription *sub, *tmp; + struct service_range *sr; + struct publication *p; + bool first = false; + + sr = tipc_service_find_range(sc, lower); + if (!sr) { + sr = tipc_service_create_range(sc, lower, upper); + if (!sr) + goto err; + first = true; } - /* Insert a publication */ - publ = publ_create(type, lower, upper, scope, node, port, key); - if (!publ) + /* Lower end overlaps existing entry, but we need an exact match */ + if (sr->lower != lower || sr->upper != upper) return NULL; - list_add(&publ->all_publ, &info->all_publ); + /* Return if the publication already exists */ + list_for_each_entry(p, &sr->all_publ, all_publ) { + if (p->key == key && (!p->node || p->node == node)) + return NULL; + } + /* Create and insert publication */ + p = tipc_publ_create(type, lower, upper, scope, node, port, key); + if (!p) + goto err; if (in_own_node(net, node)) - list_add(&publ->local_publ, &info->local_publ); + list_add(&p->local_publ, &sr->local_publ); + list_add(&p->all_publ, &sr->all_publ); /* Any subscriptions waiting for notification? */ - list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { - tipc_sub_report_overlap(s, publ->lower, publ->upper, - TIPC_PUBLISHED, publ->port, - publ->node, publ->scope, - created_subseq); + list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { + tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_PUBLISHED, + p->port, p->node, p->scope, first); } - return publ; + return p; +err: + pr_warn("Failed to bind to %u,%u,%u, no memory\n", type, lower, upper); + return NULL; } /** - * tipc_nameseq_remove_publ + * tipc_service_remove_publ - remove a publication from a service * * NOTE: There may be cases where TIPC is asked to remove a publication * that is not in the name table. For example, if another node issues a - * publication for a name sequence that overlaps an existing name sequence + * publication for a name range that overlaps an existing name range * the publication will not be recorded, which means the publication won't - * be found when the name sequence is later withdrawn by that node. + * be found when the name range is later withdrawn by that node. * A failed withdraw request simply returns a failure indication and lets the * caller issue any error or warning messages associated with such a problem. */ -static struct publication *tipc_nameseq_remove_publ(struct net *net, - struct name_seq *nseq, +static struct publication *tipc_service_remove_publ(struct net *net, + struct tipc_service *sc, u32 inst, u32 node, u32 port, u32 key) { - struct publication *publ; - struct sub_seq *sseq = nameseq_find_subseq(nseq, inst); - struct name_info *info; - struct sub_seq *free; - struct tipc_subscription *s, *st; - int removed_subseq = 0; - - if (!sseq) - return NULL; + struct tipc_subscription *sub, *tmp; + struct service_range *sr; + struct publication *p; + bool found = false; + bool last = false; - info = sseq->info; + sr = tipc_service_find_range(sc, inst); + if (!sr) + return NULL; - /* Locate publication, if it exists */ - list_for_each_entry(publ, &info->all_publ, all_publ) { - if (publ->key == key && publ->port == port && - (!publ->node || publ->node == node)) - goto found; + /* Find publication, if it exists */ + list_for_each_entry(p, &sr->all_publ, all_publ) { + if (p->key != key || (node && node != p->node)) + continue; + found = true; + break; } - return NULL; + if (!found) + return NULL; -found: - list_del(&publ->all_publ); - if (in_own_node(net, node)) - list_del(&publ->local_publ); - - /* Contract subseq list if no more publications for that subseq */ - if (list_empty(&info->all_publ)) { - kfree(info); - free = &nseq->sseqs[nseq->first_free--]; - memmove(sseq, sseq + 1, (free - (sseq + 1)) * sizeof(*sseq)); - removed_subseq = 1; + list_del(&p->all_publ); + list_del(&p->local_publ); + + /* Remove service range item if this was its last publication */ + if (list_empty(&sr->all_publ)) { + last = true; + rb_erase(&sr->tree_node, &sc->ranges); + kfree(sr); } /* Notify any waiting subscriptions */ - list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) { - tipc_sub_report_overlap(s, publ->lower, publ->upper, - TIPC_WITHDRAWN, publ->port, - publ->node, publ->scope, - removed_subseq); + list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { + tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_WITHDRAWN, + p->port, p->node, p->scope, last); } - - return publ; + return p; } /** - * tipc_nameseq_subscribe - attach a subscription, and optionally - * issue the prescribed number of events if there is any sub- - * sequence overlapping with the requested sequence + * tipc_service_subscribe - attach a subscription, and optionally + * issue the prescribed number of events if there is any service + * range overlapping with the requested range */ -static void tipc_nameseq_subscribe(struct name_seq *nseq, +static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) { - struct sub_seq *sseq = nseq->sseqs; + struct tipc_subscr *sb = &sub->evt.s; + struct service_range *sr; struct tipc_name_seq ns; - struct tipc_subscr *s = &sub->evt.s; - bool no_status; + struct publication *p; + struct rb_node *n; + bool first; - ns.type = tipc_sub_read(s, seq.type); - ns.lower = tipc_sub_read(s, seq.lower); - ns.upper = tipc_sub_read(s, seq.upper); - no_status = tipc_sub_read(s, filter) & TIPC_SUB_NO_STATUS; + ns.type = tipc_sub_read(sb, seq.type); + ns.lower = tipc_sub_read(sb, seq.lower); + ns.upper = tipc_sub_read(sb, seq.upper); tipc_sub_get(sub); - list_add(&sub->nameseq_list, &nseq->subscriptions); + list_add(&sub->service_list, &service->subscriptions); - if (no_status || !sseq) + if (tipc_sub_read(sb, filter) & TIPC_SUB_NO_STATUS) return; - while (sseq != &nseq->sseqs[nseq->first_free]) { - if (tipc_sub_check_overlap(&ns, sseq->lower, sseq->upper)) { - struct publication *crs; - struct name_info *info = sseq->info; - int must_report = 1; - - list_for_each_entry(crs, &info->all_publ, all_publ) { - tipc_sub_report_overlap(sub, sseq->lower, - sseq->upper, - TIPC_PUBLISHED, - crs->port, - crs->node, - crs->scope, - must_report); - must_report = 0; - } + for (n = rb_first(&service->ranges); n; n = rb_next(n)) { + sr = container_of(n, struct service_range, tree_node); + if (sr->lower > ns.upper) + break; + if (!tipc_sub_check_overlap(&ns, sr->lower, sr->upper)) + continue; + first = true; + + list_for_each_entry(p, &sr->all_publ, all_publ) { + tipc_sub_report_overlap(sub, sr->lower, sr->upper, + TIPC_PUBLISHED, p->port, + p->node, p->scope, first); + first = false; } - sseq++; } } -static struct name_seq *nametbl_find_seq(struct net *net, u32 type) +static struct tipc_service *tipc_service_find(struct net *net, u32 type) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct hlist_head *seq_head; - struct name_seq *ns; - - seq_head = &tn->nametbl->seq_hlist[hash(type)]; - hlist_for_each_entry_rcu(ns, seq_head, ns_list) { - if (ns->type == type) - return ns; + struct name_table *nt = tipc_name_table(net); + struct hlist_head *service_head; + struct tipc_service *service; + + service_head = &nt->services[hash(type)]; + hlist_for_each_entry_rcu(service, service_head, service_list) { + if (service->type == type) + return service; } - return NULL; }; struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type, - u32 lower, u32 upper, u32 scope, - u32 node, u32 port, u32 key) + u32 lower, u32 upper, + u32 scope, u32 node, + u32 port, u32 key) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct publication *publ; - struct name_seq *seq = nametbl_find_seq(net, type); - int index = hash(type); + struct name_table *nt = tipc_name_table(net); + struct tipc_service *sc; + struct publication *p; if (scope > TIPC_NODE_SCOPE || lower > upper) { - pr_debug("Failed to publish illegal {%u,%u,%u} with scope %u\n", + pr_debug("Failed to bind illegal {%u,%u,%u} with scope %u\n", type, lower, upper, scope); return NULL; } - - if (!seq) - seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]); - if (!seq) + sc = tipc_service_find(net, type); + if (!sc) + sc = tipc_service_create(type, &nt->services[hash(type)]); + if (!sc) return NULL; - spin_lock_bh(&seq->lock); - publ = tipc_nameseq_insert_publ(net, seq, type, lower, upper, - scope, node, port, key); - spin_unlock_bh(&seq->lock); - return publ; + spin_lock_bh(&sc->lock); + p = tipc_service_insert_publ(net, sc, type, lower, upper, + scope, node, port, key); + spin_unlock_bh(&sc->lock); + return p; } struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, - u32 lower, u32 node, u32 port, - u32 key) + u32 lower, u32 node, u32 port, + u32 key) { - struct publication *publ; - struct name_seq *seq = nametbl_find_seq(net, type); + struct tipc_service *sc = tipc_service_find(net, type); + struct publication *p = NULL; - if (!seq) + if (!sc) return NULL; - spin_lock_bh(&seq->lock); - publ = tipc_nameseq_remove_publ(net, seq, lower, node, port, key); - if (!seq->first_free && list_empty(&seq->subscriptions)) { - hlist_del_init_rcu(&seq->ns_list); - kfree(seq->sseqs); - spin_unlock_bh(&seq->lock); - kfree_rcu(seq, rcu); - return publ; + spin_lock_bh(&sc->lock); + p = tipc_service_remove_publ(net, sc, lower, node, port, key); + + /* Delete service item if this no more publications and subscriptions */ + if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { + hlist_del_init_rcu(&sc->service_list); + kfree_rcu(sc, rcu); } - spin_unlock_bh(&seq->lock); - return publ; + spin_unlock_bh(&sc->lock); + return p; } /** - * tipc_nametbl_translate - perform name translation + * tipc_nametbl_translate - perform service instance to socket translation * * On entry, 'destnode' is the search domain used during translation. * @@ -492,7 +405,7 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, * - if name translation is deferred to another node/cluster/zone, * leaves 'destnode' unchanged (will be non-zero) and returns 0 * - if name translation is attempted and succeeds, sets 'destnode' - * to publishing node and returns port reference (will be non-zero) + * to publication node and returns port reference (will be non-zero) * - if name translation is attempted and fails, sets 'destnode' to 0 * and returns 0 */ @@ -502,10 +415,9 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, struct tipc_net *tn = tipc_net(net); bool legacy = tn->legacy_addr_format; u32 self = tipc_own_addr(net); - struct sub_seq *sseq; - struct name_info *info; - struct publication *publ; - struct name_seq *seq; + struct service_range *sr; + struct tipc_service *sc; + struct publication *p; u32 port = 0; u32 node = 0; @@ -513,49 +425,49 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, return 0; rcu_read_lock(); - seq = nametbl_find_seq(net, type); - if (unlikely(!seq)) + sc = tipc_service_find(net, type); + if (unlikely(!sc)) goto not_found; - spin_lock_bh(&seq->lock); - sseq = nameseq_find_subseq(seq, instance); - if (unlikely(!sseq)) + + spin_lock_bh(&sc->lock); + sr = tipc_service_find_range(sc, instance); + if (unlikely(!sr)) goto no_match; - info = sseq->info; /* Closest-First Algorithm */ if (legacy && !*destnode) { - if (!list_empty(&info->local_publ)) { - publ = list_first_entry(&info->local_publ, - struct publication, - local_publ); - list_move_tail(&publ->local_publ, - &info->local_publ); + if (!list_empty(&sr->local_publ)) { + p = list_first_entry(&sr->local_publ, + struct publication, + local_publ); + list_move_tail(&p->local_publ, + &sr->local_publ); } else { - publ = list_first_entry(&info->all_publ, - struct publication, - all_publ); - list_move_tail(&publ->all_publ, - &info->all_publ); + p = list_first_entry(&sr->all_publ, + struct publication, + all_publ); + list_move_tail(&p->all_publ, + &sr->all_publ); } } /* Round-Robin Algorithm */ - else if (*destnode == tipc_own_addr(net)) { - if (list_empty(&info->local_publ)) + else if (*destnode == self) { + if (list_empty(&sr->local_publ)) goto no_match; - publ = list_first_entry(&info->local_publ, struct publication, - local_publ); - list_move_tail(&publ->local_publ, &info->local_publ); + p = list_first_entry(&sr->local_publ, struct publication, + local_publ); + list_move_tail(&p->local_publ, &sr->local_publ); } else { - publ = list_first_entry(&info->all_publ, struct publication, - all_publ); - list_move_tail(&publ->all_publ, &info->all_publ); + p = list_first_entry(&sr->all_publ, struct publication, + all_publ); + list_move_tail(&p->all_publ, &sr->all_publ); } - port = publ->port; - node = publ->node; + port = p->port; + node = p->node; no_match: - spin_unlock_bh(&seq->lock); + spin_unlock_bh(&sc->lock); not_found: rcu_read_unlock(); *destnode = node; @@ -567,34 +479,36 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, bool all) { u32 self = tipc_own_addr(net); - struct publication *publ; - struct name_info *info; - struct name_seq *seq; - struct sub_seq *sseq; + struct service_range *sr; + struct tipc_service *sc; + struct publication *p; *dstcnt = 0; rcu_read_lock(); - seq = nametbl_find_seq(net, type); - if (unlikely(!seq)) + sc = tipc_service_find(net, type); + if (unlikely(!sc)) goto exit; - spin_lock_bh(&seq->lock); - sseq = nameseq_find_subseq(seq, instance); - if (likely(sseq)) { - info = sseq->info; - list_for_each_entry(publ, &info->all_publ, all_publ) { - if (publ->scope != scope) - continue; - if (publ->port == exclude && publ->node == self) - continue; - tipc_dest_push(dsts, publ->node, publ->port); - (*dstcnt)++; - if (all) - continue; - list_move_tail(&publ->all_publ, &info->all_publ); - break; - } + + spin_lock_bh(&sc->lock); + + sr = tipc_service_find_range(sc, instance); + if (!sr) + goto no_match; + + list_for_each_entry(p, &sr->all_publ, all_publ) { + if (p->scope != scope) + continue; + if (p->port == exclude && p->node == self) + continue; + tipc_dest_push(dsts, p->node, p->port); + (*dstcnt)++; + if (all) + continue; + list_move_tail(&p->all_publ, &sr->all_publ); + break; } - spin_unlock_bh(&seq->lock); +no_match: + spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); return !list_empty(dsts); @@ -603,61 +517,64 @@ exit: void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, u32 scope, bool exact, struct list_head *dports) { - struct sub_seq *sseq_stop; - struct name_info *info; + struct service_range *sr; + struct tipc_service *sc; struct publication *p; - struct name_seq *seq; - struct sub_seq *sseq; + struct rb_node *n; rcu_read_lock(); - seq = nametbl_find_seq(net, type); - if (!seq) + sc = tipc_service_find(net, type); + if (!sc) goto exit; - spin_lock_bh(&seq->lock); - sseq = seq->sseqs + nameseq_locate_subseq(seq, lower); - sseq_stop = seq->sseqs + seq->first_free; - for (; sseq != sseq_stop; sseq++) { - if (sseq->lower > upper) + spin_lock_bh(&sc->lock); + + for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { + sr = container_of(n, struct service_range, tree_node); + if (sr->upper < lower) + continue; + if (sr->lower > upper) break; - info = sseq->info; - list_for_each_entry(p, &info->local_publ, local_publ) { + list_for_each_entry(p, &sr->local_publ, local_publ) { if (p->scope == scope || (!exact && p->scope < scope)) tipc_dest_push(dports, 0, p->port); } } - spin_unlock_bh(&seq->lock); + spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_lookup_dst_nodes - find broadcast destination nodes * - Creates list of nodes that overlap the given multicast address - * - Determines if any node local ports overlap + * - Determines if any node local destinations overlap */ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, u32 upper, struct tipc_nlist *nodes) { - struct sub_seq *sseq, *stop; - struct publication *publ; - struct name_info *info; - struct name_seq *seq; + struct service_range *sr; + struct tipc_service *sc; + struct publication *p; + struct rb_node *n; rcu_read_lock(); - seq = nametbl_find_seq(net, type); - if (!seq) + sc = tipc_service_find(net, type); + if (!sc) goto exit; - spin_lock_bh(&seq->lock); - sseq = seq->sseqs + nameseq_locate_subseq(seq, lower); - stop = seq->sseqs + seq->first_free; - for (; sseq != stop && sseq->lower <= upper; sseq++) { - info = sseq->info; - list_for_each_entry(publ, &info->all_publ, all_publ) { - tipc_nlist_add(nodes, publ->node); + spin_lock_bh(&sc->lock); + + for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { + sr = container_of(n, struct service_range, tree_node); + if (sr->upper < lower) + continue; + if (sr->lower > upper) + break; + list_for_each_entry(p, &sr->all_publ, all_publ) { + tipc_nlist_add(nodes, p->node); } } - spin_unlock_bh(&seq->lock); + spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } @@ -667,89 +584,88 @@ exit: void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, u32 type, u32 scope) { - struct sub_seq *sseq, *stop; - struct name_info *info; + struct service_range *sr; + struct tipc_service *sc; struct publication *p; - struct name_seq *seq; + struct rb_node *n; rcu_read_lock(); - seq = nametbl_find_seq(net, type); - if (!seq) + sc = tipc_service_find(net, type); + if (!sc) goto exit; - spin_lock_bh(&seq->lock); - sseq = seq->sseqs; - stop = seq->sseqs + seq->first_free; - for (; sseq != stop; sseq++) { - info = sseq->info; - list_for_each_entry(p, &info->all_publ, all_publ) { + spin_lock_bh(&sc->lock); + for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { + sr = container_of(n, struct service_range, tree_node); + list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->scope != scope) continue; tipc_group_add_member(grp, p->node, p->port, p->lower); } } - spin_unlock_bh(&seq->lock); + spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } -/* - * tipc_nametbl_publish - add name publication to network name tables +/* tipc_nametbl_publish - add service binding to name table */ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, - u32 upper, u32 scope, u32 port_ref, + u32 upper, u32 scope, u32 port, u32 key) { - struct publication *publ; - struct sk_buff *buf = NULL; - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct name_table *nt = tipc_name_table(net); + struct tipc_net *tn = tipc_net(net); + struct publication *p = NULL; + struct sk_buff *skb = NULL; spin_lock_bh(&tn->nametbl_lock); - if (tn->nametbl->local_publ_count >= TIPC_MAX_PUBLICATIONS) { - pr_warn("Publication failed, local publication limit reached (%u)\n", - TIPC_MAX_PUBLICATIONS); - spin_unlock_bh(&tn->nametbl_lock); - return NULL; + + if (nt->local_publ_count >= TIPC_MAX_PUBL) { + pr_warn("Bind failed, max limit %u reached\n", TIPC_MAX_PUBL); + goto exit; } - publ = tipc_nametbl_insert_publ(net, type, lower, upper, scope, - tipc_own_addr(net), port_ref, key); - if (likely(publ)) { - tn->nametbl->local_publ_count++; - buf = tipc_named_publish(net, publ); + p = tipc_nametbl_insert_publ(net, type, lower, upper, scope, + tipc_own_addr(net), port, key); + if (p) { + nt->local_publ_count++; + skb = tipc_named_publish(net, p); /* Any pending external events? */ tipc_named_process_backlog(net); } +exit: spin_unlock_bh(&tn->nametbl_lock); - if (buf) - tipc_node_broadcast(net, buf); - return publ; + if (skb) + tipc_node_broadcast(net, skb); + return p; } /** - * tipc_nametbl_withdraw - withdraw name publication from network name tables + * tipc_nametbl_withdraw - withdraw a service binding */ -int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 port, - u32 key) +int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, + u32 port, u32 key) { - struct publication *publ; + struct name_table *nt = tipc_name_table(net); + struct tipc_net *tn = tipc_net(net); + u32 self = tipc_own_addr(net); struct sk_buff *skb = NULL; - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct publication *p; spin_lock_bh(&tn->nametbl_lock); - publ = tipc_nametbl_remove_publ(net, type, lower, tipc_own_addr(net), - port, key); - if (likely(publ)) { - tn->nametbl->local_publ_count--; - skb = tipc_named_withdraw(net, publ); + + p = tipc_nametbl_remove_publ(net, type, lower, self, port, key); + if (p) { + nt->local_publ_count--; + skb = tipc_named_withdraw(net, p); /* Any pending external events? */ tipc_named_process_backlog(net); - list_del_init(&publ->binding_sock); - kfree_rcu(publ, rcu); + list_del_init(&p->binding_sock); + kfree_rcu(p, rcu); } else { - pr_err("Unable to remove local publication\n" - "(type=%u, lower=%u, port=%u, key=%u)\n", + pr_err("Failed to remove local publication {%u,%u,%u}/%u\n", type, lower, port, key); } spin_unlock_bh(&tn->nametbl_lock); @@ -766,27 +682,24 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 port, */ void tipc_nametbl_subscribe(struct tipc_subscription *sub) { + struct name_table *nt = tipc_name_table(sub->net); struct tipc_net *tn = tipc_net(sub->net); struct tipc_subscr *s = &sub->evt.s; u32 type = tipc_sub_read(s, seq.type); - int index = hash(type); - struct name_seq *seq; - struct tipc_name_seq ns; + struct tipc_service *sc; spin_lock_bh(&tn->nametbl_lock); - seq = nametbl_find_seq(sub->net, type); - if (!seq) - seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]); - if (seq) { - spin_lock_bh(&seq->lock); - tipc_nameseq_subscribe(seq, sub); - spin_unlock_bh(&seq->lock); + sc = tipc_service_find(sub->net, type); + if (!sc) + sc = tipc_service_create(type, &nt->services[hash(type)]); + if (sc) { + spin_lock_bh(&sc->lock); + tipc_service_subscribe(sc, sub); + spin_unlock_bh(&sc->lock); } else { - ns.type = tipc_sub_read(s, seq.type); - ns.lower = tipc_sub_read(s, seq.lower); - ns.upper = tipc_sub_read(s, seq.upper); - pr_warn("Failed to create subscription for {%u,%u,%u}\n", - ns.type, ns.lower, ns.upper); + pr_warn("Failed to subscribe for {%u,%u,%u}\n", type, + tipc_sub_read(s, seq.lower), + tipc_sub_read(s, seq.upper)); } spin_unlock_bh(&tn->nametbl_lock); } @@ -796,124 +709,122 @@ void tipc_nametbl_subscribe(struct tipc_subscription *sub) */ void tipc_nametbl_unsubscribe(struct tipc_subscription *sub) { - struct tipc_subscr *s = &sub->evt.s; struct tipc_net *tn = tipc_net(sub->net); - struct name_seq *seq; + struct tipc_subscr *s = &sub->evt.s; u32 type = tipc_sub_read(s, seq.type); + struct tipc_service *sc; spin_lock_bh(&tn->nametbl_lock); - seq = nametbl_find_seq(sub->net, type); - if (seq != NULL) { - spin_lock_bh(&seq->lock); - list_del_init(&sub->nameseq_list); - tipc_sub_put(sub); - if (!seq->first_free && list_empty(&seq->subscriptions)) { - hlist_del_init_rcu(&seq->ns_list); - kfree(seq->sseqs); - spin_unlock_bh(&seq->lock); - kfree_rcu(seq, rcu); - } else { - spin_unlock_bh(&seq->lock); - } + sc = tipc_service_find(sub->net, type); + if (!sc) + goto exit; + + spin_lock_bh(&sc->lock); + list_del_init(&sub->service_list); + tipc_sub_put(sub); + + /* Delete service item if no more publications and subscriptions */ + if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { + hlist_del_init_rcu(&sc->service_list); + kfree_rcu(sc, rcu); } + spin_unlock_bh(&sc->lock); +exit: spin_unlock_bh(&tn->nametbl_lock); } int tipc_nametbl_init(struct net *net) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct name_table *tipc_nametbl; + struct tipc_net *tn = tipc_net(net); + struct name_table *nt; int i; - tipc_nametbl = kzalloc(sizeof(*tipc_nametbl), GFP_ATOMIC); - if (!tipc_nametbl) + nt = kzalloc(sizeof(*nt), GFP_ATOMIC); + if (!nt) return -ENOMEM; for (i = 0; i < TIPC_NAMETBL_SIZE; i++) - INIT_HLIST_HEAD(&tipc_nametbl->seq_hlist[i]); + INIT_HLIST_HEAD(&nt->services[i]); - INIT_LIST_HEAD(&tipc_nametbl->node_scope); - INIT_LIST_HEAD(&tipc_nametbl->cluster_scope); - tn->nametbl = tipc_nametbl; + INIT_LIST_HEAD(&nt->node_scope); + INIT_LIST_HEAD(&nt->cluster_scope); + tn->nametbl = nt; spin_lock_init(&tn->nametbl_lock); return 0; } /** - * tipc_purge_publications - remove all publications for a given type - * - * tipc_nametbl_lock must be held when calling this function + * tipc_service_delete - purge all publications for a service and delete it */ -static void tipc_purge_publications(struct net *net, struct name_seq *seq) +static void tipc_service_delete(struct net *net, struct tipc_service *sc) { - struct publication *publ, *safe; - struct sub_seq *sseq; - struct name_info *info; - - spin_lock_bh(&seq->lock); - sseq = seq->sseqs; - info = sseq->info; - list_for_each_entry_safe(publ, safe, &info->all_publ, all_publ) { - tipc_nameseq_remove_publ(net, seq, publ->lower, publ->node, - publ->port, publ->key); - kfree_rcu(publ, rcu); + struct service_range *sr, *tmpr; + struct publication *p, *tmpb; + + spin_lock_bh(&sc->lock); + rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) { + list_for_each_entry_safe(p, tmpb, + &sr->all_publ, all_publ) { + tipc_service_remove_publ(net, sc, p->lower, p->node, + p->port, p->key); + kfree_rcu(p, rcu); + } } - hlist_del_init_rcu(&seq->ns_list); - kfree(seq->sseqs); - spin_unlock_bh(&seq->lock); - - kfree_rcu(seq, rcu); + hlist_del_init_rcu(&sc->service_list); + spin_unlock_bh(&sc->lock); + kfree_rcu(sc, rcu); } void tipc_nametbl_stop(struct net *net) { + struct name_table *nt = tipc_name_table(net); + struct tipc_net *tn = tipc_net(net); + struct hlist_head *service_head; + struct tipc_service *service; u32 i; - struct name_seq *seq; - struct hlist_head *seq_head; - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct name_table *tipc_nametbl = tn->nametbl; /* Verify name table is empty and purge any lingering * publications, then release the name table */ spin_lock_bh(&tn->nametbl_lock); for (i = 0; i < TIPC_NAMETBL_SIZE; i++) { - if (hlist_empty(&tipc_nametbl->seq_hlist[i])) + if (hlist_empty(&nt->services[i])) continue; - seq_head = &tipc_nametbl->seq_hlist[i]; - hlist_for_each_entry_rcu(seq, seq_head, ns_list) { - tipc_purge_publications(net, seq); + service_head = &nt->services[i]; + hlist_for_each_entry_rcu(service, service_head, service_list) { + tipc_service_delete(net, service); } } spin_unlock_bh(&tn->nametbl_lock); synchronize_net(); - kfree(tipc_nametbl); - + kfree(nt); } static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, - struct name_seq *seq, - struct sub_seq *sseq, u32 *last_publ) + struct tipc_service *service, + struct service_range *sr, + u32 *last_key) { - void *hdr; - struct nlattr *attrs; - struct nlattr *publ; struct publication *p; + struct nlattr *attrs; + struct nlattr *b; + void *hdr; - if (*last_publ) { - list_for_each_entry(p, &sseq->info->all_publ, all_publ) - if (p->key == *last_publ) + if (*last_key) { + list_for_each_entry(p, &sr->all_publ, all_publ) + if (p->key == *last_key) break; - if (p->key != *last_publ) + if (p->key != *last_key) return -EPIPE; } else { - p = list_first_entry(&sseq->info->all_publ, struct publication, + p = list_first_entry(&sr->all_publ, + struct publication, all_publ); } - list_for_each_entry_from(p, &sseq->info->all_publ, all_publ) { - *last_publ = p->key; + list_for_each_entry_from(p, &sr->all_publ, all_publ) { + *last_key = p->key; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, @@ -925,15 +836,15 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, if (!attrs) goto msg_full; - publ = nla_nest_start(msg->skb, TIPC_NLA_NAME_TABLE_PUBL); - if (!publ) + b = nla_nest_start(msg->skb, TIPC_NLA_NAME_TABLE_PUBL); + if (!b) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, seq->type)) + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, service->type)) goto publ_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sseq->lower)) + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sr->lower)) goto publ_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sseq->upper)) + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sr->upper)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_SCOPE, p->scope)) goto publ_msg_full; @@ -944,16 +855,16 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key)) goto publ_msg_full; - nla_nest_end(msg->skb, publ); + nla_nest_end(msg->skb, b); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); } - *last_publ = 0; + *last_key = 0; return 0; publ_msg_full: - nla_nest_cancel(msg->skb, publ); + nla_nest_cancel(msg->skb, b); attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: @@ -962,39 +873,34 @@ msg_full: return -EMSGSIZE; } -static int __tipc_nl_subseq_list(struct tipc_nl_msg *msg, struct name_seq *seq, - u32 *last_lower, u32 *last_publ) +static int __tipc_nl_service_range_list(struct tipc_nl_msg *msg, + struct tipc_service *sc, + u32 *last_lower, u32 *last_key) { - struct sub_seq *sseq; - struct sub_seq *sseq_start; + struct service_range *sr; + struct rb_node *n; int err; - if (*last_lower) { - sseq_start = nameseq_find_subseq(seq, *last_lower); - if (!sseq_start) - return -EPIPE; - } else { - sseq_start = seq->sseqs; - } - - for (sseq = sseq_start; sseq != &seq->sseqs[seq->first_free]; sseq++) { - err = __tipc_nl_add_nametable_publ(msg, seq, sseq, last_publ); + for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { + sr = container_of(n, struct service_range, tree_node); + if (sr->lower < *last_lower) + continue; + err = __tipc_nl_add_nametable_publ(msg, sc, sr, last_key); if (err) { - *last_lower = sseq->lower; + *last_lower = sr->lower; return err; } } *last_lower = 0; - return 0; } -static int tipc_nl_seq_list(struct net *net, struct tipc_nl_msg *msg, - u32 *last_type, u32 *last_lower, u32 *last_publ) +static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg, + u32 *last_type, u32 *last_lower, u32 *last_key) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct hlist_head *seq_head; - struct name_seq *seq = NULL; + struct tipc_net *tn = tipc_net(net); + struct tipc_service *service = NULL; + struct hlist_head *head; int err; int i; @@ -1004,30 +910,31 @@ static int tipc_nl_seq_list(struct net *net, struct tipc_nl_msg *msg, i = 0; for (; i < TIPC_NAMETBL_SIZE; i++) { - seq_head = &tn->nametbl->seq_hlist[i]; + head = &tn->nametbl->services[i]; if (*last_type) { - seq = nametbl_find_seq(net, *last_type); - if (!seq) + service = tipc_service_find(net, *last_type); + if (!service) return -EPIPE; } else { - hlist_for_each_entry_rcu(seq, seq_head, ns_list) + hlist_for_each_entry_rcu(service, head, service_list) break; - if (!seq) + if (!service) continue; } - hlist_for_each_entry_from_rcu(seq, ns_list) { - spin_lock_bh(&seq->lock); - err = __tipc_nl_subseq_list(msg, seq, last_lower, - last_publ); + hlist_for_each_entry_from_rcu(service, service_list) { + spin_lock_bh(&service->lock); + err = __tipc_nl_service_range_list(msg, service, + last_lower, + last_key); if (err) { - *last_type = seq->type; - spin_unlock_bh(&seq->lock); + *last_type = service->type; + spin_unlock_bh(&service->lock); return err; } - spin_unlock_bh(&seq->lock); + spin_unlock_bh(&service->lock); } *last_type = 0; } @@ -1036,13 +943,13 @@ static int tipc_nl_seq_list(struct net *net, struct tipc_nl_msg *msg, int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) { - int err; - int done = cb->args[3]; + struct net *net = sock_net(skb->sk); u32 last_type = cb->args[0]; u32 last_lower = cb->args[1]; - u32 last_publ = cb->args[2]; - struct net *net = sock_net(skb->sk); + u32 last_key = cb->args[2]; + int done = cb->args[3]; struct tipc_nl_msg msg; + int err; if (done) return 0; @@ -1052,7 +959,8 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); - err = tipc_nl_seq_list(net, &msg, &last_type, &last_lower, &last_publ); + err = tipc_nl_service_list(net, &msg, &last_type, + &last_lower, &last_key); if (!err) { done = 1; } else if (err != -EMSGSIZE) { @@ -1068,7 +976,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[0] = last_type; cb->args[1] = last_lower; - cb->args[2] = last_publ; + cb->args[2] = last_key; cb->args[3] = done; return skb->len; diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 34a4ccb907aa..1b03b8751707 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -97,7 +97,7 @@ struct publication { * @local_publ_count: number of publications issued by this node */ struct name_table { - struct hlist_head seq_hlist[TIPC_NAMETBL_SIZE]; + struct hlist_head services[TIPC_NAMETBL_SIZE]; struct list_head node_scope; struct list_head cluster_scope; u32 local_publ_count; diff --git a/net/tipc/node.c b/net/tipc/node.c index 4fb4327311bb..85e777e00ec9 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -324,12 +324,12 @@ static void tipc_node_write_unlock(struct tipc_node *n) if (flags & TIPC_NOTIFY_LINK_UP) { tipc_mon_peer_up(net, addr, bearer_id); tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr, - TIPC_NODE_SCOPE, link_id, addr); + TIPC_NODE_SCOPE, link_id, link_id); } if (flags & TIPC_NOTIFY_LINK_DOWN) { tipc_mon_peer_down(net, addr, bearer_id); tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr, - link_id, addr); + link_id, link_id); } } diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index 8b2d22b18f22..d793b4343885 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -40,7 +40,7 @@ #include "topsrv.h" #define TIPC_MAX_SUBSCR 65535 -#define TIPC_MAX_PUBLICATIONS 65535 +#define TIPC_MAX_PUBL 65535 struct tipc_subscription; struct tipc_conn; @@ -58,7 +58,7 @@ struct tipc_subscription { struct kref kref; struct net *net; struct timer_list timer; - struct list_head nameseq_list; + struct list_head service_list; struct list_head sub_list; struct tipc_event evt; int conid; -- cgit v1.2.3-70-g09d2 From f20889f72bd531cad88fbb571755a52cabf43424 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 29 Mar 2018 23:20:42 +0200 Subject: tipc: refactor name table translate function The function tipc_nametbl_translate() function is ugly and hard to follow. This can be improved somewhat by introducing a stack variable for holding the publication list to be used and re-ordering the if- clauses for selection of algorithm. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/name_table.c | 61 +++++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index e06c7a8907aa..4bdc580c533b 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -399,29 +399,32 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, /** * tipc_nametbl_translate - perform service instance to socket translation * - * On entry, 'destnode' is the search domain used during translation. + * On entry, 'dnode' is the search domain used during translation. * * On exit: - * - if name translation is deferred to another node/cluster/zone, - * leaves 'destnode' unchanged (will be non-zero) and returns 0 - * - if name translation is attempted and succeeds, sets 'destnode' - * to publication node and returns port reference (will be non-zero) - * - if name translation is attempted and fails, sets 'destnode' to 0 - * and returns 0 + * - if translation is deferred to another node, leave 'dnode' unchanged and + * return 0 + * - if translation is attempted and succeeds, set 'dnode' to the publishing + * node and return the published (non-zero) port number + * - if translation is attempted and fails, set 'dnode' to 0 and return 0 + * + * Note that for legacy users (node configured with Z.C.N address format) the + * 'closest-first' lookup algorithm must be maintained, i.e., if dnode is 0 + * we must look in the local binding list first */ -u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, - u32 *destnode) +u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode) { struct tipc_net *tn = tipc_net(net); bool legacy = tn->legacy_addr_format; u32 self = tipc_own_addr(net); struct service_range *sr; struct tipc_service *sc; + struct list_head *list; struct publication *p; u32 port = 0; u32 node = 0; - if (!tipc_in_scope(legacy, *destnode, self)) + if (!tipc_in_scope(legacy, *dnode, self)) return 0; rcu_read_lock(); @@ -434,43 +437,29 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, if (unlikely(!sr)) goto no_match; - /* Closest-First Algorithm */ - if (legacy && !*destnode) { - if (!list_empty(&sr->local_publ)) { - p = list_first_entry(&sr->local_publ, - struct publication, - local_publ); - list_move_tail(&p->local_publ, - &sr->local_publ); - } else { - p = list_first_entry(&sr->all_publ, - struct publication, - all_publ); - list_move_tail(&p->all_publ, - &sr->all_publ); - } - } - - /* Round-Robin Algorithm */ - else if (*destnode == self) { - if (list_empty(&sr->local_publ)) + /* Select lookup algorithm: local, closest-first or round-robin */ + if (*dnode == self) { + list = &sr->local_publ; + if (list_empty(list)) goto no_match; - p = list_first_entry(&sr->local_publ, struct publication, - local_publ); + p = list_first_entry(list, struct publication, local_publ); + list_move_tail(&p->local_publ, &sr->local_publ); + } else if (legacy && !*dnode && !list_empty(&sr->local_publ)) { + list = &sr->local_publ; + p = list_first_entry(list, struct publication, local_publ); list_move_tail(&p->local_publ, &sr->local_publ); } else { - p = list_first_entry(&sr->all_publ, struct publication, - all_publ); + list = &sr->all_publ; + p = list_first_entry(list, struct publication, all_publ); list_move_tail(&p->all_publ, &sr->all_publ); } - port = p->port; node = p->node; no_match: spin_unlock_bh(&sc->lock); not_found: rcu_read_unlock(); - *destnode = node; + *dnode = node; return port; } -- cgit v1.2.3-70-g09d2 From 37922ea4a3105176357c8d565a9d982c4a08714a Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 29 Mar 2018 23:20:43 +0200 Subject: tipc: permit overlapping service ranges in name table With the new RB tree structure for service ranges it becomes possible to solve an old problem; - we can now allow overlapping service ranges in the table. When inserting a new service range to the tree, we use 'lower' as primary key, and when necessary 'upper' as secondary key. Since there may now be multiple service ranges matching an indicated 'lower' value, we must also add the 'upper' value to the functions used for removing publications, so that the correct, corresponding range item can be found. These changes guarantee that a well-formed publication/withdrawal item from a peer node never will be rejected, and make it possible to eliminate the problematic backlog functionality we currently have for handling such cases. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/name_distr.c | 90 +++++++++++++-------------------------------------- net/tipc/name_distr.h | 1 - net/tipc/name_table.c | 64 +++++++++++++++++------------------- net/tipc/name_table.h | 8 ++--- net/tipc/net.c | 2 +- net/tipc/node.c | 2 +- net/tipc/socket.c | 4 +-- 7 files changed, 60 insertions(+), 111 deletions(-) (limited to 'net') diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 8240a85b0d0c..51b4b96f89db 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -204,12 +204,12 @@ void tipc_named_node_up(struct net *net, u32 dnode) */ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr) { - struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_net *tn = tipc_net(net); struct publication *p; spin_lock_bh(&tn->nametbl_lock); - p = tipc_nametbl_remove_publ(net, publ->type, publ->lower, - publ->node, publ->port, publ->key); + p = tipc_nametbl_remove_publ(net, publ->type, publ->lower, publ->upper, + publ->node, publ->key); if (p) tipc_node_unsubscribe(net, &p->binding_node, addr); spin_unlock_bh(&tn->nametbl_lock); @@ -261,81 +261,37 @@ void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr) static bool tipc_update_nametbl(struct net *net, struct distr_item *i, u32 node, u32 dtype) { - struct publication *publ = NULL; + struct publication *p = NULL; + u32 lower = ntohl(i->lower); + u32 upper = ntohl(i->upper); + u32 type = ntohl(i->type); + u32 port = ntohl(i->port); + u32 key = ntohl(i->key); if (dtype == PUBLICATION) { - publ = tipc_nametbl_insert_publ(net, ntohl(i->type), - ntohl(i->lower), - ntohl(i->upper), - TIPC_CLUSTER_SCOPE, node, - ntohl(i->port), ntohl(i->key)); - if (publ) { - tipc_node_subscribe(net, &publ->binding_node, node); + p = tipc_nametbl_insert_publ(net, type, lower, upper, + TIPC_CLUSTER_SCOPE, node, + port, key); + if (p) { + tipc_node_subscribe(net, &p->binding_node, node); return true; } } else if (dtype == WITHDRAWAL) { - publ = tipc_nametbl_remove_publ(net, ntohl(i->type), - ntohl(i->lower), - node, ntohl(i->port), - ntohl(i->key)); - if (publ) { - tipc_node_unsubscribe(net, &publ->binding_node, node); - kfree_rcu(publ, rcu); + p = tipc_nametbl_remove_publ(net, type, lower, + upper, node, key); + if (p) { + tipc_node_unsubscribe(net, &p->binding_node, node); + kfree_rcu(p, rcu); return true; } + pr_warn_ratelimited("Failed to remove binding %u,%u from %x\n", + type, lower, node); } else { pr_warn("Unrecognized name table message received\n"); } return false; } -/** - * tipc_named_add_backlog - add a failed name table update to the backlog - * - */ -static void tipc_named_add_backlog(struct net *net, struct distr_item *i, - u32 type, u32 node) -{ - struct distr_queue_item *e; - struct tipc_net *tn = net_generic(net, tipc_net_id); - unsigned long now = get_jiffies_64(); - - e = kzalloc(sizeof(*e), GFP_ATOMIC); - if (!e) - return; - e->dtype = type; - e->node = node; - e->expires = now + msecs_to_jiffies(sysctl_tipc_named_timeout); - memcpy(e, i, sizeof(*i)); - list_add_tail(&e->next, &tn->dist_queue); -} - -/** - * tipc_named_process_backlog - try to process any pending name table updates - * from the network. - */ -void tipc_named_process_backlog(struct net *net) -{ - struct distr_queue_item *e, *tmp; - struct tipc_net *tn = net_generic(net, tipc_net_id); - unsigned long now = get_jiffies_64(); - - list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) { - if (time_after(e->expires, now)) { - if (!tipc_update_nametbl(net, &e->i, e->node, e->dtype)) - continue; - } else { - pr_warn_ratelimited("Dropping name table update (%d) of {%u, %u, %u} from %x key=%u\n", - e->dtype, ntohl(e->i.type), - ntohl(e->i.lower), - ntohl(e->i.upper), - e->node, ntohl(e->i.key)); - } - list_del(&e->next); - kfree(e); - } -} - /** * tipc_named_rcv - process name table update messages sent by another node */ @@ -358,12 +314,10 @@ void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq) count = msg_data_sz(msg) / ITEM_SIZE; node = msg_orignode(msg); while (count--) { - if (!tipc_update_nametbl(net, item, node, mtype)) - tipc_named_add_backlog(net, item, mtype, node); + tipc_update_nametbl(net, item, node, mtype); item++; } kfree_skb(skb); - tipc_named_process_backlog(net); } spin_unlock_bh(&tn->nametbl_lock); } diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h index 4753e628d7c4..63fc73e0fa6c 100644 --- a/net/tipc/name_distr.h +++ b/net/tipc/name_distr.h @@ -72,7 +72,6 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ); void tipc_named_node_up(struct net *net, u32 dnode); void tipc_named_rcv(struct net *net, struct sk_buff_head *msg_queue); void tipc_named_reinit(struct net *net); -void tipc_named_process_backlog(struct net *net); void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr); #endif diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 4bdc580c533b..b1fe20972aa9 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -171,10 +171,14 @@ static struct service_range *tipc_service_create_range(struct tipc_service *sc, tmp = container_of(parent, struct service_range, tree_node); if (lower < tmp->lower) n = &(*n)->rb_left; + else if (lower > tmp->lower) + n = &(*n)->rb_right; + else if (upper < tmp->upper) + n = &(*n)->rb_left; else if (upper > tmp->upper) n = &(*n)->rb_right; else - return NULL; + return tmp; } sr = kzalloc(sizeof(*sr), GFP_ATOMIC); if (!sr) @@ -200,17 +204,11 @@ static struct publication *tipc_service_insert_publ(struct net *net, struct publication *p; bool first = false; - sr = tipc_service_find_range(sc, lower); - if (!sr) { - sr = tipc_service_create_range(sc, lower, upper); - if (!sr) - goto err; - first = true; - } + sr = tipc_service_create_range(sc, lower, upper); + if (!sr) + goto err; - /* Lower end overlaps existing entry, but we need an exact match */ - if (sr->lower != lower || sr->upper != upper) - return NULL; + first = list_empty(&sr->all_publ); /* Return if the publication already exists */ list_for_each_entry(p, &sr->all_publ, all_publ) { @@ -239,30 +237,32 @@ err: /** * tipc_service_remove_publ - remove a publication from a service - * - * NOTE: There may be cases where TIPC is asked to remove a publication - * that is not in the name table. For example, if another node issues a - * publication for a name range that overlaps an existing name range - * the publication will not be recorded, which means the publication won't - * be found when the name range is later withdrawn by that node. - * A failed withdraw request simply returns a failure indication and lets the - * caller issue any error or warning messages associated with such a problem. */ static struct publication *tipc_service_remove_publ(struct net *net, struct tipc_service *sc, - u32 inst, u32 node, - u32 port, u32 key) + u32 lower, u32 upper, + u32 node, u32 key) { struct tipc_subscription *sub, *tmp; struct service_range *sr; struct publication *p; bool found = false; bool last = false; + struct rb_node *n; - sr = tipc_service_find_range(sc, inst); + sr = tipc_service_find_range(sc, lower); if (!sr) return NULL; + /* Find exact matching service range */ + for (n = &sr->tree_node; n; n = rb_next(n)) { + sr = container_of(n, struct service_range, tree_node); + if (sr->upper == upper) + break; + } + if (!n || sr->lower != lower || sr->upper != upper) + return NULL; + /* Find publication, if it exists */ list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->key != key || (node && node != p->node)) @@ -375,8 +375,8 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type, } struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, - u32 lower, u32 node, u32 port, - u32 key) + u32 lower, u32 upper, + u32 node, u32 key) { struct tipc_service *sc = tipc_service_find(net, type); struct publication *p = NULL; @@ -385,7 +385,7 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, return NULL; spin_lock_bh(&sc->lock); - p = tipc_service_remove_publ(net, sc, lower, node, port, key); + p = tipc_service_remove_publ(net, sc, lower, upper, node, key); /* Delete service item if this no more publications and subscriptions */ if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { @@ -620,8 +620,6 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, if (p) { nt->local_publ_count++; skb = tipc_named_publish(net, p); - /* Any pending external events? */ - tipc_named_process_backlog(net); } exit: spin_unlock_bh(&tn->nametbl_lock); @@ -635,7 +633,7 @@ exit: * tipc_nametbl_withdraw - withdraw a service binding */ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, - u32 port, u32 key) + u32 upper, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); @@ -645,17 +643,15 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, spin_lock_bh(&tn->nametbl_lock); - p = tipc_nametbl_remove_publ(net, type, lower, self, port, key); + p = tipc_nametbl_remove_publ(net, type, lower, upper, self, key); if (p) { nt->local_publ_count--; skb = tipc_named_withdraw(net, p); - /* Any pending external events? */ - tipc_named_process_backlog(net); list_del_init(&p->binding_sock); kfree_rcu(p, rcu); } else { pr_err("Failed to remove local publication {%u,%u,%u}/%u\n", - type, lower, port, key); + type, lower, upper, key); } spin_unlock_bh(&tn->nametbl_lock); @@ -754,8 +750,8 @@ static void tipc_service_delete(struct net *net, struct tipc_service *sc) rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) { list_for_each_entry_safe(p, tmpb, &sr->all_publ, all_publ) { - tipc_service_remove_publ(net, sc, p->lower, p->node, - p->port, p->key); + tipc_service_remove_publ(net, sc, p->lower, p->upper, + p->node, p->key); kfree_rcu(p, rcu); } } diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 1b03b8751707..4b14fc28d9e2 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -116,16 +116,16 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, struct list_head *dsts, int *dstcnt, u32 exclude, bool all); struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, - u32 upper, u32 scope, u32 port_ref, + u32 upper, u32 scope, u32 port, u32 key); -int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref, +int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 upper, u32 key); struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type, u32 lower, u32 upper, u32 scope, u32 node, u32 ref, u32 key); struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, - u32 lower, u32 node, u32 ref, - u32 key); + u32 lower, u32 upper, + u32 node, u32 key); void tipc_nametbl_subscribe(struct tipc_subscription *s); void tipc_nametbl_unsubscribe(struct tipc_subscription *s); int tipc_nametbl_init(struct net *net); diff --git a/net/tipc/net.c b/net/tipc/net.c index 29538dc00857..856f9e97ea29 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -136,7 +136,7 @@ void tipc_net_stop(struct net *net) if (!self) return; - tipc_nametbl_withdraw(net, TIPC_CFG_SRV, self, 0, self); + tipc_nametbl_withdraw(net, TIPC_CFG_SRV, self, self, self); rtnl_lock(); tipc_bearer_stop(net); tipc_node_stop(net); diff --git a/net/tipc/node.c b/net/tipc/node.c index 85e777e00ec9..c77dd2f3c589 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -329,7 +329,7 @@ static void tipc_node_write_unlock(struct tipc_node *n) if (flags & TIPC_NOTIFY_LINK_DOWN) { tipc_mon_peer_down(net, addr, bearer_id); tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr, - link_id, link_id); + addr, link_id); } } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 275b666f6231..3e5eba30865e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2634,12 +2634,12 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, if (publ->upper != seq->upper) break; tipc_nametbl_withdraw(net, publ->type, publ->lower, - publ->port, publ->key); + publ->upper, publ->key); rc = 0; break; } tipc_nametbl_withdraw(net, publ->type, publ->lower, - publ->port, publ->key); + publ->upper, publ->key); rc = 0; } if (list_empty(&tsk->publications)) -- cgit v1.2.3-70-g09d2 From 7494cfa6d36d1556f17baa012dd93833620783db Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Thu, 29 Mar 2018 23:20:45 +0200 Subject: tipc: avoid possible string overflow gcc points out that the combined length of the fixed-length inputs to l->name is larger than the destination buffer size: net/tipc/link.c: In function 'tipc_link_create': net/tipc/link.c:465:26: error: '%s' directive writing up to 32 bytes into a region of size between 26 and 58 [-Werror=format-overflow=] sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str); net/tipc/link.c:465:2: note: 'sprintf' output 11 or more bytes (assuming 75) into a destination of size 60 sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str); A detailed analysis reveals that the theoretical maximum length of a link name is: max self_str + 1 + max if_name + 1 + max peer_str + 1 + max if_name = 16 + 1 + 15 + 1 + 16 + 1 + 15 = 65 Since we also need space for a trailing zero we now set MAX_LINK_NAME to 68. Just to be on the safe side we also replace the sprintf() call with snprintf(). Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values") Reported-by: Arnd Bergmann Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 2 +- net/tipc/link.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 156224ac3d74..bf6d28677cfe 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -216,7 +216,7 @@ struct tipc_group_req { #define TIPC_MAX_MEDIA_NAME 16 #define TIPC_MAX_IF_NAME 16 #define TIPC_MAX_BEARER_NAME 32 -#define TIPC_MAX_LINK_NAME 60 +#define TIPC_MAX_LINK_NAME 68 #define SIOCGETLINKNAME SIOCPROTOPRIVATE diff --git a/net/tipc/link.c b/net/tipc/link.c index 8f2a9496439b..695acb783969 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -462,7 +462,8 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, sprintf(peer_str, "%x", peer); } /* Peer i/f name will be completed by reset/activate message */ - sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str); + snprintf(l->name, sizeof(l->name), "%s:%s-%s:unknown", + self_str, if_name, peer_str); strcpy(l->if_name, if_name); l->addr = peer; -- cgit v1.2.3-70-g09d2 From fc1dd36992bb041b4470120aecf8986910c56088 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 30 Mar 2018 19:38:27 +0300 Subject: net: Remove net_rwsem from {, un}register_netdevice_notifier() These functions take net_rwsem, while wireless_nlevent_flush() also takes it. But down_read() can't be taken recursive, because of rw_semaphore design, which prevents it to be occupied by only readers forever. Since we take pernet_ops_rwsem in {,un}register_netdevice_notifier(), net list can't change, so these down_read()/up_read() can be removed. Fixes: f0b07bb151b0 "net: Introduce net_rwsem to protect net_namespace_list" Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- net/core/dev.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 07da7add4845..8edb58829124 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1633,7 +1633,6 @@ int register_netdevice_notifier(struct notifier_block *nb) goto unlock; if (dev_boot_phase) goto unlock; - down_read(&net_rwsem); for_each_net(net) { for_each_netdev(net, dev) { err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); @@ -1647,7 +1646,6 @@ int register_netdevice_notifier(struct notifier_block *nb) call_netdevice_notifier(nb, NETDEV_UP, dev); } } - up_read(&net_rwsem); unlock: rtnl_unlock(); @@ -1671,7 +1669,6 @@ rollback: } outroll: - up_read(&net_rwsem); raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } @@ -1704,7 +1701,6 @@ int unregister_netdevice_notifier(struct notifier_block *nb) if (err) goto unlock; - down_read(&net_rwsem); for_each_net(net) { for_each_netdev(net, dev) { if (dev->flags & IFF_UP) { @@ -1715,7 +1711,6 @@ int unregister_netdevice_notifier(struct notifier_block *nb) call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); } } - up_read(&net_rwsem); unlock: rtnl_unlock(); up_write(&pernet_ops_rwsem); -- cgit v1.2.3-70-g09d2 From 554873e517115c4b7207259f1cadfd77d90b5395 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 30 Mar 2018 19:38:37 +0300 Subject: net: Do not take net_rwsem in __rtnl_link_unregister() This function calls call_netdevice_notifier(), which also may take net_rwsem. So, we can't use net_rwsem here. This patch makes callers of this functions take pernet_ops_rwsem, like register_netdevice_notifier() does. This will protect the modifications of net_namespace_list, and allows notifiers to take it (they won't have to care about context). Since __rtnl_link_unregister() is used on module load and unload (which are not frequent operations), this looks for me better, than make all call_netdevice_notifier() always executing in "protected net_namespace_list" context. Also, this fixes the problem we had a deal in 328fbe747ad4 "Close race between {un, }register_netdevice_notifier and ...", and guarantees __rtnl_link_unregister() does not skip exitting net. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- drivers/net/dummy.c | 2 ++ drivers/net/ifb.c | 2 ++ net/core/net_namespace.c | 1 + net/core/rtnetlink.c | 6 +++--- 4 files changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index 30b1c8512049..0d15a12a4560 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -219,6 +219,7 @@ static int __init dummy_init_module(void) { int i, err = 0; + down_write(&pernet_ops_rwsem); rtnl_lock(); err = __rtnl_link_register(&dummy_link_ops); if (err < 0) @@ -233,6 +234,7 @@ static int __init dummy_init_module(void) out: rtnl_unlock(); + up_write(&pernet_ops_rwsem); return err; } diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index 0008da7e9d4c..5f2897ec0edc 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -330,6 +330,7 @@ static int __init ifb_init_module(void) { int i, err; + down_write(&pernet_ops_rwsem); rtnl_lock(); err = __rtnl_link_register(&ifb_link_ops); if (err < 0) @@ -344,6 +345,7 @@ static int __init ifb_init_module(void) out: rtnl_unlock(); + up_write(&pernet_ops_rwsem); return err; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 7fdf321d4997..a11e03f920d3 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -51,6 +51,7 @@ static bool init_net_initialized; * outside. */ DECLARE_RWSEM(pernet_ops_rwsem); +EXPORT_SYMBOL_GPL(pernet_ops_rwsem); #define MIN_PERNET_OPS_ID \ ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *)) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e86b28482ca7..45936922d7e2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -412,17 +412,17 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister * - * The caller must hold the rtnl_mutex. + * The caller must hold the rtnl_mutex and guarantee net_namespace_list + * integrity (hold pernet_ops_rwsem for writing to close the race + * with setup_net() and cleanup_net()). */ void __rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; - down_read(&net_rwsem); for_each_net(net) { __rtnl_kill_links(net, ops); } - up_read(&net_rwsem); list_del(&ops->list); } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); -- cgit v1.2.3-70-g09d2 From eeb0a2a526d816253705977033e98fdfbab32387 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 31 Mar 2018 06:11:41 +0000 Subject: vlan: vlan_hw_filter_capable() can be static Fixes the following sparse warning: net/8021q/vlan_core.c:168:6: warning: symbol 'vlan_hw_filter_capable' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index c8d7abdc0463..4f60e86f4b8d 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -165,7 +165,7 @@ struct vlan_vid_info { int refcount; }; -bool vlan_hw_filter_capable(const struct net_device *dev, __be16 proto) +static bool vlan_hw_filter_capable(const struct net_device *dev, __be16 proto) { if (proto == htons(ETH_P_8021Q) && dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) -- cgit v1.2.3-70-g09d2 From 787bea7748a76130566f881c2342a0be4127d182 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:43 -0700 Subject: inet: frags: change inet_frags_init_net() return value We will soon initialize one rhashtable per struct netns_frags in inet_frags_init_net(). This patch changes the return value to eventually propagate an error. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_frag.h | 3 ++- net/ieee802154/6lowpan/reassembly.c | 11 ++++++++--- net/ipv4/ip_fragment.c | 12 +++++++++--- net/ipv6/netfilter/nf_conntrack_reasm.c | 12 +++++++++--- net/ipv6/reassembly.c | 11 +++++++++-- 5 files changed, 37 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 351f0c3cdcd9..b1d62176f3b4 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -104,9 +104,10 @@ struct inet_frags { int inet_frags_init(struct inet_frags *); void inet_frags_fini(struct inet_frags *); -static inline void inet_frags_init_net(struct netns_frags *nf) +static inline int inet_frags_init_net(struct netns_frags *nf) { atomic_set(&nf->mem, 0); + return 0; } void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 85bf86ad6b18..2aaab4bba429 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -581,14 +581,19 @@ static int __net_init lowpan_frags_init_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); + int res; ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; - inet_frags_init_net(&ieee802154_lowpan->frags); - - return lowpan_frags_ns_sysctl_register(net); + res = inet_frags_init_net(&ieee802154_lowpan->frags); + if (res < 0) + return res; + res = lowpan_frags_ns_sysctl_register(net); + if (res < 0) + inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags); + return res; } static void __net_exit lowpan_frags_exit_net(struct net *net) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index bbf1b94942c0..e0b39d4ecbd4 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -846,6 +846,8 @@ static void __init ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { + int res; + /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for @@ -871,9 +873,13 @@ static int __net_init ipv4_frags_init_net(struct net *net) net->ipv4.frags.max_dist = 64; - inet_frags_init_net(&net->ipv4.frags); - - return ip4_frags_ns_ctl_register(net); + res = inet_frags_init_net(&net->ipv4.frags); + if (res < 0) + return res; + res = ip4_frags_ns_ctl_register(net); + if (res < 0) + inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); + return res; } static void __net_exit ipv4_frags_exit_net(struct net *net) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index b84ce3e6d728..6ff41569134a 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -629,12 +629,18 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); static int nf_ct_net_init(struct net *net) { + int res; + net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; - inet_frags_init_net(&net->nf_frag.frags); - - return nf_ct_frag6_sysctl_register(net); + res = inet_frags_init_net(&net->nf_frag.frags); + if (res < 0) + return res; + res = nf_ct_frag6_sysctl_register(net); + if (res < 0) + inet_frags_exit_net(&net->nf_frag.frags, &nf_frags); + return res; } static void nf_ct_net_exit(struct net *net) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 08a139f14d0f..a8f7a5f0251a 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -711,13 +711,20 @@ static void ip6_frags_sysctl_unregister(void) static int __net_init ipv6_frags_init_net(struct net *net) { + int res; + net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; - inet_frags_init_net(&net->ipv6.frags); + res = inet_frags_init_net(&net->ipv6.frags); + if (res < 0) + return res; - return ip6_frags_ns_sysctl_register(net); + res = ip6_frags_ns_sysctl_register(net); + if (res < 0) + inet_frags_exit_net(&net->ipv6.frags, &ip6_frags); + return res; } static void __net_exit ipv6_frags_exit_net(struct net *net) -- cgit v1.2.3-70-g09d2 From 093ba72914b696521e4885756a68a3332782c8de Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:44 -0700 Subject: inet: frags: add a pointer to struct netns_frags In order to simplify the API, add a pointer to struct inet_frags. This will allow us to make things less complex. These functions no longer have a struct inet_frags parameter : inet_frag_destroy(struct inet_frag_queue *q /*, struct inet_frags *f */) inet_frag_put(struct inet_frag_queue *q /*, struct inet_frags *f */) inet_frag_kill(struct inet_frag_queue *q /*, struct inet_frags *f */) inet_frags_exit_net(struct netns_frags *nf /*, struct inet_frags *f */) ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_frag.h | 11 ++++++----- include/net/ipv6.h | 3 +-- net/ieee802154/6lowpan/reassembly.c | 13 +++++++------ net/ipv4/inet_fragment.c | 17 ++++++++++------- net/ipv4/ip_fragment.c | 9 +++++---- net/ipv6/netfilter/nf_conntrack_reasm.c | 16 +++++++++------- net/ipv6/reassembly.c | 20 ++++++++++---------- 7 files changed, 48 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index b1d62176f3b4..69e531ed8189 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -10,6 +10,7 @@ struct netns_frags { int high_thresh; int low_thresh; int max_dist; + struct inet_frags *f; }; /** @@ -109,20 +110,20 @@ static inline int inet_frags_init_net(struct netns_frags *nf) atomic_set(&nf->mem, 0); return 0; } -void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); +void inet_frags_exit_net(struct netns_frags *nf); -void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f); -void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f); +void inet_frag_kill(struct inet_frag_queue *q); +void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, struct inet_frags *f, void *key, unsigned int hash); void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, const char *prefix); -static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f) +static inline void inet_frag_put(struct inet_frag_queue *q) { if (refcount_dec_and_test(&q->refcnt)) - inet_frag_destroy(q, f); + inet_frag_destroy(q); } static inline bool inet_frag_evicting(struct inet_frag_queue *q) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 5c18836672e9..57b7fe43d2ab 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -607,8 +607,7 @@ struct frag_queue { u8 ecn; }; -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, - struct inet_frags *frags); +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq); static inline bool ipv6_addr_any(const struct in6_addr *a) { diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 2aaab4bba429..6badc055555b 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -94,10 +94,10 @@ static void lowpan_frag_expire(struct timer_list *t) if (fq->q.flags & INET_FRAG_COMPLETE) goto out; - inet_frag_kill(&fq->q, &lowpan_frags); + inet_frag_kill(&fq->q); out: spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, &lowpan_frags); + inet_frag_put(&fq->q); } static inline struct lowpan_frag_queue * @@ -230,7 +230,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, struct sk_buff *fp, *head = fq->q.fragments; int sum_truesize; - inet_frag_kill(&fq->q, &lowpan_frags); + inet_frag_kill(&fq->q); /* Make the one we just received the head. */ if (prev) { @@ -438,7 +438,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) ret = lowpan_frag_queue(fq, skb, frag_type); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, &lowpan_frags); + inet_frag_put(&fq->q); return ret; } @@ -586,13 +586,14 @@ static int __net_init lowpan_frags_init_net(struct net *net) ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; + ieee802154_lowpan->frags.f = &lowpan_frags; res = inet_frags_init_net(&ieee802154_lowpan->frags); if (res < 0) return res; res = lowpan_frags_ns_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags); + inet_frags_exit_net(&ieee802154_lowpan->frags); return res; } @@ -602,7 +603,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net) net_ieee802154_lowpan(net); lowpan_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags); + inet_frags_exit_net(&ieee802154_lowpan->frags); } static struct pernet_operations lowpan_frags_ops = { diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index e8ec28999f5c..1ac69f65d0de 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -219,8 +219,9 @@ void inet_frags_fini(struct inet_frags *f) } EXPORT_SYMBOL(inet_frags_fini); -void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) +void inet_frags_exit_net(struct netns_frags *nf) { + struct inet_frags *f =nf->f; unsigned int seq; int i; @@ -264,33 +265,34 @@ __acquires(hb->chain_lock) return hb; } -static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +static inline void fq_unlink(struct inet_frag_queue *fq) { struct inet_frag_bucket *hb; - hb = get_frag_bucket_locked(fq, f); + hb = get_frag_bucket_locked(fq, fq->net->f); hlist_del(&fq->list); fq->flags |= INET_FRAG_COMPLETE; spin_unlock(&hb->chain_lock); } -void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) +void inet_frag_kill(struct inet_frag_queue *fq) { if (del_timer(&fq->timer)) refcount_dec(&fq->refcnt); if (!(fq->flags & INET_FRAG_COMPLETE)) { - fq_unlink(fq, f); + fq_unlink(fq); refcount_dec(&fq->refcnt); } } EXPORT_SYMBOL(inet_frag_kill); -void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) +void inet_frag_destroy(struct inet_frag_queue *q) { struct sk_buff *fp; struct netns_frags *nf; unsigned int sum, sum_truesize = 0; + struct inet_frags *f; WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); WARN_ON(del_timer(&q->timer) != 0); @@ -298,6 +300,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) /* Release all fragment data. */ fp = q->fragments; nf = q->net; + f = nf->f; while (fp) { struct sk_buff *xp = fp->next; @@ -333,7 +336,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, refcount_inc(&qp->refcnt); spin_unlock(&hb->chain_lock); qp_in->flags |= INET_FRAG_COMPLETE; - inet_frag_put(qp_in, f); + inet_frag_put(qp_in); return qp; } } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index e0b39d4ecbd4..cd2b4c9419fc 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -168,7 +168,7 @@ static void ip4_frag_free(struct inet_frag_queue *q) static void ipq_put(struct ipq *ipq) { - inet_frag_put(&ipq->q, &ip4_frags); + inet_frag_put(&ipq->q); } /* Kill ipq entry. It is not destroyed immediately, @@ -176,7 +176,7 @@ static void ipq_put(struct ipq *ipq) */ static void ipq_kill(struct ipq *ipq) { - inet_frag_kill(&ipq->q, &ip4_frags); + inet_frag_kill(&ipq->q); } static bool frag_expire_skip_icmp(u32 user) @@ -872,20 +872,21 @@ static int __net_init ipv4_frags_init_net(struct net *net) net->ipv4.frags.timeout = IP_FRAG_TIME; net->ipv4.frags.max_dist = 64; + net->ipv4.frags.f = &ip4_frags; res = inet_frags_init_net(&net->ipv4.frags); if (res < 0) return res; res = ip4_frags_ns_ctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); + inet_frags_exit_net(&net->ipv4.frags); return res; } static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); - inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); + inet_frags_exit_net(&net->ipv4.frags); } static struct pernet_operations ip4_frags_ops = { diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 6ff41569134a..c4b40fdee838 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -178,7 +178,7 @@ static void nf_ct_frag6_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, nf_frag.frags); - ip6_expire_frag_queue(net, fq, &nf_frags); + ip6_expire_frag_queue(net, fq); } /* Creation primitives. */ @@ -264,7 +264,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); - inet_frag_kill(&fq->q, &nf_frags); + inet_frag_kill(&fq->q); return -EPROTO; } if (end > fq->q.len) { @@ -357,7 +357,7 @@ found: return 0; discard_fq: - inet_frag_kill(&fq->q, &nf_frags); + inet_frag_kill(&fq->q); err: return -EINVAL; } @@ -379,7 +379,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic int payload_len; u8 ecn; - inet_frag_kill(&fq->q, &nf_frags); + inet_frag_kill(&fq->q); WARN_ON(head == NULL); WARN_ON(NFCT_FRAG6_CB(head)->offset != 0); @@ -622,7 +622,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) out_unlock: spin_unlock_bh(&fq->q.lock); - inet_frag_put(&fq->q, &nf_frags); + inet_frag_put(&fq->q); return ret; } EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); @@ -634,19 +634,21 @@ static int nf_ct_net_init(struct net *net) net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; + net->nf_frag.frags.f = &nf_frags; + res = inet_frags_init_net(&net->nf_frag.frags); if (res < 0) return res; res = nf_ct_frag6_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->nf_frag.frags, &nf_frags); + inet_frags_exit_net(&net->nf_frag.frags); return res; } static void nf_ct_net_exit(struct net *net) { nf_ct_frags6_sysctl_unregister(net); - inet_frags_exit_net(&net->nf_frag.frags, &nf_frags); + inet_frags_exit_net(&net->nf_frag.frags); } static struct pernet_operations nf_ct_net_ops = { diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index a8f7a5f0251a..4855de6f673a 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -128,8 +128,7 @@ void ip6_frag_init(struct inet_frag_queue *q, const void *a) } EXPORT_SYMBOL(ip6_frag_init); -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, - struct inet_frags *frags) +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) { struct net_device *dev = NULL; @@ -138,7 +137,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, if (fq->q.flags & INET_FRAG_COMPLETE) goto out; - inet_frag_kill(&fq->q, frags); + inet_frag_kill(&fq->q); rcu_read_lock(); dev = dev_get_by_index_rcu(net, fq->iif); @@ -166,7 +165,7 @@ out_rcu_unlock: rcu_read_unlock(); out: spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, frags); + inet_frag_put(&fq->q); } EXPORT_SYMBOL(ip6_expire_frag_queue); @@ -179,7 +178,7 @@ static void ip6_frag_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ipv6.frags); - ip6_expire_frag_queue(net, fq, &ip6_frags); + ip6_expire_frag_queue(net, fq); } static struct frag_queue * @@ -364,7 +363,7 @@ found: return -1; discard_fq: - inet_frag_kill(&fq->q, &ip6_frags); + inet_frag_kill(&fq->q); err: __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); @@ -391,7 +390,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, int sum_truesize; u8 ecn; - inet_frag_kill(&fq->q, &ip6_frags); + inet_frag_kill(&fq->q); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -569,7 +568,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, &ip6_frags); + inet_frag_put(&fq->q); return ret; } @@ -716,6 +715,7 @@ static int __net_init ipv6_frags_init_net(struct net *net) net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; + net->ipv6.frags.f = &ip6_frags; res = inet_frags_init_net(&net->ipv6.frags); if (res < 0) @@ -723,14 +723,14 @@ static int __net_init ipv6_frags_init_net(struct net *net) res = ip6_frags_ns_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv6.frags, &ip6_frags); + inet_frags_exit_net(&net->ipv6.frags); return res; } static void __net_exit ipv6_frags_exit_net(struct net *net) { ip6_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&net->ipv6.frags, &ip6_frags); + inet_frags_exit_net(&net->ipv6.frags); } static struct pernet_operations ip6_frags_ops = { -- cgit v1.2.3-70-g09d2 From 5b975bab23615cd0fdf67af6c9298eb01c4b9f61 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:45 -0700 Subject: inet: frags: refactor ipv6_frag_init() We want to call inet_frags_init() earlier. This is a prereq to "inet: frags: use rhashtables for reassembly units" Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/reassembly.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 4855de6f673a..f0071b113a92 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -742,10 +742,21 @@ int __init ipv6_frag_init(void) { int ret; - ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); + ip6_frags.hashfn = ip6_hashfn; + ip6_frags.constructor = ip6_frag_init; + ip6_frags.destructor = NULL; + ip6_frags.qsize = sizeof(struct frag_queue); + ip6_frags.match = ip6_frag_match; + ip6_frags.frag_expire = ip6_frag_expire; + ip6_frags.frags_cache_name = ip6_frag_cache_name; + ret = inet_frags_init(&ip6_frags); if (ret) goto out; + ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); + if (ret) + goto err_protocol; + ret = ip6_frags_sysctl_register(); if (ret) goto err_sysctl; @@ -754,16 +765,6 @@ int __init ipv6_frag_init(void) if (ret) goto err_pernet; - ip6_frags.hashfn = ip6_hashfn; - ip6_frags.constructor = ip6_frag_init; - ip6_frags.destructor = NULL; - ip6_frags.qsize = sizeof(struct frag_queue); - ip6_frags.match = ip6_frag_match; - ip6_frags.frag_expire = ip6_frag_expire; - ip6_frags.frags_cache_name = ip6_frag_cache_name; - ret = inet_frags_init(&ip6_frags); - if (ret) - goto err_pernet; out: return ret; @@ -771,6 +772,8 @@ err_pernet: ip6_frags_sysctl_unregister(); err_sysctl: inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); +err_protocol: + inet_frags_fini(&ip6_frags); goto out; } -- cgit v1.2.3-70-g09d2 From 807f1844df4ac23594268fa9f41902d0549e92aa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:46 -0700 Subject: inet: frags: refactor lowpan_net_frag_init() We want to call lowpan_net_frag_init() earlier. Similar to commit "inet: frags: refactor ipv6_frag_init()" This is a prereq to "inet: frags: use rhashtables for reassembly units" Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ieee802154/6lowpan/reassembly.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 6badc055555b..ddada12a044d 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -615,14 +615,6 @@ int __init lowpan_net_frag_init(void) { int ret; - ret = lowpan_frags_sysctl_register(); - if (ret) - return ret; - - ret = register_pernet_subsys(&lowpan_frags_ops); - if (ret) - goto err_pernet; - lowpan_frags.hashfn = lowpan_hashfn; lowpan_frags.constructor = lowpan_frag_init; lowpan_frags.destructor = NULL; @@ -632,11 +624,21 @@ int __init lowpan_net_frag_init(void) lowpan_frags.frags_cache_name = lowpan_frags_cache_name; ret = inet_frags_init(&lowpan_frags); if (ret) - goto err_pernet; + goto out; + ret = lowpan_frags_sysctl_register(); + if (ret) + goto err_sysctl; + + ret = register_pernet_subsys(&lowpan_frags_ops); + if (ret) + goto err_pernet; +out: return ret; err_pernet: lowpan_frags_sysctl_unregister(); +err_sysctl: + inet_frags_fini(&lowpan_frags); return ret; } -- cgit v1.2.3-70-g09d2 From 483a6e4fa055123142d8956866fe2aa9c98d546d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:47 -0700 Subject: inet: frags: refactor ipfrag_init() We need to call inet_frags_init() before register_pernet_subsys(), as a prereq for following patch ("inet: frags: use rhashtables for reassembly units") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index cd2b4c9419fc..1a3bc85d6f5e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -896,8 +896,6 @@ static struct pernet_operations ip4_frags_ops = { void __init ipfrag_init(void) { - ip4_frags_ctl_register(); - register_pernet_subsys(&ip4_frags_ops); ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; @@ -907,4 +905,6 @@ void __init ipfrag_init(void) ip4_frags.frags_cache_name = ip_frag_cache_name; if (inet_frags_init(&ip4_frags)) panic("IP: failed to allocate ip4_frags cache\n"); + ip4_frags_ctl_register(); + register_pernet_subsys(&ip4_frags_ops); } -- cgit v1.2.3-70-g09d2 From 648700f76b03b7e8149d13cc2bdb3355035258a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:49 -0700 Subject: inet: frags: use rhashtables for reassembly units Some applications still rely on IP fragmentation, and to be fair linux reassembly unit is not working under any serious load. It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!) A work queue is supposed to garbage collect items when host is under memory pressure, and doing a hash rebuild, changing seed used in hash computations. This work queue blocks softirqs for up to 25 ms when doing a hash rebuild, occurring every 5 seconds if host is under fire. Then there is the problem of sharing this hash table for all netns. It is time to switch to rhashtables, and allocate one of them per netns to speedup netns dismantle, since this is a critical metric these days. Lookup is now using RCU. A followup patch will even remove the refcount hold/release left from prior implementation and save a couple of atomic operations. Before this patch, 16 cpus (16 RX queue NIC) could not handle more than 1 Mpps frags DDOS. After the patch, I reach 9 Mpps without any tuning, and can use up to 2GB of storage for the fragments (exact number depends on frags being evicted after timeout) $ grep FRAG /proc/net/sockstat FRAG: inuse 1966916 memory 2140004608 A followup patch will change the limits for 64bit arches. Signed-off-by: Eric Dumazet Cc: Kirill Tkhai Cc: Herbert Xu Cc: Florian Westphal Cc: Jesper Dangaard Brouer Cc: Alexander Aring Cc: Stefan Schmidt Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +- include/net/inet_frag.h | 81 ++++---- include/net/ipv6.h | 16 +- net/ieee802154/6lowpan/6lowpan_i.h | 26 +-- net/ieee802154/6lowpan/reassembly.c | 91 ++++----- net/ipv4/inet_fragment.c | 344 ++++++-------------------------- net/ipv4/ip_fragment.c | 112 +++++------ net/ipv6/netfilter/nf_conntrack_reasm.c | 51 ++--- net/ipv6/reassembly.c | 110 +++++----- 9 files changed, 265 insertions(+), 573 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 33f35f049ad5..6f2a3670e44b 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -134,13 +134,10 @@ min_adv_mss - INTEGER IP Fragmentation: ipfrag_high_thresh - INTEGER - Maximum memory used to reassemble IP fragments. When - ipfrag_high_thresh bytes of memory is allocated for this purpose, - the fragment handler will toss packets until ipfrag_low_thresh - is reached. This also serves as a maximum limit to namespaces - different from the initial one. + Maximum memory used to reassemble IP fragments. ipfrag_low_thresh - INTEGER + (Obsolete since linux-4.17) Maximum memory used to reassemble IP fragments before the kernel begins to remove incomplete fragment queues to free up resources. The kernel still accepts new fragments for defragmentation. diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 69e531ed8189..3fec0d3a0d01 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -2,7 +2,11 @@ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ +#include + struct netns_frags { + struct rhashtable rhashtable ____cacheline_aligned_in_smp; + /* Keep atomic mem on separate cachelines in structs that include it */ atomic_t mem ____cacheline_aligned_in_smp; /* sysctls */ @@ -26,12 +30,30 @@ enum { INET_FRAG_COMPLETE = BIT(2), }; +struct frag_v4_compare_key { + __be32 saddr; + __be32 daddr; + u32 user; + u32 vif; + __be16 id; + u16 protocol; +}; + +struct frag_v6_compare_key { + struct in6_addr saddr; + struct in6_addr daddr; + u32 user; + __be32 id; + u32 iif; +}; + /** * struct inet_frag_queue - fragment queue * - * @lock: spinlock protecting the queue + * @node: rhash node + * @key: keys identifying this frag. * @timer: queue expiration timer - * @list: hash bucket list + * @lock: spinlock protecting this frag * @refcnt: reference count of the queue * @fragments: received fragments head * @fragments_tail: received fragments tail @@ -41,12 +63,16 @@ enum { * @flags: fragment queue flags * @max_size: maximum received fragment size * @net: namespace that this frag belongs to - * @list_evictor: list of queues to forcefully evict (e.g. due to low memory) + * @rcu: rcu head for freeing deferall */ struct inet_frag_queue { - spinlock_t lock; + struct rhash_head node; + union { + struct frag_v4_compare_key v4; + struct frag_v6_compare_key v6; + } key; struct timer_list timer; - struct hlist_node list; + spinlock_t lock; refcount_t refcnt; struct sk_buff *fragments; struct sk_buff *fragments_tail; @@ -55,51 +81,20 @@ struct inet_frag_queue { int meat; __u8 flags; u16 max_size; - struct netns_frags *net; - struct hlist_node list_evictor; -}; - -#define INETFRAGS_HASHSZ 1024 - -/* averaged: - * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ / - * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or - * struct frag_queue)) - */ -#define INETFRAGS_MAXDEPTH 128 - -struct inet_frag_bucket { - struct hlist_head chain; - spinlock_t chain_lock; + struct netns_frags *net; + struct rcu_head rcu; }; struct inet_frags { - struct inet_frag_bucket hash[INETFRAGS_HASHSZ]; - - struct work_struct frags_work; - unsigned int next_bucket; - unsigned long last_rebuild_jiffies; - bool rebuild; - - /* The first call to hashfn is responsible to initialize - * rnd. This is best done with net_get_random_once. - * - * rnd_seqlock is used to let hash insertion detect - * when it needs to re-lookup the hash chain to use. - */ - u32 rnd; - seqlock_t rnd_seqlock; unsigned int qsize; - unsigned int (*hashfn)(const struct inet_frag_queue *); - bool (*match)(const struct inet_frag_queue *q, - const void *arg); void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); void (*frag_expire)(struct timer_list *t); struct kmem_cache *frags_cachep; const char *frags_cache_name; + struct rhashtable_params rhash_params; }; int inet_frags_init(struct inet_frags *); @@ -108,15 +103,13 @@ void inet_frags_fini(struct inet_frags *); static inline int inet_frags_init_net(struct netns_frags *nf) { atomic_set(&nf->mem, 0); - return 0; + return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params); } void inet_frags_exit_net(struct netns_frags *nf); void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q); -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, unsigned int hash); - +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key); void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, const char *prefix); @@ -128,7 +121,7 @@ static inline void inet_frag_put(struct inet_frag_queue *q) static inline bool inet_frag_evicting(struct inet_frag_queue *q) { - return !hlist_unhashed(&q->list_evictor); + return false; } /* Memory Tracking Functions. */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 57b7fe43d2ab..6fa9a2bc5896 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -579,17 +579,8 @@ enum ip6_defrag_users { __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, }; -struct ip6_create_arg { - __be32 id; - u32 user; - const struct in6_addr *src; - const struct in6_addr *dst; - int iif; - u8 ecn; -}; - void ip6_frag_init(struct inet_frag_queue *q, const void *a); -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); +extern const struct rhashtable_params ip6_rhash_params; /* * Equivalent of ipv4 struct ip @@ -597,11 +588,6 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); struct frag_queue { struct inet_frag_queue q; - __be32 id; /* fragment id */ - u32 user; - struct in6_addr saddr; - struct in6_addr daddr; - int iif; __u16 nhoffset; u8 ecn; diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h index d8de3bcfb103..b8d95cb71c25 100644 --- a/net/ieee802154/6lowpan/6lowpan_i.h +++ b/net/ieee802154/6lowpan/6lowpan_i.h @@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result; #define LOWPAN_DISPATCH_FRAG1 0xc0 #define LOWPAN_DISPATCH_FRAGN 0xe0 -struct lowpan_create_arg { +struct frag_lowpan_compare_key { u16 tag; u16 d_size; - const struct ieee802154_addr *src; - const struct ieee802154_addr *dst; + const struct ieee802154_addr src; + const struct ieee802154_addr dst; }; -/* Equivalent of ipv4 struct ip +/* Equivalent of ipv4 struct ipq */ struct lowpan_frag_queue { struct inet_frag_queue q; - - u16 tag; - u16 d_size; - struct ieee802154_addr saddr; - struct ieee802154_addr daddr; }; -static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a) -{ - switch (a->mode) { - case IEEE802154_ADDR_LONG: - return (((__force u64)a->extended_addr) >> 32) ^ - (((__force u64)a->extended_addr) & 0xffffffff); - case IEEE802154_ADDR_SHORT: - return (__force u32)(a->short_addr + (a->pan_id << 16)); - default: - return 0; - } -} - int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); void lowpan_net_frag_exit(void); int lowpan_net_frag_init(void); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index ddada12a044d..0fa0121f85d4 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, struct net_device *ldev); -static unsigned int lowpan_hash_frag(u16 tag, u16 d_size, - const struct ieee802154_addr *saddr, - const struct ieee802154_addr *daddr) -{ - net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd)); - return jhash_3words(ieee802154_addr_hash(saddr), - ieee802154_addr_hash(daddr), - (__force u32)(tag + (d_size << 16)), - lowpan_frags.rnd); -} - -static unsigned int lowpan_hashfn(const struct inet_frag_queue *q) -{ - const struct lowpan_frag_queue *fq; - - fq = container_of(q, struct lowpan_frag_queue, q); - return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr); -} - -static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct lowpan_frag_queue *fq; - const struct lowpan_create_arg *arg = a; - - fq = container_of(q, struct lowpan_frag_queue, q); - return fq->tag == arg->tag && fq->d_size == arg->d_size && - ieee802154_addr_equal(&fq->saddr, arg->src) && - ieee802154_addr_equal(&fq->daddr, arg->dst); -} - static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { - const struct lowpan_create_arg *arg = a; + const struct frag_lowpan_compare_key *key = a; struct lowpan_frag_queue *fq; fq = container_of(q, struct lowpan_frag_queue, q); - fq->tag = arg->tag; - fq->d_size = arg->d_size; - fq->saddr = *arg->src; - fq->daddr = *arg->dst; + BUILD_BUG_ON(sizeof(*key) > sizeof(q->key)); + memcpy(&q->key, key, sizeof(*key)); } static void lowpan_frag_expire(struct timer_list *t) @@ -105,21 +73,17 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, const struct ieee802154_addr *src, const struct ieee802154_addr *dst) { - struct inet_frag_queue *q; - struct lowpan_create_arg arg; - unsigned int hash; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); + struct frag_lowpan_compare_key key = { + .tag = cb->d_tag, + .d_size = cb->d_size, + .src = *src, + .dst = *dst, + }; + struct inet_frag_queue *q; - arg.tag = cb->d_tag; - arg.d_size = cb->d_size; - arg.src = src; - arg.dst = dst; - - hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst); - - q = inet_frag_find(&ieee802154_lowpan->frags, - &lowpan_frags, &arg, hash); + q = inet_frag_find(&ieee802154_lowpan->frags, &key); if (IS_ERR_OR_NULL(q)) { inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; @@ -611,17 +575,46 @@ static struct pernet_operations lowpan_frags_ops = { .exit = lowpan_frags_exit_net, }; +static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); +} + +static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key, + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); +} + +static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_lowpan_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static const struct rhashtable_params lowpan_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .hashfn = lowpan_key_hashfn, + .obj_hashfn = lowpan_obj_hashfn, + .obj_cmpfn = lowpan_obj_cmpfn, + .automatic_shrinking = true, +}; + int __init lowpan_net_frag_init(void) { int ret; - lowpan_frags.hashfn = lowpan_hashfn; lowpan_frags.constructor = lowpan_frag_init; lowpan_frags.destructor = NULL; lowpan_frags.qsize = sizeof(struct frag_queue); - lowpan_frags.match = lowpan_frag_match; lowpan_frags.frag_expire = lowpan_frag_expire; lowpan_frags.frags_cache_name = lowpan_frags_cache_name; + lowpan_frags.rhash_params = lowpan_rhash_params; ret = inet_frags_init(&lowpan_frags); if (ret) goto out; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 1ac69f65d0de..ebb8f411e0db 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -25,12 +25,6 @@ #include #include -#define INETFRAGS_EVICT_BUCKETS 128 -#define INETFRAGS_EVICT_MAX 512 - -/* don't rebuild inetfrag table with new secret more often than this */ -#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ) - /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field @@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = { }; EXPORT_SYMBOL(ip_frag_ecn_table); -static unsigned int -inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) -{ - return f->hashfn(q) & (INETFRAGS_HASHSZ - 1); -} - -static bool inet_frag_may_rebuild(struct inet_frags *f) -{ - return time_after(jiffies, - f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL); -} - -static void inet_frag_secret_rebuild(struct inet_frags *f) -{ - int i; - - write_seqlock_bh(&f->rnd_seqlock); - - if (!inet_frag_may_rebuild(f)) - goto out; - - get_random_bytes(&f->rnd, sizeof(u32)); - - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - struct hlist_node *n; - - hb = &f->hash[i]; - spin_lock(&hb->chain_lock); - - hlist_for_each_entry_safe(q, n, &hb->chain, list) { - unsigned int hval = inet_frag_hashfn(f, q); - - if (hval != i) { - struct inet_frag_bucket *hb_dest; - - hlist_del(&q->list); - - /* Relink to new hash chain. */ - hb_dest = &f->hash[hval]; - - /* This is the only place where we take - * another chain_lock while already holding - * one. As this will not run concurrently, - * we cannot deadlock on hb_dest lock below, if its - * already locked it will be released soon since - * other caller cannot be waiting for hb lock - * that we've taken above. - */ - spin_lock_nested(&hb_dest->chain_lock, - SINGLE_DEPTH_NESTING); - hlist_add_head(&q->list, &hb_dest->chain); - spin_unlock(&hb_dest->chain_lock); - } - } - spin_unlock(&hb->chain_lock); - } - - f->rebuild = false; - f->last_rebuild_jiffies = jiffies; -out: - write_sequnlock_bh(&f->rnd_seqlock); -} - -static bool inet_fragq_should_evict(const struct inet_frag_queue *q) -{ - if (!hlist_unhashed(&q->list_evictor)) - return false; - - return q->net->low_thresh == 0 || - frag_mem_limit(q->net) >= q->net->low_thresh; -} - -static unsigned int -inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) -{ - struct inet_frag_queue *fq; - struct hlist_node *n; - unsigned int evicted = 0; - HLIST_HEAD(expired); - - spin_lock(&hb->chain_lock); - - hlist_for_each_entry_safe(fq, n, &hb->chain, list) { - if (!inet_fragq_should_evict(fq)) - continue; - - if (!del_timer(&fq->timer)) - continue; - - hlist_add_head(&fq->list_evictor, &expired); - ++evicted; - } - - spin_unlock(&hb->chain_lock); - - hlist_for_each_entry_safe(fq, n, &expired, list_evictor) - f->frag_expire(&fq->timer); - - return evicted; -} - -static void inet_frag_worker(struct work_struct *work) -{ - unsigned int budget = INETFRAGS_EVICT_BUCKETS; - unsigned int i, evicted = 0; - struct inet_frags *f; - - f = container_of(work, struct inet_frags, frags_work); - - BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); - - local_bh_disable(); - - for (i = READ_ONCE(f->next_bucket); budget; --budget) { - evicted += inet_evict_bucket(f, &f->hash[i]); - i = (i + 1) & (INETFRAGS_HASHSZ - 1); - if (evicted > INETFRAGS_EVICT_MAX) - break; - } - - f->next_bucket = i; - - local_bh_enable(); - - if (f->rebuild && inet_frag_may_rebuild(f)) - inet_frag_secret_rebuild(f); -} - -static void inet_frag_schedule_worker(struct inet_frags *f) -{ - if (unlikely(!work_pending(&f->frags_work))) - schedule_work(&f->frags_work); -} - int inet_frags_init(struct inet_frags *f) { - int i; - - INIT_WORK(&f->frags_work, inet_frag_worker); - - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb = &f->hash[i]; - - spin_lock_init(&hb->chain_lock); - INIT_HLIST_HEAD(&hb->chain); - } - - seqlock_init(&f->rnd_seqlock); - f->last_rebuild_jiffies = 0; f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, NULL); if (!f->frags_cachep) @@ -214,66 +59,42 @@ EXPORT_SYMBOL(inet_frags_init); void inet_frags_fini(struct inet_frags *f) { - cancel_work_sync(&f->frags_work); + /* We must wait that all inet_frag_destroy_rcu() have completed. */ + rcu_barrier(); + kmem_cache_destroy(f->frags_cachep); + f->frags_cachep = NULL; } EXPORT_SYMBOL(inet_frags_fini); -void inet_frags_exit_net(struct netns_frags *nf) -{ - struct inet_frags *f =nf->f; - unsigned int seq; - int i; - - nf->low_thresh = 0; - -evict_again: - local_bh_disable(); - seq = read_seqbegin(&f->rnd_seqlock); - - for (i = 0; i < INETFRAGS_HASHSZ ; i++) - inet_evict_bucket(f, &f->hash[i]); - - local_bh_enable(); - cond_resched(); - - if (read_seqretry(&f->rnd_seqlock, seq) || - sum_frag_mem_limit(nf)) - goto evict_again; -} -EXPORT_SYMBOL(inet_frags_exit_net); - -static struct inet_frag_bucket * -get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f) -__acquires(hb->chain_lock) +static void inet_frags_free_cb(void *ptr, void *arg) { - struct inet_frag_bucket *hb; - unsigned int seq, hash; + struct inet_frag_queue *fq = ptr; - restart: - seq = read_seqbegin(&f->rnd_seqlock); - - hash = inet_frag_hashfn(f, fq); - hb = &f->hash[hash]; + /* If we can not cancel the timer, it means this frag_queue + * is already disappearing, we have nothing to do. + * Otherwise, we own a refcount until the end of this function. + */ + if (!del_timer(&fq->timer)) + return; - spin_lock(&hb->chain_lock); - if (read_seqretry(&f->rnd_seqlock, seq)) { - spin_unlock(&hb->chain_lock); - goto restart; + spin_lock_bh(&fq->lock); + if (!(fq->flags & INET_FRAG_COMPLETE)) { + fq->flags |= INET_FRAG_COMPLETE; + refcount_dec(&fq->refcnt); } + spin_unlock_bh(&fq->lock); - return hb; + inet_frag_put(fq); } -static inline void fq_unlink(struct inet_frag_queue *fq) +void inet_frags_exit_net(struct netns_frags *nf) { - struct inet_frag_bucket *hb; + nf->low_thresh = 0; /* prevent creation of new frags */ - hb = get_frag_bucket_locked(fq, fq->net->f); - hlist_del(&fq->list); - fq->flags |= INET_FRAG_COMPLETE; - spin_unlock(&hb->chain_lock); + rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); } +EXPORT_SYMBOL(inet_frags_exit_net); void inet_frag_kill(struct inet_frag_queue *fq) { @@ -281,12 +102,26 @@ void inet_frag_kill(struct inet_frag_queue *fq) refcount_dec(&fq->refcnt); if (!(fq->flags & INET_FRAG_COMPLETE)) { - fq_unlink(fq); + struct netns_frags *nf = fq->net; + + fq->flags |= INET_FRAG_COMPLETE; + rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params); refcount_dec(&fq->refcnt); } } EXPORT_SYMBOL(inet_frag_kill); +static void inet_frag_destroy_rcu(struct rcu_head *head) +{ + struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, + rcu); + struct inet_frags *f = q->net->f; + + if (f->destructor) + f->destructor(q); + kmem_cache_free(f->frags_cachep, q); +} + void inet_frag_destroy(struct inet_frag_queue *q) { struct sk_buff *fp; @@ -310,59 +145,20 @@ void inet_frag_destroy(struct inet_frag_queue *q) } sum = sum_truesize + f->qsize; - if (f->destructor) - f->destructor(q); - kmem_cache_free(f->frags_cachep, q); + call_rcu(&q->rcu, inet_frag_destroy_rcu); sub_frag_mem_limit(nf, sum); } EXPORT_SYMBOL(inet_frag_destroy); -static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, - struct inet_frag_queue *qp_in, - struct inet_frags *f, - void *arg) -{ - struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); - struct inet_frag_queue *qp; - -#ifdef CONFIG_SMP - /* With SMP race we have to recheck hash table, because - * such entry could have been created on other cpu before - * we acquired hash bucket lock. - */ - hlist_for_each_entry(qp, &hb->chain, list) { - if (qp->net == nf && f->match(qp, arg)) { - refcount_inc(&qp->refcnt); - spin_unlock(&hb->chain_lock); - qp_in->flags |= INET_FRAG_COMPLETE; - inet_frag_put(qp_in); - return qp; - } - } -#endif - qp = qp_in; - if (!mod_timer(&qp->timer, jiffies + nf->timeout)) - refcount_inc(&qp->refcnt); - - refcount_inc(&qp->refcnt); - hlist_add_head(&qp->list, &hb->chain); - - spin_unlock(&hb->chain_lock); - - return qp; -} - static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, struct inet_frags *f, void *arg) { struct inet_frag_queue *q; - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { - inet_frag_schedule_worker(f); + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) return NULL; - } q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (!q) @@ -374,59 +170,52 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); - refcount_set(&q->refcnt, 1); + refcount_set(&q->refcnt, 3); return q; } static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, - struct inet_frags *f, void *arg) { + struct inet_frags *f = nf->f; struct inet_frag_queue *q; + int err; q = inet_frag_alloc(nf, f, arg); if (!q) return NULL; - return inet_frag_intern(nf, q, f, arg); + mod_timer(&q->timer, jiffies + nf->timeout); + + err = rhashtable_insert_fast(&nf->rhashtable, &q->node, + f->rhash_params); + if (err < 0) { + q->flags |= INET_FRAG_COMPLETE; + inet_frag_kill(q); + inet_frag_destroy(q); + return NULL; + } + return q; } -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, - unsigned int hash) +/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - int depth = 0; - - if (frag_mem_limit(nf) > nf->low_thresh) - inet_frag_schedule_worker(f); - - hash &= (INETFRAGS_HASHSZ - 1); - hb = &f->hash[hash]; - - spin_lock(&hb->chain_lock); - hlist_for_each_entry(q, &hb->chain, list) { - if (q->net == nf && f->match(q, key)) { - refcount_inc(&q->refcnt); - spin_unlock(&hb->chain_lock); - return q; - } - depth++; - } - spin_unlock(&hb->chain_lock); + struct inet_frag_queue *fq; - if (depth <= INETFRAGS_MAXDEPTH) - return inet_frag_create(nf, f, key); + rcu_read_lock(); - if (inet_frag_may_rebuild(f)) { - if (!f->rebuild) - f->rebuild = true; - inet_frag_schedule_worker(f); + fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); + if (fq) { + if (!refcount_inc_not_zero(&fq->refcnt)) + fq = NULL; + rcu_read_unlock(); + return fq; } + rcu_read_unlock(); - return ERR_PTR(-ENOBUFS); + return inet_frag_create(nf, key); } EXPORT_SYMBOL(inet_frag_find); @@ -434,8 +223,7 @@ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, const char *prefix) { static const char msg[] = "inet_frag_find: Fragment hash bucket" - " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) - ". Dropping fragment.\n"; + " list length grew over limit. Dropping fragment.\n"; if (PTR_ERR(q) == -ENOBUFS) net_dbg_ratelimited("%s%s", prefix, msg); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 1a3bc85d6f5e..4021820db6f2 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -69,15 +69,9 @@ struct ipfrag_skb_cb struct ipq { struct inet_frag_queue q; - u32 user; - __be32 saddr; - __be32 daddr; - __be16 id; - u8 protocol; u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; - int vif; /* L3 master device index */ unsigned int rid; struct inet_peer *peer; }; @@ -97,41 +91,6 @@ int ip_frag_mem(struct net *net) static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct net_device *dev); -struct ip4_create_arg { - struct iphdr *iph; - u32 user; - int vif; -}; - -static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) -{ - net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); - return jhash_3words((__force u32)id << 16 | prot, - (__force u32)saddr, (__force u32)daddr, - ip4_frags.rnd); -} - -static unsigned int ip4_hashfn(const struct inet_frag_queue *q) -{ - const struct ipq *ipq; - - ipq = container_of(q, struct ipq, q); - return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); -} - -static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct ipq *qp; - const struct ip4_create_arg *arg = a; - - qp = container_of(q, struct ipq, q); - return qp->id == arg->iph->id && - qp->saddr == arg->iph->saddr && - qp->daddr == arg->iph->daddr && - qp->protocol == arg->iph->protocol && - qp->user == arg->user && - qp->vif == arg->vif; -} static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { @@ -140,17 +99,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) frags); struct net *net = container_of(ipv4, struct net, ipv4); - const struct ip4_create_arg *arg = a; + const struct frag_v4_compare_key *key = a; - qp->protocol = arg->iph->protocol; - qp->id = arg->iph->id; - qp->ecn = ip4_frag_ecn(arg->iph->tos); - qp->saddr = arg->iph->saddr; - qp->daddr = arg->iph->daddr; - qp->vif = arg->vif; - qp->user = arg->user; + q->key.v4 = *key; + qp->ecn = 0; qp->peer = q->net->max_dist ? - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : + inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : NULL; } @@ -234,7 +188,7 @@ static void ip_expire(struct timer_list *t) /* Only an end host needs to send an ICMP * "Fragment Reassembly Timeout" message, per RFC792. */ - if (frag_expire_skip_icmp(qp->user) && + if (frag_expire_skip_icmp(qp->q.key.v4.user) && (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; @@ -262,17 +216,17 @@ out_rcu_unlock: static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user, int vif) { + struct frag_v4_compare_key key = { + .saddr = iph->saddr, + .daddr = iph->daddr, + .user = user, + .vif = vif, + .id = iph->id, + .protocol = iph->protocol, + }; struct inet_frag_queue *q; - struct ip4_create_arg arg; - unsigned int hash; - - arg.iph = iph; - arg.user = user; - arg.vif = vif; - hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); - - q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); + q = inet_frag_find(&net->ipv4.frags, &key); if (IS_ERR_OR_NULL(q)) { inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; @@ -656,7 +610,7 @@ out_nomem: err = -ENOMEM; goto out_fail; out_oversize: - net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); + net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr); out_fail: __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); return err; @@ -894,15 +848,47 @@ static struct pernet_operations ip4_frags_ops = { .exit = ipv4_frags_exit_net, }; + +static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); +} + +static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v4, + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); +} + +static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v4_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static const struct rhashtable_params ip4_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .key_offset = offsetof(struct inet_frag_queue, key), + .key_len = sizeof(struct frag_v4_compare_key), + .hashfn = ip4_key_hashfn, + .obj_hashfn = ip4_obj_hashfn, + .obj_cmpfn = ip4_obj_cmpfn, + .automatic_shrinking = true, +}; + void __init ipfrag_init(void) { - ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; ip4_frags.qsize = sizeof(struct ipq); - ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; ip4_frags.frags_cache_name = ip_frag_cache_name; + ip4_frags.rhash_params = ip4_rhash_params; if (inet_frags_init(&ip4_frags)) panic("IP: failed to allocate ip4_frags cache\n"); ip4_frags_ctl_register(); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index c4b40fdee838..0ad3df551d98 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -152,23 +152,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } -static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd)); - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, nf_frags.rnd); -} - - -static unsigned int nf_hashfn(const struct inet_frag_queue *q) -{ - const struct frag_queue *nq; - - nq = container_of(q, struct frag_queue, q); - return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); -} - static void nf_ct_frag6_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); @@ -182,26 +165,19 @@ static void nf_ct_frag6_expire(struct timer_list *t) } /* Creation primitives. */ -static inline struct frag_queue *fq_find(struct net *net, __be32 id, - u32 user, struct in6_addr *src, - struct in6_addr *dst, int iif, u8 ecn) +static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, + const struct ipv6hdr *hdr, int iif) { + struct frag_v6_compare_key key = { + .id = id, + .saddr = hdr->saddr, + .daddr = hdr->daddr, + .user = user, + .iif = iif, + }; struct inet_frag_queue *q; - struct ip6_create_arg arg; - unsigned int hash; - - arg.id = id; - arg.user = user; - arg.src = src; - arg.dst = dst; - arg.iif = iif; - arg.ecn = ecn; - - local_bh_disable(); - hash = nf_hash_frag(id, src, dst); - q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); - local_bh_enable(); + q = inet_frag_find(&net->nf_frag.frags, &key); if (IS_ERR_OR_NULL(q)) { inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; @@ -593,8 +569,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) fhdr = (struct frag_hdr *)skb_transport_header(skb); skb_orphan(skb); - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); + fq = fq_find(net, fhdr->identification, user, hdr, + skb->dev ? skb->dev->ifindex : 0); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); return -ENOMEM; @@ -660,13 +636,12 @@ int nf_ct_frag6_init(void) { int ret = 0; - nf_frags.hashfn = nf_hashfn; nf_frags.constructor = ip6_frag_init; nf_frags.destructor = NULL; nf_frags.qsize = sizeof(struct frag_queue); - nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frags_cache_name = nf_frags_cache_name; + nf_frags.rhash_params = ip6_rhash_params; ret = inet_frags_init(&nf_frags); if (ret) goto out; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index f0071b113a92..3fc853e4492a 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -79,52 +79,13 @@ static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev); -/* - * callers should be careful not to use the hash value outside the ipfrag_lock - * as doing so could race with ipfrag_hash_rnd being recalculated. - */ -static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd)); - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, ip6_frags.rnd); -} - -static unsigned int ip6_hashfn(const struct inet_frag_queue *q) -{ - const struct frag_queue *fq; - - fq = container_of(q, struct frag_queue, q); - return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr); -} - -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct frag_queue *fq; - const struct ip6_create_arg *arg = a; - - fq = container_of(q, struct frag_queue, q); - return fq->id == arg->id && - fq->user == arg->user && - ipv6_addr_equal(&fq->saddr, arg->src) && - ipv6_addr_equal(&fq->daddr, arg->dst) && - (arg->iif == fq->iif || - !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST | - IPV6_ADDR_LINKLOCAL))); -} -EXPORT_SYMBOL(ip6_frag_match); - void ip6_frag_init(struct inet_frag_queue *q, const void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); - const struct ip6_create_arg *arg = a; + const struct frag_v6_compare_key *key = a; - fq->id = arg->id; - fq->user = arg->user; - fq->saddr = *arg->src; - fq->daddr = *arg->dst; - fq->ecn = arg->ecn; + q->key.v6 = *key; + fq->ecn = 0; } EXPORT_SYMBOL(ip6_frag_init); @@ -182,23 +143,22 @@ static void ip6_frag_expire(struct timer_list *t) } static struct frag_queue * -fq_find(struct net *net, __be32 id, const struct in6_addr *src, - const struct in6_addr *dst, int iif, u8 ecn) +fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) { + struct frag_v6_compare_key key = { + .id = id, + .saddr = hdr->saddr, + .daddr = hdr->daddr, + .user = IP6_DEFRAG_LOCAL_DELIVER, + .iif = iif, + }; struct inet_frag_queue *q; - struct ip6_create_arg arg; - unsigned int hash; - - arg.id = id; - arg.user = IP6_DEFRAG_LOCAL_DELIVER; - arg.src = src; - arg.dst = dst; - arg.iif = iif; - arg.ecn = ecn; - hash = inet6_hash_frag(id, src, dst); + if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | + IPV6_ADDR_LINKLOCAL))) + key.iif = 0; - q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); + q = inet_frag_find(&net->ipv6.frags, &key); if (IS_ERR_OR_NULL(q)) { inet_frag_maybe_warn_overflow(q, pr_fmt()); return NULL; @@ -530,6 +490,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct frag_queue *fq; const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); + int iif; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; @@ -558,13 +519,14 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } - fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); + iif = skb->dev ? skb->dev->ifindex : 0; + fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { int ret; spin_lock(&fq->q.lock); + fq->iif = iif; ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); spin_unlock(&fq->q.lock); @@ -738,17 +700,47 @@ static struct pernet_operations ip6_frags_ops = { .exit = ipv6_frags_exit_net, }; +static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v6, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v6_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +const struct rhashtable_params ip6_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .hashfn = ip6_key_hashfn, + .obj_hashfn = ip6_obj_hashfn, + .obj_cmpfn = ip6_obj_cmpfn, + .automatic_shrinking = true, +}; +EXPORT_SYMBOL(ip6_rhash_params); + int __init ipv6_frag_init(void) { int ret; - ip6_frags.hashfn = ip6_hashfn; ip6_frags.constructor = ip6_frag_init; ip6_frags.destructor = NULL; ip6_frags.qsize = sizeof(struct frag_queue); - ip6_frags.match = ip6_frag_match; ip6_frags.frag_expire = ip6_frag_expire; ip6_frags.frags_cache_name = ip6_frag_cache_name; + ip6_frags.rhash_params = ip6_rhash_params; ret = inet_frags_init(&ip6_frags); if (ret) goto out; -- cgit v1.2.3-70-g09d2 From 6befe4a78b1553edb6eed3a78b4bcd9748526672 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:50 -0700 Subject: inet: frags: remove some helpers Remove sum_frag_mem_limit(), ip_frag_mem() & ip6_frag_mem() Also since we use rhashtable we can bring back the number of fragments in "grep FRAG /proc/net/sockstat /proc/net/sockstat6" that was removed in commit 434d305405ab ("inet: frag: don't account number of fragment queues") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_frag.h | 5 ----- include/net/ip.h | 1 - include/net/ipv6.h | 7 ------- net/ipv4/ip_fragment.c | 5 ----- net/ipv4/proc.c | 6 +++--- net/ipv6/proc.c | 5 +++-- 6 files changed, 6 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 3fec0d3a0d01..4b5449df0aad 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -141,11 +141,6 @@ static inline void add_frag_mem_limit(struct netns_frags *nf, int i) atomic_add(i, &nf->mem); } -static inline int sum_frag_mem_limit(struct netns_frags *nf) -{ - return atomic_read(&nf->mem); -} - /* RFC 3168 support : * We want to check ECN values of all fragments, do detect invalid combinations. * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. diff --git a/include/net/ip.h b/include/net/ip.h index 36f8f7811093..ecffd843e7b8 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -588,7 +588,6 @@ static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *s return skb; } #endif -int ip_frag_mem(struct net *net); /* * Functions provided by ip_forward.c diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 6fa9a2bc5896..37455e840347 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -379,13 +379,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev) idev->cnf.accept_ra; } -#if IS_ENABLED(CONFIG_IPV6) -static inline int ip6_frag_mem(struct net *net) -{ - return sum_frag_mem_limit(&net->ipv6.frags); -} -#endif - #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ #define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */ #define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */ diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 4021820db6f2..44f4fa306e22 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -83,11 +83,6 @@ static u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; -int ip_frag_mem(struct net *net) -{ - return sum_frag_mem_limit(&net->ipv4.frags); -} - static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct net_device *dev); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index adfb75340275..aacfce0d7d82 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -54,7 +54,6 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; - unsigned int frag_mem; int orphans, sockets; orphans = percpu_counter_sum_positive(&tcp_orphan_count); @@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); - frag_mem = ip_frag_mem(net); - seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem); + seq_printf(seq, "FRAG: inuse %u memory %u\n", + atomic_read(&net->ipv4.frags.rhashtable.nelems), + frag_mem_limit(&net->ipv4.frags)); return 0; } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 6e57028d2e91..8befeb91e071 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -38,7 +38,6 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; - unsigned int frag_mem = ip6_frag_mem(net); seq_printf(seq, "TCP6: inuse %d\n", sock_prot_inuse_get(net, &tcpv6_prot)); @@ -48,7 +47,9 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); - seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem); + seq_printf(seq, "FRAG6: inuse %u memory %u\n", + atomic_read(&net->ipv6.frags.rhashtable.nelems), + frag_mem_limit(&net->ipv6.frags)); return 0; } -- cgit v1.2.3-70-g09d2 From 399d1404be660d355192ff4df5ccc3f4159ec1e4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:51 -0700 Subject: inet: frags: get rif of inet_frag_evicting() This refactors ip_expire() since one indentation level is removed. Note: in the future, we should try hard to avoid the skb_clone() since this is a serious performance cost. Under DDOS, the ICMP message wont be sent because of rate limits. Fact that ip6_expire_frag_queue() does not use skb_clone() is disturbing too. Presumably IPv6 should have the same issue than the one we fixed in commit ec4fbd64751d ("inet: frag: release spinlock before calling icmp_send()") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_frag.h | 5 ---- net/ipv4/ip_fragment.c | 65 ++++++++++++++++++++++++------------------------- net/ipv6/reassembly.c | 4 --- 3 files changed, 32 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 4b5449df0aad..0e8e159d88f7 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -119,11 +119,6 @@ static inline void inet_frag_put(struct inet_frag_queue *q) inet_frag_destroy(q); } -static inline bool inet_frag_evicting(struct inet_frag_queue *q) -{ - return false; -} - /* Memory Tracking Functions. */ static inline int frag_mem_limit(struct netns_frags *nf) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 44f4fa306e22..b844f517b75b 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -143,8 +143,11 @@ static bool frag_expire_skip_icmp(u32 user) static void ip_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); - struct ipq *qp; + struct sk_buff *clone, *head; + const struct iphdr *iph; struct net *net; + struct ipq *qp; + int err; qp = container_of(frag, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); @@ -158,45 +161,41 @@ static void ip_expire(struct timer_list *t) ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - if (!inet_frag_evicting(&qp->q)) { - struct sk_buff *clone, *head = qp->q.fragments; - const struct iphdr *iph; - int err; + head = qp->q.fragments; - __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); + __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); - if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) - goto out; + if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head) + goto out; - head->dev = dev_get_by_index_rcu(net, qp->iif); - if (!head->dev) - goto out; + head->dev = dev_get_by_index_rcu(net, qp->iif); + if (!head->dev) + goto out; - /* skb has no dst, perform route lookup again */ - iph = ip_hdr(head); - err = ip_route_input_noref(head, iph->daddr, iph->saddr, + /* skb has no dst, perform route lookup again */ + iph = ip_hdr(head); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, iph->tos, head->dev); - if (err) - goto out; + if (err) + goto out; - /* Only an end host needs to send an ICMP - * "Fragment Reassembly Timeout" message, per RFC792. - */ - if (frag_expire_skip_icmp(qp->q.key.v4.user) && - (skb_rtable(head)->rt_type != RTN_LOCAL)) - goto out; - - clone = skb_clone(head, GFP_ATOMIC); - - /* Send an ICMP "Fragment Reassembly Timeout" message. */ - if (clone) { - spin_unlock(&qp->q.lock); - icmp_send(clone, ICMP_TIME_EXCEEDED, - ICMP_EXC_FRAGTIME, 0); - consume_skb(clone); - goto out_rcu_unlock; - } + /* Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. + */ + if (frag_expire_skip_icmp(qp->q.key.v4.user) && + (skb_rtable(head)->rt_type != RTN_LOCAL)) + goto out; + + clone = skb_clone(head, GFP_ATOMIC); + + /* Send an ICMP "Fragment Reassembly Timeout" message. */ + if (clone) { + spin_unlock(&qp->q.lock); + icmp_send(clone, ICMP_TIME_EXCEEDED, + ICMP_EXC_FRAGTIME, 0); + consume_skb(clone); + goto out_rcu_unlock; } out: spin_unlock(&qp->q.lock); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 3fc853e4492a..70acad126d04 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -106,10 +106,6 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) goto out_rcu_unlock; __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); - - if (inet_frag_evicting(&fq->q)) - goto out_rcu_unlock; - __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); /* Don't send error if the first segment did not arrive. */ -- cgit v1.2.3-70-g09d2 From 2d44ed22e607f9a285b049de2263e3840673a260 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:52 -0700 Subject: inet: frags: remove inet_frag_maybe_warn_overflow() This function is obsolete, after rhashtable addition to inet defrag. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_frag.h | 2 -- net/ieee802154/6lowpan/reassembly.c | 5 ++--- net/ipv4/inet_fragment.c | 11 ----------- net/ipv4/ip_fragment.c | 5 ++--- net/ipv6/netfilter/nf_conntrack_reasm.c | 5 ++--- net/ipv6/reassembly.c | 5 ++--- 6 files changed, 8 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 0e8e159d88f7..95e353e3305b 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -110,8 +110,6 @@ void inet_frags_exit_net(struct netns_frags *nf); void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key); -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, - const char *prefix); static inline void inet_frag_put(struct inet_frag_queue *q) { diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 0fa0121f85d4..1aec71a3f904 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -84,10 +84,9 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, struct inet_frag_queue *q; q = inet_frag_find(&ieee802154_lowpan->frags, &key); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + if (!q) return NULL; - } + return container_of(q, struct lowpan_frag_queue, q); } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index ebb8f411e0db..c9e35b81d093 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -218,14 +218,3 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) return inet_frag_create(nf, key); } EXPORT_SYMBOL(inet_frag_find); - -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, - const char *prefix) -{ - static const char msg[] = "inet_frag_find: Fragment hash bucket" - " list length grew over limit. Dropping fragment.\n"; - - if (PTR_ERR(q) == -ENOBUFS) - net_dbg_ratelimited("%s%s", prefix, msg); -} -EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b844f517b75b..b0366224f314 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -221,10 +221,9 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, struct inet_frag_queue *q; q = inet_frag_find(&net->ipv4.frags, &key); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + if (!q) return NULL; - } + return container_of(q, struct ipq, q); } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 0ad3df551d98..d866412b8f6c 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -178,10 +178,9 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, struct inet_frag_queue *q; q = inet_frag_find(&net->nf_frag.frags, &key); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + if (!q) return NULL; - } + return container_of(q, struct frag_queue, q); } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 70acad126d04..2a77fda5e3bc 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -155,10 +155,9 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) key.iif = 0; q = inet_frag_find(&net->ipv6.frags, &key); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + if (!q) return NULL; - } + return container_of(q, struct frag_queue, q); } -- cgit v1.2.3-70-g09d2 From 3e67f106f619dcfaf6f4e2039599bdb69848c714 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:53 -0700 Subject: inet: frags: break the 2GB limit for frags storage Some users are willing to provision huge amounts of memory to be able to perform reassembly reasonnably well under pressure. Current memory tracking is using one atomic_t and integers. Switch to atomic_long_t so that 64bit arches can use more than 2GB, without any cost for 32bit arches. Note that this patch avoids an overflow error, if high_thresh was set to ~2GB, since this test in inet_frag_alloc() was never true : if (... || frag_mem_limit(nf) > nf->high_thresh) Tested: $ echo 16000000000 >/proc/sys/net/ipv4/ipfrag_high_thresh $ grep FRAG /proc/net/sockstat FRAG: inuse 14705885 memory 16000002880 $ nstat -n ; sleep 1 ; nstat | grep Reas IpReasmReqds 3317150 0.0 IpReasmFails 3317112 0.0 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 4 ++-- include/net/inet_frag.h | 20 ++++++++++---------- net/ieee802154/6lowpan/reassembly.c | 10 +++++----- net/ipv4/ip_fragment.c | 10 +++++----- net/ipv4/proc.c | 2 +- net/ipv6/netfilter/nf_conntrack_reasm.c | 10 +++++----- net/ipv6/proc.c | 2 +- net/ipv6/reassembly.c | 6 +++--- 8 files changed, 32 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 6f2a3670e44b..5dc1a040a2f1 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -133,10 +133,10 @@ min_adv_mss - INTEGER IP Fragmentation: -ipfrag_high_thresh - INTEGER +ipfrag_high_thresh - LONG INTEGER Maximum memory used to reassemble IP fragments. -ipfrag_low_thresh - INTEGER +ipfrag_low_thresh - LONG INTEGER (Obsolete since linux-4.17) Maximum memory used to reassemble IP fragments before the kernel begins to remove incomplete fragment queues to free up resources. diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 95e353e3305b..a52e7273e7a5 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -8,11 +8,11 @@ struct netns_frags { struct rhashtable rhashtable ____cacheline_aligned_in_smp; /* Keep atomic mem on separate cachelines in structs that include it */ - atomic_t mem ____cacheline_aligned_in_smp; + atomic_long_t mem ____cacheline_aligned_in_smp; /* sysctls */ + long high_thresh; + long low_thresh; int timeout; - int high_thresh; - int low_thresh; int max_dist; struct inet_frags *f; }; @@ -102,7 +102,7 @@ void inet_frags_fini(struct inet_frags *); static inline int inet_frags_init_net(struct netns_frags *nf) { - atomic_set(&nf->mem, 0); + atomic_long_set(&nf->mem, 0); return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params); } void inet_frags_exit_net(struct netns_frags *nf); @@ -119,19 +119,19 @@ static inline void inet_frag_put(struct inet_frag_queue *q) /* Memory Tracking Functions. */ -static inline int frag_mem_limit(struct netns_frags *nf) +static inline long frag_mem_limit(const struct netns_frags *nf) { - return atomic_read(&nf->mem); + return atomic_long_read(&nf->mem); } -static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) +static inline void sub_frag_mem_limit(struct netns_frags *nf, long val) { - atomic_sub(i, &nf->mem); + atomic_long_sub(val, &nf->mem); } -static inline void add_frag_mem_limit(struct netns_frags *nf, int i) +static inline void add_frag_mem_limit(struct netns_frags *nf, long val) { - atomic_add(i, &nf->mem); + atomic_long_add(val, &nf->mem); } /* RFC 3168 support : diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 1aec71a3f904..44f148a6bb57 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -411,23 +411,23 @@ err: } #ifdef CONFIG_SYSCTL -static int zero; +static long zero; static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", .data = &init_net.ieee802154_lowpan.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh }, { .procname = "6lowpanfrag_low_thresh", .data = &init_net.ieee802154_lowpan.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh }, diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b0366224f314..053869f2c49b 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -678,23 +678,23 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) EXPORT_SYMBOL(ip_check_defrag); #ifdef CONFIG_SYSCTL -static int zero; +static long zero; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", .data = &init_net.ipv4.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", .data = &init_net.ipv4.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, .extra2 = &init_net.ipv4.frags.high_thresh }, diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index aacfce0d7d82..a058de677e94 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -71,7 +71,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); - seq_printf(seq, "FRAG: inuse %u memory %u\n", + seq_printf(seq, "FRAG: inuse %u memory %lu\n", atomic_read(&net->ipv4.frags.rhashtable.nelems), frag_mem_limit(&net->ipv4.frags)); return 0; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index d866412b8f6c..603a39592859 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -63,7 +63,7 @@ struct nf_ct_frag6_skb_cb static struct inet_frags nf_frags; #ifdef CONFIG_SYSCTL -static int zero; +static long zero; static struct ctl_table nf_ct_frag6_sysctl_table[] = { { @@ -76,18 +76,18 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_low_thresh", .data = &init_net.nf_frag.frags.low_thresh, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", .data = &init_net.nf_frag.frags.high_thresh, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.nf_frag.frags.low_thresh }, { } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 8befeb91e071..a85f7e0b14b1 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -47,7 +47,7 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); - seq_printf(seq, "FRAG6: inuse %u memory %u\n", + seq_printf(seq, "FRAG6: inuse %u memory %lu\n", atomic_read(&net->ipv6.frags.rhashtable.nelems), frag_mem_limit(&net->ipv6.frags)); return 0; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 2a77fda5e3bc..905a8aee2671 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -552,15 +552,15 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", .data = &init_net.ipv6.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = "ip6frag_low_thresh", .data = &init_net.ipv6.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, -- cgit v1.2.3-70-g09d2 From 1eec5d5670084ee644597bd26c25e22c69b9f748 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:54 -0700 Subject: inet: frags: do not clone skb in ip_expire() An skb_clone() was added in commit ec4fbd64751d ("inet: frag: release spinlock before calling icmp_send()") While fixing the bug at that time, it also added a very high cost for DDOS frags, as the ICMP rate limit is applied after this expensive operation (skb_clone() + consume_skb(), implying memory allocations, copy, and freeing) We can use skb_get(head) here, all we want is to make sure skb wont be freed by another cpu. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 053869f2c49b..fb185d9a5cc7 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -143,8 +143,8 @@ static bool frag_expire_skip_icmp(u32 user) static void ip_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); - struct sk_buff *clone, *head; const struct iphdr *iph; + struct sk_buff *head; struct net *net; struct ipq *qp; int err; @@ -187,16 +187,12 @@ static void ip_expire(struct timer_list *t) (skb_rtable(head)->rt_type != RTN_LOCAL)) goto out; - clone = skb_clone(head, GFP_ATOMIC); + skb_get(head); + spin_unlock(&qp->q.lock); + icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); + kfree_skb(head); + goto out_rcu_unlock; - /* Send an ICMP "Fragment Reassembly Timeout" message. */ - if (clone) { - spin_unlock(&qp->q.lock); - icmp_send(clone, ICMP_TIME_EXCEEDED, - ICMP_EXC_FRAGTIME, 0); - consume_skb(clone); - goto out_rcu_unlock; - } out: spin_unlock(&qp->q.lock); out_rcu_unlock: -- cgit v1.2.3-70-g09d2 From 05c0b86b9696802fd0ce5676a92a63f1b455bdf3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:55 -0700 Subject: ipv6: frags: rewrite ip6_expire_frag_queue() Make it similar to IPv4 ip_expire(), and release the lock before calling icmp functions. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/reassembly.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 905a8aee2671..2127da130dc2 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -92,7 +92,9 @@ EXPORT_SYMBOL(ip6_frag_init); void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) { struct net_device *dev = NULL; + struct sk_buff *head; + rcu_read_lock(); spin_lock(&fq->q.lock); if (fq->q.flags & INET_FRAG_COMPLETE) @@ -100,28 +102,34 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) inet_frag_kill(&fq->q); - rcu_read_lock(); dev = dev_get_by_index_rcu(net, fq->iif); if (!dev) - goto out_rcu_unlock; + goto out; __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); /* Don't send error if the first segment did not arrive. */ - if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments) - goto out_rcu_unlock; + head = fq->q.fragments; + if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) + goto out; /* But use as source device on which LAST ARRIVED * segment was received. And do not use fq->dev * pointer directly, device might already disappeared. */ - fq->q.fragments->dev = dev; - icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); -out_rcu_unlock: - rcu_read_unlock(); + head->dev = dev; + skb_get(head); + spin_unlock(&fq->q.lock); + + icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); + kfree_skb(head); + goto out_rcu_unlock; + out: spin_unlock(&fq->q.lock); +out_rcu_unlock: + rcu_read_unlock(); inet_frag_put(&fq->q); } EXPORT_SYMBOL(ip6_expire_frag_queue); -- cgit v1.2.3-70-g09d2 From bf66337140c64c27fa37222b7abca7e49d63fb57 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:58 -0700 Subject: inet: frags: get rid of ipfrag_skb_cb/FRAG_CB ip_defrag uses skb->cb[] to store the fragment offset, and unfortunately this integer is currently in a different cache line than skb->next, meaning that we use two cache lines per skb when finding the insertion point. By aliasing skb->ip_defrag_offset and skb->dev, we pack all the fields in a single cache line and save precious memory bandwidth. Note that after the fast path added by Changli Gao in commit d6bebca92c66 ("fragment: add fast path for in-order fragments") this change wont help the fast path, since we still need to access prev->len (2nd cache line), but will show great benefits when slow path is entered, since we perform a linear scan of a potentially long list. Also, note that this potential long list is an attack vector, we might consider also using an rb-tree there eventually. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + net/ipv4/ip_fragment.c | 35 ++++++++++++++--------------------- 2 files changed, 15 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 47082f54ec1f..9065477ed255 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -672,6 +672,7 @@ struct sk_buff { * UDP receive path is one user. */ unsigned long dev_scratch; + int ip_defrag_offset; }; }; struct rb_node rbnode; /* used in netem & tcp stack */ diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index fb185d9a5cc7..994fa70a910f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -57,14 +57,6 @@ */ static const char ip_frag_cache_name[] = "ip4-frags"; -struct ipfrag_skb_cb -{ - struct inet_skb_parm h; - int offset; -}; - -#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) - /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; @@ -353,13 +345,13 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * this fragment, right? */ prev = qp->q.fragments_tail; - if (!prev || FRAG_CB(prev)->offset < offset) { + if (!prev || prev->ip_defrag_offset < offset) { next = NULL; goto found; } prev = NULL; for (next = qp->q.fragments; next != NULL; next = next->next) { - if (FRAG_CB(next)->offset >= offset) + if (next->ip_defrag_offset >= offset) break; /* bingo! */ prev = next; } @@ -370,7 +362,7 @@ found: * any overlaps are eliminated. */ if (prev) { - int i = (FRAG_CB(prev)->offset + prev->len) - offset; + int i = (prev->ip_defrag_offset + prev->len) - offset; if (i > 0) { offset += i; @@ -387,8 +379,8 @@ found: err = -ENOMEM; - while (next && FRAG_CB(next)->offset < end) { - int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */ + while (next && next->ip_defrag_offset < end) { + int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */ if (i < next->len) { /* Eat head of the next overlapped fragment @@ -396,7 +388,7 @@ found: */ if (!pskb_pull(next, i)) goto err; - FRAG_CB(next)->offset += i; + next->ip_defrag_offset += i; qp->q.meat -= i; if (next->ip_summed != CHECKSUM_UNNECESSARY) next->ip_summed = CHECKSUM_NONE; @@ -420,7 +412,13 @@ found: } } - FRAG_CB(skb)->offset = offset; + /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + dev = skb->dev; + if (dev) + qp->iif = dev->ifindex; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + skb->ip_defrag_offset = offset; /* Insert this fragment in the chain of fragments. */ skb->next = next; @@ -431,11 +429,6 @@ found: else qp->q.fragments = skb; - dev = skb->dev; - if (dev) { - qp->iif = dev->ifindex; - skb->dev = NULL; - } qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; @@ -511,7 +504,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, } WARN_ON(!head); - WARN_ON(FRAG_CB(head)->offset != 0); + WARN_ON(head->ip_defrag_offset != 0); /* Allocate a new buffer for the datagram. */ ihlen = ip_hdrlen(head); -- cgit v1.2.3-70-g09d2 From 219badfaade986a2c3d99abd4eb6d83f4f9ed2fb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:59 -0700 Subject: ipv6: frags: get rid of ip6frag_skb_cb/FRAG6_CB ip6_frag_queue uses skb->cb[] to store the fragment offset, meaning that we could use two cache lines per skb when finding the insertion point, if for some reason inet6_skb_parm size is increased in the future. By using skb->ip_defrag_offset instead of skb->cb[], we pack all the fields in a single cache line, matching what we did for IPv4. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/reassembly.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 2127da130dc2..7b52efc63f6a 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -62,13 +62,6 @@ static const char ip6_frag_cache_name[] = "ip6-frags"; -struct ip6frag_skb_cb { - struct inet6_skb_parm h; - int offset; -}; - -#define FRAG6_CB(skb) ((struct ip6frag_skb_cb *)((skb)->cb)) - static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); @@ -250,13 +243,13 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, * this fragment, right? */ prev = fq->q.fragments_tail; - if (!prev || FRAG6_CB(prev)->offset < offset) { + if (!prev || prev->ip_defrag_offset < offset) { next = NULL; goto found; } prev = NULL; for (next = fq->q.fragments; next != NULL; next = next->next) { - if (FRAG6_CB(next)->offset >= offset) + if (next->ip_defrag_offset >= offset) break; /* bingo! */ prev = next; } @@ -271,14 +264,20 @@ found: /* Check for overlap with preceding fragment. */ if (prev && - (FRAG6_CB(prev)->offset + prev->len) > offset) + (prev->ip_defrag_offset + prev->len) > offset) goto discard_fq; /* Look for overlap with succeeding segment. */ - if (next && FRAG6_CB(next)->offset < end) + if (next && next->ip_defrag_offset < end) goto discard_fq; - FRAG6_CB(skb)->offset = offset; + /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + dev = skb->dev; + if (dev) + fq->iif = dev->ifindex; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + skb->ip_defrag_offset = offset; /* Insert this fragment in the chain of fragments. */ skb->next = next; @@ -289,11 +288,6 @@ found: else fq->q.fragments = skb; - dev = skb->dev; - if (dev) { - fq->iif = dev->ifindex; - skb->dev = NULL; - } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; @@ -380,7 +374,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, } WARN_ON(head == NULL); - WARN_ON(FRAG6_CB(head)->offset != 0); + WARN_ON(head->ip_defrag_offset != 0); /* Unfragmented part is taken from the first segment. */ payload_len = ((head->data - skb_network_header(head)) - -- cgit v1.2.3-70-g09d2 From f2d1c724fca176ddbd32a9397df49221afbcb227 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:59:00 -0700 Subject: inet: frags: get rid of nf_ct_frag6_skb_cb/NFCT_FRAG6_CB nf_ct_frag6_queue() uses skb->cb[] to store the fragment offset, meaning that we could use two cache lines per skb when finding the insertion point, if for some reason inet6_skb_parm size is increased in the future. By using skb->ip_defrag_offset instead of skb->cb[] we pack all the fields in a single cache line, matching what we did for IPv4. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 603a39592859..3622aac343ae 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -52,14 +52,6 @@ static const char nf_frags_cache_name[] = "nf-frags"; -struct nf_ct_frag6_skb_cb -{ - struct inet6_skb_parm h; - int offset; -}; - -#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb *)((skb)->cb)) - static struct inet_frags nf_frags; #ifdef CONFIG_SYSCTL @@ -270,13 +262,13 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this fragment, right? */ prev = fq->q.fragments_tail; - if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) { + if (!prev || prev->ip_defrag_offset < offset) { next = NULL; goto found; } prev = NULL; for (next = fq->q.fragments; next != NULL; next = next->next) { - if (NFCT_FRAG6_CB(next)->offset >= offset) + if (next->ip_defrag_offset >= offset) break; /* bingo! */ prev = next; } @@ -292,14 +284,19 @@ found: /* Check for overlap with preceding fragment. */ if (prev && - (NFCT_FRAG6_CB(prev)->offset + prev->len) > offset) + (prev->ip_defrag_offset + prev->len) > offset) goto discard_fq; /* Look for overlap with succeeding segment. */ - if (next && NFCT_FRAG6_CB(next)->offset < end) + if (next && next->ip_defrag_offset < end) goto discard_fq; - NFCT_FRAG6_CB(skb)->offset = offset; + /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + if (skb->dev) + fq->iif = skb->dev->ifindex; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + skb->ip_defrag_offset = offset; /* Insert this fragment in the chain of fragments. */ skb->next = next; @@ -310,10 +307,6 @@ found: else fq->q.fragments = skb; - if (skb->dev) { - fq->iif = skb->dev->ifindex; - skb->dev = NULL; - } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; @@ -357,7 +350,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic inet_frag_kill(&fq->q); WARN_ON(head == NULL); - WARN_ON(NFCT_FRAG6_CB(head)->offset != 0); + WARN_ON(head->ip_defrag_offset != 0); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) -- cgit v1.2.3-70-g09d2 From dd0bed1665d6ca17efd747a90a0bb804b4bf2005 Mon Sep 17 00:00:00 2001 From: Atul Gupta Date: Sat, 31 Mar 2018 21:41:52 +0530 Subject: tls: support for Inline tls record Facility to register Inline TLS drivers to net/tls. Setup TLS_HW_RECORD prot to listen on offload device. Cases handled - Inline TLS device exists, setup prot for TLS_HW_RECORD - Atleast one Inline TLS exists, sets TLS_HW_RECORD. - If non-inline device establish connection, move to TLS_SW_TX Signed-off-by: Atul Gupta Reviewed-by: Steve Wise Signed-off-by: David S. Miller --- include/net/tls.h | 32 ++++++++++++++- net/tls/tls_main.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 142 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 437a746300bf..3da8e13a6d96 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -56,6 +56,32 @@ #define TLS_RECORD_TYPE_DATA 0x17 #define TLS_AAD_SPACE_SIZE 13 +#define TLS_DEVICE_NAME_MAX 32 + +/* + * This structure defines the routines for Inline TLS driver. + * The following routines are optional and filled with a + * null pointer if not defined. + * + * @name: Its the name of registered Inline tls device + * @dev_list: Inline tls device list + * int (*feature)(struct tls_device *device); + * Called to return Inline TLS driver capability + * + * int (*hash)(struct tls_device *device, struct sock *sk); + * This function sets Inline driver for listen and program + * device specific functioanlity as required + * + * void (*unhash)(struct tls_device *device, struct sock *sk); + * This function cleans listen state set by Inline TLS driver + */ +struct tls_device { + char name[TLS_DEVICE_NAME_MAX]; + struct list_head dev_list; + int (*feature)(struct tls_device *device); + int (*hash)(struct tls_device *device, struct sock *sk); + void (*unhash)(struct tls_device *device, struct sock *sk); +}; struct tls_sw_context { struct crypto_aead *aead_send; @@ -114,7 +140,7 @@ struct tls_context { void *priv_ctx; - u8 conf:2; + u8 conf:3; struct cipher_context tx; struct cipher_context rx; @@ -135,6 +161,8 @@ struct tls_context { int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); + int (*hash)(struct sock *sk); + void (*unhash)(struct sock *sk); }; int wait_on_pending_writer(struct sock *sk, long *timeo); @@ -283,5 +311,7 @@ static inline struct tls_offload_context *tls_offload_ctx( int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); +void tls_register_device(struct tls_device *device); +void tls_unregister_device(struct tls_device *device); #endif /* _TLS_OFFLOAD_H */ diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 6f5c1146da4a..0d379970960e 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -56,11 +57,14 @@ enum { TLS_SW_TX, TLS_SW_RX, TLS_SW_RXTX, + TLS_HW_RECORD, TLS_NUM_CONFIG, }; static struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); +static LIST_HEAD(device_list); +static DEFINE_MUTEX(device_mutex); static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG]; static struct proto_ops tls_sw_proto_ops; @@ -241,8 +245,12 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) lock_sock(sk); sk_proto_close = ctx->sk_proto_close; + if (ctx->conf == TLS_HW_RECORD) + goto skip_tx_cleanup; + if (ctx->conf == TLS_BASE) { kfree(ctx); + ctx = NULL; goto skip_tx_cleanup; } @@ -276,6 +284,11 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) skip_tx_cleanup: release_sock(sk); sk_proto_close(sk, timeout); + /* free ctx for TLS_HW_RECORD, used by tcp_set_state + * for sk->sk_prot->unhash [tls_hw_unhash] + */ + if (ctx && ctx->conf == TLS_HW_RECORD) + kfree(ctx); } static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, @@ -493,6 +506,79 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, return do_tls_setsockopt(sk, optname, optval, optlen); } +static struct tls_context *create_ctx(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tls_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + icsk->icsk_ulp_data = ctx; + return ctx; +} + +static int tls_hw_prot(struct sock *sk) +{ + struct tls_context *ctx; + struct tls_device *dev; + int rc = 0; + + mutex_lock(&device_mutex); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->feature && dev->feature(dev)) { + ctx = create_ctx(sk); + if (!ctx) + goto out; + + ctx->hash = sk->sk_prot->hash; + ctx->unhash = sk->sk_prot->unhash; + ctx->sk_proto_close = sk->sk_prot->close; + ctx->conf = TLS_HW_RECORD; + update_sk_prot(sk, ctx); + rc = 1; + break; + } + } +out: + mutex_unlock(&device_mutex); + return rc; +} + +static void tls_hw_unhash(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_device *dev; + + mutex_lock(&device_mutex); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->unhash) + dev->unhash(dev, sk); + } + mutex_unlock(&device_mutex); + ctx->unhash(sk); +} + +static int tls_hw_hash(struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_device *dev; + int err; + + err = ctx->hash(sk); + mutex_lock(&device_mutex); + list_for_each_entry(dev, &device_list, dev_list) { + if (dev->hash) + err |= dev->hash(dev, sk); + } + mutex_unlock(&device_mutex); + + if (err) + tls_hw_unhash(sk); + return err; +} + static void build_protos(struct proto *prot, struct proto *base) { prot[TLS_BASE] = *base; @@ -511,15 +597,22 @@ static void build_protos(struct proto *prot, struct proto *base) prot[TLS_SW_RXTX] = prot[TLS_SW_TX]; prot[TLS_SW_RXTX].recvmsg = tls_sw_recvmsg; prot[TLS_SW_RXTX].close = tls_sk_proto_close; + + prot[TLS_HW_RECORD] = *base; + prot[TLS_HW_RECORD].hash = tls_hw_hash; + prot[TLS_HW_RECORD].unhash = tls_hw_unhash; + prot[TLS_HW_RECORD].close = tls_sk_proto_close; } static int tls_init(struct sock *sk) { int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; - struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx; int rc = 0; + if (tls_hw_prot(sk)) + goto out; + /* The TLS ulp is currently supported only for TCP sockets * in ESTABLISHED state. * Supporting sockets in LISTEN state will require us @@ -530,12 +623,11 @@ static int tls_init(struct sock *sk) return -ENOTSUPP; /* allocate tls context */ - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = create_ctx(sk); if (!ctx) { rc = -ENOMEM; goto out; } - icsk->icsk_ulp_data = ctx; ctx->setsockopt = sk->sk_prot->setsockopt; ctx->getsockopt = sk->sk_prot->getsockopt; ctx->sk_proto_close = sk->sk_prot->close; @@ -557,6 +649,22 @@ out: return rc; } +void tls_register_device(struct tls_device *device) +{ + mutex_lock(&device_mutex); + list_add_tail(&device->dev_list, &device_list); + mutex_unlock(&device_mutex); +} +EXPORT_SYMBOL(tls_register_device); + +void tls_unregister_device(struct tls_device *device) +{ + mutex_lock(&device_mutex); + list_del(&device->dev_list); + mutex_unlock(&device_mutex); +} +EXPORT_SYMBOL(tls_unregister_device); + static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = { .name = "tls", .uid = TCP_ULP_TLS, -- cgit v1.2.3-70-g09d2 From e0be6bea2583486ec4ed98e36437d82ea8190811 Mon Sep 17 00:00:00 2001 From: Atul Gupta Date: Sat, 31 Mar 2018 21:41:53 +0530 Subject: ethtool: enable Inline TLS in HW Ethtool option enables TLS record offload on HW, user configures the feature for netdev capable of Inline TLS. This allows user to define custom sk_prot for Inline TLS sock Signed-off-by: Atul Gupta Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 2 ++ net/core/ethtool.c | 1 + 2 files changed, 3 insertions(+) (limited to 'net') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index db84c516bcfb..35b79f47a13d 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -79,6 +79,7 @@ enum { NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */ NETIF_F_GRO_HW_BIT, /* Hardware Generic receive offload */ + NETIF_F_HW_TLS_RECORD_BIT, /* Offload TLS record */ /* * Add your fresh new feature above and remember to update @@ -145,6 +146,7 @@ enum { #define NETIF_F_HW_ESP __NETIF_F(HW_ESP) #define NETIF_F_HW_ESP_TX_CSUM __NETIF_F(HW_ESP_TX_CSUM) #define NETIF_F_RX_UDP_TUNNEL_PORT __NETIF_F(RX_UDP_TUNNEL_PORT) +#define NETIF_F_HW_TLS_RECORD __NETIF_F(HW_TLS_RECORD) #define for_each_netdev_feature(mask_addr, bit) \ for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index eb55252ca1fb..03416e6dd5d7 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -108,6 +108,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", + [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", }; static const char -- cgit v1.2.3-70-g09d2 From cc35c88ae4db219611e204375d6a4248bc0e84d6 Mon Sep 17 00:00:00 2001 From: Atul Gupta Date: Sat, 31 Mar 2018 21:41:59 +0530 Subject: crypto : chtls - CPL handler definition Exchange messages with hardware to program the TLS session CPL handlers for messages received from chip. Signed-off-by: Atul Gupta Signed-off-by: Michael Werner Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chtls/chtls_cm.c | 2126 +++++++++++++++++++++++++++++++ net/ipv4/tcp_minisocks.c | 1 + 2 files changed, 2127 insertions(+) create mode 100644 drivers/crypto/chelsio/chtls/chtls_cm.c (limited to 'net') diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c new file mode 100644 index 000000000000..82a473a0cefa --- /dev/null +++ b/drivers/crypto/chelsio/chtls/chtls_cm.c @@ -0,0 +1,2126 @@ +/* + * Copyright (c) 2018 Chelsio Communications, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Written by: Atul Gupta (atul.gupta@chelsio.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "chtls.h" +#include "chtls_cm.h" + +/* + * State transitions and actions for close. Note that if we are in SYN_SENT + * we remain in that state as we cannot control a connection while it's in + * SYN_SENT; such connections are allowed to establish and are then aborted. + */ +static unsigned char new_state[16] = { + /* current state: new state: action: */ + /* (Invalid) */ TCP_CLOSE, + /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_SYN_SENT */ TCP_SYN_SENT, + /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, + /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, + /* TCP_TIME_WAIT */ TCP_CLOSE, + /* TCP_CLOSE */ TCP_CLOSE, + /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, + /* TCP_LAST_ACK */ TCP_LAST_ACK, + /* TCP_LISTEN */ TCP_CLOSE, + /* TCP_CLOSING */ TCP_CLOSING, +}; + +static struct chtls_sock *chtls_sock_create(struct chtls_dev *cdev) +{ + struct chtls_sock *csk = kzalloc(sizeof(*csk), GFP_ATOMIC); + + if (!csk) + return NULL; + + csk->txdata_skb_cache = alloc_skb(TXDATA_SKB_LEN, GFP_ATOMIC); + if (!csk->txdata_skb_cache) { + kfree(csk); + return NULL; + } + + kref_init(&csk->kref); + csk->cdev = cdev; + skb_queue_head_init(&csk->txq); + csk->wr_skb_head = NULL; + csk->wr_skb_tail = NULL; + csk->mss = MAX_MSS; + csk->tlshws.ofld = 1; + csk->tlshws.txkey = -1; + csk->tlshws.rxkey = -1; + csk->tlshws.mfs = TLS_MFS; + skb_queue_head_init(&csk->tlshws.sk_recv_queue); + return csk; +} + +static void chtls_sock_release(struct kref *ref) +{ + struct chtls_sock *csk = + container_of(ref, struct chtls_sock, kref); + + kfree(csk); +} + +static struct net_device *chtls_ipv4_netdev(struct chtls_dev *cdev, + struct sock *sk) +{ + struct net_device *ndev = cdev->ports[0]; + + if (likely(!inet_sk(sk)->inet_rcv_saddr)) + return ndev; + + ndev = ip_dev_find(&init_net, inet_sk(sk)->inet_rcv_saddr); + if (!ndev) + return NULL; + + if (is_vlan_dev(ndev)) + return vlan_dev_real_dev(ndev); + return ndev; +} + +static void assign_rxopt(struct sock *sk, unsigned int opt) +{ + const struct chtls_dev *cdev; + struct chtls_sock *csk; + struct tcp_sock *tp; + + csk = rcu_dereference_sk_user_data(sk); + tp = tcp_sk(sk); + + cdev = csk->cdev; + tp->tcp_header_len = sizeof(struct tcphdr); + tp->rx_opt.mss_clamp = cdev->mtus[TCPOPT_MSS_G(opt)] - 40; + tp->mss_cache = tp->rx_opt.mss_clamp; + tp->rx_opt.tstamp_ok = TCPOPT_TSTAMP_G(opt); + tp->rx_opt.snd_wscale = TCPOPT_SACK_G(opt); + tp->rx_opt.wscale_ok = TCPOPT_WSCALE_OK_G(opt); + SND_WSCALE(tp) = TCPOPT_SND_WSCALE_G(opt); + if (!tp->rx_opt.wscale_ok) + tp->rx_opt.rcv_wscale = 0; + if (tp->rx_opt.tstamp_ok) { + tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED; + tp->rx_opt.mss_clamp -= TCPOLEN_TSTAMP_ALIGNED; + } else if (csk->opt2 & TSTAMPS_EN_F) { + csk->opt2 &= ~TSTAMPS_EN_F; + csk->mtu_idx = TCPOPT_MSS_G(opt); + } +} + +static void chtls_purge_receive_queue(struct sock *sk) +{ + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + skb_dst_set(skb, (void *)NULL); + kfree_skb(skb); + } +} + +static void chtls_purge_write_queue(struct sock *sk) +{ + struct chtls_sock *csk = rcu_dereference_sk_user_data(sk); + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&csk->txq))) { + sk->sk_wmem_queued -= skb->truesize; + __kfree_skb(skb); + } +} + +static void chtls_purge_recv_queue(struct sock *sk) +{ + struct chtls_sock *csk = rcu_dereference_sk_user_data(sk); + struct chtls_hws *tlsk = &csk->tlshws; + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&tlsk->sk_recv_queue)) != NULL) { + skb_dst_set(skb, NULL); + kfree_skb(skb); + } +} + +static void abort_arp_failure(void *handle, struct sk_buff *skb) +{ + struct cpl_abort_req *req = cplhdr(skb); + struct chtls_dev *cdev; + + cdev = (struct chtls_dev *)handle; + req->cmd = CPL_ABORT_NO_RST; + cxgb4_ofld_send(cdev->lldi->ports[0], skb); +} + +static struct sk_buff *alloc_ctrl_skb(struct sk_buff *skb, int len) +{ + if (likely(skb && !skb_shared(skb) && !skb_cloned(skb))) { + __skb_trim(skb, 0); + refcount_add(2, &skb->users); + } else { + skb = alloc_skb(len, GFP_KERNEL | __GFP_NOFAIL); + } + return skb; +} + +static void chtls_send_abort(struct sock *sk, int mode, struct sk_buff *skb) +{ + struct cpl_abort_req *req; + struct chtls_sock *csk; + struct tcp_sock *tp; + + csk = rcu_dereference_sk_user_data(sk); + tp = tcp_sk(sk); + + if (!skb) + skb = alloc_ctrl_skb(csk->txdata_skb_cache, sizeof(*req)); + + req = (struct cpl_abort_req *)skb_put(skb, sizeof(*req)); + INIT_TP_WR_CPL(req, CPL_ABORT_REQ, csk->tid); + skb_set_queue_mapping(skb, (csk->txq_idx << 1) | CPL_PRIORITY_DATA); + req->rsvd0 = htonl(tp->snd_nxt); + req->rsvd1 = !csk_flag_nochk(csk, CSK_TX_DATA_SENT); + req->cmd = mode; + t4_set_arp_err_handler(skb, csk->cdev, abort_arp_failure); + send_or_defer(sk, tp, skb, mode == CPL_ABORT_SEND_RST); +} + +static void chtls_send_reset(struct sock *sk, int mode, struct sk_buff *skb) +{ + struct chtls_sock *csk = rcu_dereference_sk_user_data(sk); + + if (unlikely(csk_flag_nochk(csk, CSK_ABORT_SHUTDOWN) || + !csk->cdev)) { + if (sk->sk_state == TCP_SYN_RECV) + csk_set_flag(csk, CSK_RST_ABORTED); + goto out; + } + + if (!csk_flag_nochk(csk, CSK_TX_DATA_SENT)) { + struct tcp_sock *tp = tcp_sk(sk); + + if (send_tx_flowc_wr(sk, 0, tp->snd_nxt, tp->rcv_nxt) < 0) + WARN_ONCE(1, "send tx flowc error"); + csk_set_flag(csk, CSK_TX_DATA_SENT); + } + + csk_set_flag(csk, CSK_ABORT_RPL_PENDING); + chtls_purge_write_queue(sk); + + csk_set_flag(csk, CSK_ABORT_SHUTDOWN); + if (sk->sk_state != TCP_SYN_RECV) + chtls_send_abort(sk, mode, skb); + else + goto out; + + return; +out: + if (skb) + kfree_skb(skb); +} + +static void release_tcp_port(struct sock *sk) +{ + if (inet_csk(sk)->icsk_bind_hash) + inet_put_port(sk); +} + +static void tcp_uncork(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->nonagle & TCP_NAGLE_CORK) { + tp->nonagle &= ~TCP_NAGLE_CORK; + chtls_tcp_push(sk, 0); + } +} + +static void chtls_close_conn(struct sock *sk) +{ + struct cpl_close_con_req *req; + struct chtls_sock *csk; + struct sk_buff *skb; + unsigned int tid; + unsigned int len; + + len = roundup(sizeof(struct cpl_close_con_req), 16); + csk = rcu_dereference_sk_user_data(sk); + tid = csk->tid; + + skb = alloc_skb(len, GFP_KERNEL | __GFP_NOFAIL); + req = (struct cpl_close_con_req *)__skb_put(skb, len); + memset(req, 0, len); + req->wr.wr_hi = htonl(FW_WR_OP_V(FW_TP_WR) | + FW_WR_IMMDLEN_V(sizeof(*req) - + sizeof(req->wr))); + req->wr.wr_mid = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*req), 16)) | + FW_WR_FLOWID_V(tid)); + + OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid)); + + tcp_uncork(sk); + skb_entail(sk, skb, ULPCB_FLAG_NO_HDR | ULPCB_FLAG_NO_APPEND); + if (sk->sk_state != TCP_SYN_SENT) + chtls_push_frames(csk, 1); +} + +/* + * Perform a state transition during close and return the actions indicated + * for the transition. Do not make this function inline, the main reason + * it exists at all is to avoid multiple inlining of tcp_set_state. + */ +static int make_close_transition(struct sock *sk) +{ + int next = (int)new_state[sk->sk_state]; + + tcp_set_state(sk, next & TCP_STATE_MASK); + return next & TCP_ACTION_FIN; +} + +void chtls_close(struct sock *sk, long timeout) +{ + int data_lost, prev_state; + struct chtls_sock *csk; + + csk = rcu_dereference_sk_user_data(sk); + + lock_sock(sk); + sk->sk_shutdown |= SHUTDOWN_MASK; + + data_lost = skb_queue_len(&sk->sk_receive_queue); + data_lost |= skb_queue_len(&csk->tlshws.sk_recv_queue); + chtls_purge_recv_queue(sk); + chtls_purge_receive_queue(sk); + + if (sk->sk_state == TCP_CLOSE) { + goto wait; + } else if (data_lost || sk->sk_state == TCP_SYN_SENT) { + chtls_send_reset(sk, CPL_ABORT_SEND_RST, NULL); + release_tcp_port(sk); + goto unlock; + } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { + sk->sk_prot->disconnect(sk, 0); + } else if (make_close_transition(sk)) { + chtls_close_conn(sk); + } +wait: + if (timeout) + sk_stream_wait_close(sk, timeout); + +unlock: + prev_state = sk->sk_state; + sock_hold(sk); + sock_orphan(sk); + + release_sock(sk); + + local_bh_disable(); + bh_lock_sock(sk); + + if (prev_state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) + goto out; + + if (sk->sk_state == TCP_FIN_WAIT2 && tcp_sk(sk)->linger2 < 0 && + !csk_flag(sk, CSK_ABORT_SHUTDOWN)) { + struct sk_buff *skb; + + skb = alloc_skb(sizeof(struct cpl_abort_req), GFP_ATOMIC); + if (skb) + chtls_send_reset(sk, CPL_ABORT_SEND_RST, skb); + } + + if (sk->sk_state == TCP_CLOSE) + inet_csk_destroy_sock(sk); + +out: + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); +} + +/* + * Wait until a socket enters on of the given states. + */ +static int wait_for_states(struct sock *sk, unsigned int states) +{ + DECLARE_WAITQUEUE(wait, current); + struct socket_wq _sk_wq; + long current_timeo; + int err = 0; + + current_timeo = 200; + + /* + * We want this to work even when there's no associated struct socket. + * In that case we provide a temporary wait_queue_head_t. + */ + if (!sk->sk_wq) { + init_waitqueue_head(&_sk_wq.wait); + _sk_wq.fasync_list = NULL; + init_rcu_head_on_stack(&_sk_wq.rcu); + RCU_INIT_POINTER(sk->sk_wq, &_sk_wq); + } + + add_wait_queue(sk_sleep(sk), &wait); + while (!sk_in_state(sk, states)) { + if (!current_timeo) { + err = -EBUSY; + break; + } + if (signal_pending(current)) { + err = sock_intr_errno(current_timeo); + break; + } + set_current_state(TASK_UNINTERRUPTIBLE); + release_sock(sk); + if (!sk_in_state(sk, states)) + current_timeo = schedule_timeout(current_timeo); + __set_current_state(TASK_RUNNING); + lock_sock(sk); + } + remove_wait_queue(sk_sleep(sk), &wait); + + if (rcu_dereference(sk->sk_wq) == &_sk_wq) + sk->sk_wq = NULL; + return err; +} + +int chtls_disconnect(struct sock *sk, int flags) +{ + struct chtls_sock *csk; + struct tcp_sock *tp; + int err; + + tp = tcp_sk(sk); + csk = rcu_dereference_sk_user_data(sk); + chtls_purge_recv_queue(sk); + chtls_purge_receive_queue(sk); + chtls_purge_write_queue(sk); + + if (sk->sk_state != TCP_CLOSE) { + sk->sk_err = ECONNRESET; + chtls_send_reset(sk, CPL_ABORT_SEND_RST, NULL); + err = wait_for_states(sk, TCPF_CLOSE); + if (err) + return err; + } + chtls_purge_recv_queue(sk); + chtls_purge_receive_queue(sk); + tp->max_window = 0xFFFF << (tp->rx_opt.snd_wscale); + return tcp_disconnect(sk, flags); +} + +#define SHUTDOWN_ELIGIBLE_STATE (TCPF_ESTABLISHED | \ + TCPF_SYN_RECV | TCPF_CLOSE_WAIT) +void chtls_shutdown(struct sock *sk, int how) +{ + if ((how & SEND_SHUTDOWN) && + sk_in_state(sk, SHUTDOWN_ELIGIBLE_STATE) && + make_close_transition(sk)) + chtls_close_conn(sk); +} + +void chtls_destroy_sock(struct sock *sk) +{ + struct chtls_sock *csk; + + csk = rcu_dereference_sk_user_data(sk); + chtls_purge_recv_queue(sk); + csk->ulp_mode = ULP_MODE_NONE; + chtls_purge_write_queue(sk); + free_tls_keyid(sk); + kref_put(&csk->kref, chtls_sock_release); + sk->sk_prot = &tcp_prot; + sk->sk_prot->destroy(sk); +} + +static void reset_listen_child(struct sock *child) +{ + struct chtls_sock *csk = rcu_dereference_sk_user_data(child); + struct sk_buff *skb; + + skb = alloc_ctrl_skb(csk->txdata_skb_cache, + sizeof(struct cpl_abort_req)); + + chtls_send_reset(child, CPL_ABORT_SEND_RST, skb); + sock_orphan(child); + INC_ORPHAN_COUNT(child); + if (child->sk_state == TCP_CLOSE) + inet_csk_destroy_sock(child); +} + +static void chtls_disconnect_acceptq(struct sock *listen_sk) +{ + struct request_sock **pprev; + + pprev = ACCEPT_QUEUE(listen_sk); + while (*pprev) { + struct request_sock *req = *pprev; + + if (req->rsk_ops == &chtls_rsk_ops) { + struct sock *child = req->sk; + + *pprev = req->dl_next; + sk_acceptq_removed(listen_sk); + reqsk_put(req); + sock_hold(child); + local_bh_disable(); + bh_lock_sock(child); + release_tcp_port(child); + reset_listen_child(child); + bh_unlock_sock(child); + local_bh_enable(); + sock_put(child); + } else { + pprev = &req->dl_next; + } + } +} + +static int listen_hashfn(const struct sock *sk) +{ + return ((unsigned long)sk >> 10) & (LISTEN_INFO_HASH_SIZE - 1); +} + +static struct listen_info *listen_hash_add(struct chtls_dev *cdev, + struct sock *sk, + unsigned int stid) +{ + struct listen_info *p = kmalloc(sizeof(*p), GFP_KERNEL); + + if (p) { + int key = listen_hashfn(sk); + + p->sk = sk; + p->stid = stid; + spin_lock(&cdev->listen_lock); + p->next = cdev->listen_hash_tab[key]; + cdev->listen_hash_tab[key] = p; + spin_unlock(&cdev->listen_lock); + } + return p; +} + +static int listen_hash_find(struct chtls_dev *cdev, + struct sock *sk) +{ + struct listen_info *p; + int stid = -1; + int key; + + key = listen_hashfn(sk); + + spin_lock(&cdev->listen_lock); + for (p = cdev->listen_hash_tab[key]; p; p = p->next) + if (p->sk == sk) { + stid = p->stid; + break; + } + spin_unlock(&cdev->listen_lock); + return stid; +} + +static int listen_hash_del(struct chtls_dev *cdev, + struct sock *sk) +{ + struct listen_info *p, **prev; + int stid = -1; + int key; + + key = listen_hashfn(sk); + prev = &cdev->listen_hash_tab[key]; + + spin_lock(&cdev->listen_lock); + for (p = *prev; p; prev = &p->next, p = p->next) + if (p->sk == sk) { + stid = p->stid; + *prev = p->next; + kfree(p); + break; + } + spin_unlock(&cdev->listen_lock); + return stid; +} + +static void cleanup_syn_rcv_conn(struct sock *child, struct sock *parent) +{ + struct request_sock *req; + struct chtls_sock *csk; + + csk = rcu_dereference_sk_user_data(child); + req = csk->passive_reap_next; + + reqsk_queue_removed(&inet_csk(parent)->icsk_accept_queue, req); + __skb_unlink((struct sk_buff *)&csk->synq, &csk->listen_ctx->synq); + chtls_reqsk_free(req); + csk->passive_reap_next = NULL; +} + +static void chtls_reset_synq(struct listen_ctx *listen_ctx) +{ + struct sock *listen_sk = listen_ctx->lsk; + + while (!skb_queue_empty(&listen_ctx->synq)) { + struct chtls_sock *csk = + container_of((struct synq *)__skb_dequeue + (&listen_ctx->synq), struct chtls_sock, synq); + struct sock *child = csk->sk; + + cleanup_syn_rcv_conn(child, listen_sk); + sock_hold(child); + local_bh_disable(); + bh_lock_sock(child); + release_tcp_port(child); + reset_listen_child(child); + bh_unlock_sock(child); + local_bh_enable(); + sock_put(child); + } +} + +int chtls_listen_start(struct chtls_dev *cdev, struct sock *sk) +{ + struct net_device *ndev; + struct listen_ctx *ctx; + struct adapter *adap; + struct port_info *pi; + int stid; + int ret; + + if (sk->sk_family != PF_INET) + return -EAGAIN; + + rcu_read_lock(); + ndev = chtls_ipv4_netdev(cdev, sk); + rcu_read_unlock(); + if (!ndev) + return -EBADF; + + pi = netdev_priv(ndev); + adap = pi->adapter; + if (!(adap->flags & FULL_INIT_DONE)) + return -EBADF; + + if (listen_hash_find(cdev, sk) >= 0) /* already have it */ + return -EADDRINUSE; + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + __module_get(THIS_MODULE); + ctx->lsk = sk; + ctx->cdev = cdev; + ctx->state = T4_LISTEN_START_PENDING; + skb_queue_head_init(&ctx->synq); + + stid = cxgb4_alloc_stid(cdev->tids, sk->sk_family, ctx); + if (stid < 0) + goto free_ctx; + + sock_hold(sk); + if (!listen_hash_add(cdev, sk, stid)) + goto free_stid; + + ret = cxgb4_create_server(ndev, stid, + inet_sk(sk)->inet_rcv_saddr, + inet_sk(sk)->inet_sport, 0, + cdev->lldi->rxq_ids[0]); + if (ret > 0) + ret = net_xmit_errno(ret); + if (ret) + goto del_hash; + return 0; +del_hash: + listen_hash_del(cdev, sk); +free_stid: + cxgb4_free_stid(cdev->tids, stid, sk->sk_family); + sock_put(sk); +free_ctx: + kfree(ctx); + module_put(THIS_MODULE); + return -EBADF; +} + +void chtls_listen_stop(struct chtls_dev *cdev, struct sock *sk) +{ + struct listen_ctx *listen_ctx; + int stid; + + stid = listen_hash_del(cdev, sk); + if (stid < 0) + return; + + listen_ctx = (struct listen_ctx *)lookup_stid(cdev->tids, stid); + chtls_reset_synq(listen_ctx); + + cxgb4_remove_server(cdev->lldi->ports[0], stid, + cdev->lldi->rxq_ids[0], 0); + chtls_disconnect_acceptq(sk); +} + +static int chtls_pass_open_rpl(struct chtls_dev *cdev, struct sk_buff *skb) +{ + struct cpl_pass_open_rpl *rpl = cplhdr(skb) + RSS_HDR; + unsigned int stid = GET_TID(rpl); + struct listen_ctx *listen_ctx; + + listen_ctx = (struct listen_ctx *)lookup_stid(cdev->tids, stid); + if (!listen_ctx) + return CPL_RET_BUF_DONE; + + if (listen_ctx->state == T4_LISTEN_START_PENDING) { + listen_ctx->state = T4_LISTEN_STARTED; + return CPL_RET_BUF_DONE; + } + + if (rpl->status != CPL_ERR_NONE) { + pr_info("Unexpected PASS_OPEN_RPL status %u for STID %u\n", + rpl->status, stid); + return CPL_RET_BUF_DONE; + } + cxgb4_free_stid(cdev->tids, stid, listen_ctx->lsk->sk_family); + sock_put(listen_ctx->lsk); + kfree(listen_ctx); + module_put(THIS_MODULE); + + return 0; +} + +static int chtls_close_listsrv_rpl(struct chtls_dev *cdev, struct sk_buff *skb) +{ + struct cpl_close_listsvr_rpl *rpl = cplhdr(skb) + RSS_HDR; + struct listen_ctx *listen_ctx; + unsigned int stid; + void *data; + + stid = GET_TID(rpl); + data = lookup_stid(cdev->tids, stid); + listen_ctx = (struct listen_ctx *)data; + + if (rpl->status != CPL_ERR_NONE) { + pr_info("Unexpected CLOSE_LISTSRV_RPL status %u for STID %u\n", + rpl->status, stid); + return CPL_RET_BUF_DONE; + } + + cxgb4_free_stid(cdev->tids, stid, listen_ctx->lsk->sk_family); + sock_put(listen_ctx->lsk); + kfree(listen_ctx); + module_put(THIS_MODULE); + + return 0; +} + +static void chtls_release_resources(struct sock *sk) +{ + struct chtls_sock *csk = rcu_dereference_sk_user_data(sk); + struct chtls_dev *cdev = csk->cdev; + unsigned int tid = csk->tid; + struct tid_info *tids; + + if (!cdev) + return; + + tids = cdev->tids; + kfree_skb(csk->txdata_skb_cache); + csk->txdata_skb_cache = NULL; + + if (csk->l2t_entry) { + cxgb4_l2t_release(csk->l2t_entry); + csk->l2t_entry = NULL; + } + + cxgb4_remove_tid(tids, csk->port_id, tid, sk->sk_family); + sock_put(sk); +} + +static void chtls_conn_done(struct sock *sk) +{ + if (sock_flag(sk, SOCK_DEAD)) + chtls_purge_receive_queue(sk); + sk_wakeup_sleepers(sk, 0); + tcp_done(sk); +} + +static void do_abort_syn_rcv(struct sock *child, struct sock *parent) +{ + /* + * If the server is still open we clean up the child connection, + * otherwise the server already did the clean up as it was purging + * its SYN queue and the skb was just sitting in its backlog. + */ + if (likely(parent->sk_state == TCP_LISTEN)) { + cleanup_syn_rcv_conn(child, parent); + /* Without the below call to sock_orphan, + * we leak the socket resource with syn_flood test + * as inet_csk_destroy_sock will not be called + * in tcp_done since SOCK_DEAD flag is not set. + * Kernel handles this differently where new socket is + * created only after 3 way handshake is done. + */ + sock_orphan(child); + percpu_counter_inc((child)->sk_prot->orphan_count); + chtls_release_resources(child); + chtls_conn_done(child); + } else { + if (csk_flag(child, CSK_RST_ABORTED)) { + chtls_release_resources(child); + chtls_conn_done(child); + } + } +} + +static void pass_open_abort(struct sock *child, struct sock *parent, + struct sk_buff *skb) +{ + do_abort_syn_rcv(child, parent); + kfree_skb(skb); +} + +static void bl_pass_open_abort(struct sock *lsk, struct sk_buff *skb) +{ + pass_open_abort(skb->sk, lsk, skb); +} + +static void chtls_pass_open_arp_failure(struct sock *sk, + struct sk_buff *skb) +{ + const struct request_sock *oreq; + struct chtls_sock *csk; + struct chtls_dev *cdev; + struct sock *parent; + void *data; + + csk = rcu_dereference_sk_user_data(sk); + cdev = csk->cdev; + + /* + * If the connection is being aborted due to the parent listening + * socket going away there's nothing to do, the ABORT_REQ will close + * the connection. + */ + if (csk_flag(sk, CSK_ABORT_RPL_PENDING)) { + kfree_skb(skb); + return; + } + + oreq = csk->passive_reap_next; + data = lookup_stid(cdev->tids, oreq->ts_recent); + parent = ((struct listen_ctx *)data)->lsk; + + bh_lock_sock(parent); + if (!sock_owned_by_user(parent)) { + pass_open_abort(sk, parent, skb); + } else { + BLOG_SKB_CB(skb)->backlog_rcv = bl_pass_open_abort; + __sk_add_backlog(parent, skb); + } + bh_unlock_sock(parent); +} + +static void chtls_accept_rpl_arp_failure(void *handle, + struct sk_buff *skb) +{ + struct sock *sk = (struct sock *)handle; + + sock_hold(sk); + process_cpl_msg(chtls_pass_open_arp_failure, sk, skb); + sock_put(sk); +} + +static unsigned int chtls_select_mss(const struct chtls_sock *csk, + unsigned int pmtu, + struct cpl_pass_accept_req *req) +{ + struct chtls_dev *cdev; + struct dst_entry *dst; + unsigned int tcpoptsz; + unsigned int iphdrsz; + unsigned int mtu_idx; + struct tcp_sock *tp; + unsigned int mss; + struct sock *sk; + + mss = ntohs(req->tcpopt.mss); + sk = csk->sk; + dst = __sk_dst_get(sk); + cdev = csk->cdev; + tp = tcp_sk(sk); + tcpoptsz = 0; + + iphdrsz = sizeof(struct iphdr) + sizeof(struct tcphdr); + if (req->tcpopt.tstamp) + tcpoptsz += round_up(TCPOLEN_TIMESTAMP, 4); + + tp->advmss = dst_metric_advmss(dst); + if (USER_MSS(tp) && tp->advmss > USER_MSS(tp)) + tp->advmss = USER_MSS(tp); + if (tp->advmss > pmtu - iphdrsz) + tp->advmss = pmtu - iphdrsz; + if (mss && tp->advmss > mss) + tp->advmss = mss; + + tp->advmss = cxgb4_best_aligned_mtu(cdev->lldi->mtus, + iphdrsz + tcpoptsz, + tp->advmss - tcpoptsz, + 8, &mtu_idx); + tp->advmss -= iphdrsz; + + inet_csk(sk)->icsk_pmtu_cookie = pmtu; + return mtu_idx; +} + +static unsigned int select_rcv_wnd(struct chtls_sock *csk) +{ + unsigned int rcvwnd; + unsigned int wnd; + struct sock *sk; + + sk = csk->sk; + wnd = tcp_full_space(sk); + + if (wnd < MIN_RCV_WND) + wnd = MIN_RCV_WND; + + rcvwnd = MAX_RCV_WND; + + csk_set_flag(csk, CSK_UPDATE_RCV_WND); + return min(wnd, rcvwnd); +} + +static unsigned int select_rcv_wscale(int space, int wscale_ok, int win_clamp) +{ + int wscale = 0; + + if (space > MAX_RCV_WND) + space = MAX_RCV_WND; + if (win_clamp && win_clamp < space) + space = win_clamp; + + if (wscale_ok) { + while (wscale < 14 && (65535 << wscale) < space) + wscale++; + } + return wscale; +} + +static void chtls_pass_accept_rpl(struct sk_buff *skb, + struct cpl_pass_accept_req *req, + unsigned int tid) + +{ + struct cpl_t5_pass_accept_rpl *rpl5; + struct cxgb4_lld_info *lldi; + const struct tcphdr *tcph; + const struct tcp_sock *tp; + struct chtls_sock *csk; + unsigned int len; + struct sock *sk; + u32 opt2, hlen; + u64 opt0; + + sk = skb->sk; + tp = tcp_sk(sk); + csk = sk->sk_user_data; + csk->tid = tid; + lldi = csk->cdev->lldi; + len = roundup(sizeof(*rpl5), 16); + + rpl5 = __skb_put_zero(skb, len); + INIT_TP_WR(rpl5, tid); + + OPCODE_TID(rpl5) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, + csk->tid)); + csk->mtu_idx = chtls_select_mss(csk, dst_mtu(__sk_dst_get(sk)), + req); + opt0 = TCAM_BYPASS_F | + WND_SCALE_V((tp)->rx_opt.rcv_wscale) | + MSS_IDX_V(csk->mtu_idx) | + L2T_IDX_V(csk->l2t_entry->idx) | + NAGLE_V(!(tp->nonagle & TCP_NAGLE_OFF)) | + TX_CHAN_V(csk->tx_chan) | + SMAC_SEL_V(csk->smac_idx) | + DSCP_V(csk->tos >> 2) | + ULP_MODE_V(ULP_MODE_TLS) | + RCV_BUFSIZ_V(min(tp->rcv_wnd >> 10, RCV_BUFSIZ_M)); + + opt2 = RX_CHANNEL_V(0) | + RSS_QUEUE_VALID_F | RSS_QUEUE_V(csk->rss_qid); + + if (!is_t5(lldi->adapter_type)) + opt2 |= RX_FC_DISABLE_F; + if (req->tcpopt.tstamp) + opt2 |= TSTAMPS_EN_F; + if (req->tcpopt.sack) + opt2 |= SACK_EN_F; + hlen = ntohl(req->hdr_len); + + tcph = (struct tcphdr *)((u8 *)(req + 1) + + T6_ETH_HDR_LEN_G(hlen) + T6_IP_HDR_LEN_G(hlen)); + if (tcph->ece && tcph->cwr) + opt2 |= CCTRL_ECN_V(1); + opt2 |= CONG_CNTRL_V(CONG_ALG_NEWRENO); + opt2 |= T5_ISS_F; + opt2 |= T5_OPT_2_VALID_F; + rpl5->opt0 = cpu_to_be64(opt0); + rpl5->opt2 = cpu_to_be32(opt2); + rpl5->iss = cpu_to_be32((prandom_u32() & ~7UL) - 1); + set_wr_txq(skb, CPL_PRIORITY_SETUP, csk->port_id); + t4_set_arp_err_handler(skb, sk, chtls_accept_rpl_arp_failure); + cxgb4_l2t_send(csk->egress_dev, skb, csk->l2t_entry); +} + +static void inet_inherit_port(struct inet_hashinfo *hash_info, + struct sock *lsk, struct sock *newsk) +{ + local_bh_disable(); + __inet_inherit_port(lsk, newsk); + local_bh_enable(); +} + +static int chtls_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + if (skb->protocol) { + kfree_skb(skb); + return 0; + } + BLOG_SKB_CB(skb)->backlog_rcv(sk, skb); + return 0; +} + +static struct sock *chtls_recv_sock(struct sock *lsk, + struct request_sock *oreq, + void *network_hdr, + const struct cpl_pass_accept_req *req, + struct chtls_dev *cdev) +{ + const struct tcphdr *tcph; + struct inet_sock *newinet; + const struct iphdr *iph; + struct net_device *ndev; + struct chtls_sock *csk; + struct dst_entry *dst; + struct neighbour *n; + struct tcp_sock *tp; + struct sock *newsk; + u16 port_id; + int rxq_idx; + int step; + + iph = (const struct iphdr *)network_hdr; + newsk = tcp_create_openreq_child(lsk, oreq, cdev->askb); + if (!newsk) + goto free_oreq; + + dst = inet_csk_route_child_sock(lsk, newsk, oreq); + if (!dst) + goto free_sk; + + tcph = (struct tcphdr *)(iph + 1); + n = dst_neigh_lookup(dst, &iph->saddr); + if (!n) + goto free_sk; + + ndev = n->dev; + if (!ndev) + goto free_dst; + port_id = cxgb4_port_idx(ndev); + + csk = chtls_sock_create(cdev); + if (!csk) + goto free_dst; + + csk->l2t_entry = cxgb4_l2t_get(cdev->lldi->l2t, n, ndev, 0); + if (!csk->l2t_entry) + goto free_csk; + + newsk->sk_user_data = csk; + newsk->sk_backlog_rcv = chtls_backlog_rcv; + + tp = tcp_sk(newsk); + newinet = inet_sk(newsk); + + newinet->inet_daddr = iph->saddr; + newinet->inet_rcv_saddr = iph->daddr; + newinet->inet_saddr = iph->daddr; + + oreq->ts_recent = PASS_OPEN_TID_G(ntohl(req->tos_stid)); + sk_setup_caps(newsk, dst); + csk->sk = newsk; + csk->passive_reap_next = oreq; + csk->tx_chan = cxgb4_port_chan(ndev); + csk->port_id = port_id; + csk->egress_dev = ndev; + csk->tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid)); + csk->ulp_mode = ULP_MODE_TLS; + step = cdev->lldi->nrxq / cdev->lldi->nchan; + csk->rss_qid = cdev->lldi->rxq_ids[port_id * step]; + rxq_idx = port_id * step; + csk->txq_idx = (rxq_idx < cdev->lldi->ntxq) ? rxq_idx : + port_id * step; + csk->sndbuf = newsk->sk_sndbuf; + csk->smac_idx = cxgb4_tp_smt_idx(cdev->lldi->adapter_type, + cxgb4_port_viid(ndev)); + tp->rcv_wnd = select_rcv_wnd(csk); + RCV_WSCALE(tp) = select_rcv_wscale(tcp_full_space(newsk), + WSCALE_OK(tp), + tp->window_clamp); + neigh_release(n); + inet_inherit_port(&tcp_hashinfo, lsk, newsk); + csk_set_flag(csk, CSK_CONN_INLINE); + bh_unlock_sock(newsk); /* tcp_create_openreq_child ->sk_clone_lock */ + + return newsk; +free_csk: + chtls_sock_release(&csk->kref); +free_dst: + dst_release(dst); +free_sk: + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); +free_oreq: + chtls_reqsk_free(oreq); + return NULL; +} + +/* + * Populate a TID_RELEASE WR. The skb must be already propely sized. + */ +static void mk_tid_release(struct sk_buff *skb, + unsigned int chan, unsigned int tid) +{ + struct cpl_tid_release *req; + unsigned int len; + + len = roundup(sizeof(struct cpl_tid_release), 16); + req = (struct cpl_tid_release *)__skb_put(skb, len); + memset(req, 0, len); + set_wr_txq(skb, CPL_PRIORITY_SETUP, chan); + INIT_TP_WR_CPL(req, CPL_TID_RELEASE, tid); +} + +static int chtls_get_module(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + if (!try_module_get(icsk->icsk_ulp_ops->owner)) + return -1; + + return 0; +} + +static void chtls_pass_accept_request(struct sock *sk, + struct sk_buff *skb) +{ + struct cpl_t5_pass_accept_rpl *rpl; + struct cpl_pass_accept_req *req; + struct listen_ctx *listen_ctx; + struct request_sock *oreq; + struct sk_buff *reply_skb; + struct chtls_sock *csk; + struct chtls_dev *cdev; + struct tcphdr *tcph; + struct sock *newsk; + struct ethhdr *eh; + struct iphdr *iph; + void *network_hdr; + unsigned int stid; + unsigned int len; + unsigned int tid; + + req = cplhdr(skb) + RSS_HDR; + tid = GET_TID(req); + cdev = BLOG_SKB_CB(skb)->cdev; + newsk = lookup_tid(cdev->tids, tid); + stid = PASS_OPEN_TID_G(ntohl(req->tos_stid)); + if (newsk) { + pr_info("tid (%d) already in use\n", tid); + return; + } + + len = roundup(sizeof(*rpl), 16); + reply_skb = alloc_skb(len, GFP_ATOMIC); + if (!reply_skb) { + cxgb4_remove_tid(cdev->tids, 0, tid, sk->sk_family); + kfree_skb(skb); + return; + } + + if (sk->sk_state != TCP_LISTEN) + goto reject; + + if (inet_csk_reqsk_queue_is_full(sk)) + goto reject; + + if (sk_acceptq_is_full(sk)) + goto reject; + + oreq = inet_reqsk_alloc(&chtls_rsk_ops, sk, true); + if (!oreq) + goto reject; + + oreq->rsk_rcv_wnd = 0; + oreq->rsk_window_clamp = 0; + oreq->cookie_ts = 0; + oreq->mss = 0; + oreq->ts_recent = 0; + + eh = (struct ethhdr *)(req + 1); + iph = (struct iphdr *)(eh + 1); + if (iph->version != 0x4) + goto free_oreq; + + network_hdr = (void *)(eh + 1); + tcph = (struct tcphdr *)(iph + 1); + + tcp_rsk(oreq)->tfo_listener = false; + tcp_rsk(oreq)->rcv_isn = ntohl(tcph->seq); + chtls_set_req_port(oreq, tcph->source, tcph->dest); + inet_rsk(oreq)->ecn_ok = 0; + chtls_set_req_addr(oreq, iph->daddr, iph->saddr); + if (req->tcpopt.wsf <= 14) { + inet_rsk(oreq)->wscale_ok = 1; + inet_rsk(oreq)->snd_wscale = req->tcpopt.wsf; + } + inet_rsk(oreq)->ir_iif = sk->sk_bound_dev_if; + + newsk = chtls_recv_sock(sk, oreq, network_hdr, req, cdev); + if (!newsk) + goto reject; + + if (chtls_get_module(newsk)) + goto reject; + inet_csk_reqsk_queue_added(sk); + reply_skb->sk = newsk; + chtls_install_cpl_ops(newsk); + cxgb4_insert_tid(cdev->tids, newsk, tid, newsk->sk_family); + csk = rcu_dereference_sk_user_data(newsk); + listen_ctx = (struct listen_ctx *)lookup_stid(cdev->tids, stid); + csk->listen_ctx = listen_ctx; + __skb_queue_tail(&listen_ctx->synq, (struct sk_buff *)&csk->synq); + chtls_pass_accept_rpl(reply_skb, req, tid); + kfree_skb(skb); + return; + +free_oreq: + chtls_reqsk_free(oreq); +reject: + mk_tid_release(reply_skb, 0, tid); + cxgb4_ofld_send(cdev->lldi->ports[0], reply_skb); + kfree_skb(skb); +} + +/* + * Handle a CPL_PASS_ACCEPT_REQ message. + */ +static int chtls_pass_accept_req(struct chtls_dev *cdev, struct sk_buff *skb) +{ + struct cpl_pass_accept_req *req = cplhdr(skb) + RSS_HDR; + struct listen_ctx *ctx; + unsigned int stid; + unsigned int tid; + struct sock *lsk; + void *data; + + stid = PASS_OPEN_TID_G(ntohl(req->tos_stid)); + tid = GET_TID(req); + + data = lookup_stid(cdev->tids, stid); + if (!data) + return 1; + + ctx = (struct listen_ctx *)data; + lsk = ctx->lsk; + + if (unlikely(tid >= cdev->tids->ntids)) { + pr_info("passive open TID %u too large\n", tid); + return 1; + } + + BLOG_SKB_CB(skb)->cdev = cdev; + process_cpl_msg(chtls_pass_accept_request, lsk, skb); + return 0; +} + +/* + * Completes some final bits of initialization for just established connections + * and changes their state to TCP_ESTABLISHED. + * + * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1. + */ +static void make_established(struct sock *sk, u32 snd_isn, unsigned int opt) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->pushed_seq = snd_isn; + tp->write_seq = snd_isn; + tp->snd_nxt = snd_isn; + tp->snd_una = snd_isn; + inet_sk(sk)->inet_id = tp->write_seq ^ jiffies; + assign_rxopt(sk, opt); + + if (tp->rcv_wnd > (RCV_BUFSIZ_M << 10)) + tp->rcv_wup -= tp->rcv_wnd - (RCV_BUFSIZ_M << 10); + + smp_mb(); + tcp_set_state(sk, TCP_ESTABLISHED); +} + +static void chtls_abort_conn(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *abort_skb; + + abort_skb = alloc_skb(sizeof(struct cpl_abort_req), GFP_ATOMIC); + if (abort_skb) + chtls_send_reset(sk, CPL_ABORT_SEND_RST, abort_skb); +} + +static struct sock *reap_list; +static DEFINE_SPINLOCK(reap_list_lock); + +/* + * Process the reap list. + */ +DECLARE_TASK_FUNC(process_reap_list, task_param) +{ + spin_lock_bh(&reap_list_lock); + while (reap_list) { + struct sock *sk = reap_list; + struct chtls_sock *csk = rcu_dereference_sk_user_data(sk); + + reap_list = csk->passive_reap_next; + csk->passive_reap_next = NULL; + spin_unlock(&reap_list_lock); + sock_hold(sk); + + bh_lock_sock(sk); + chtls_abort_conn(sk, NULL); + sock_orphan(sk); + if (sk->sk_state == TCP_CLOSE) + inet_csk_destroy_sock(sk); + bh_unlock_sock(sk); + sock_put(sk); + spin_lock(&reap_list_lock); + } + spin_unlock_bh(&reap_list_lock); +} + +static DECLARE_WORK(reap_task, process_reap_list); + +static void add_to_reap_list(struct sock *sk) +{ + struct chtls_sock *csk = sk->sk_user_data; + + local_bh_disable(); + bh_lock_sock(sk); + release_tcp_port(sk); /* release the port immediately */ + + spin_lock(&reap_list_lock); + csk->passive_reap_next = reap_list; + reap_list = sk; + if (!csk->passive_reap_next) + schedule_work(&reap_task); + spin_unlock(&reap_list_lock); + bh_unlock_sock(sk); + local_bh_enable(); +} + +static void add_pass_open_to_parent(struct sock *child, struct sock *lsk, + struct chtls_dev *cdev) +{ + struct request_sock *oreq; + struct chtls_sock *csk; + + if (lsk->sk_state != TCP_LISTEN) + return; + + csk = child->sk_user_data; + oreq = csk->passive_reap_next; + csk->passive_reap_next = NULL; + + reqsk_queue_removed(&inet_csk(lsk)->icsk_accept_queue, oreq); + __skb_unlink((struct sk_buff *)&csk->synq, &csk->listen_ctx->synq); + + if (sk_acceptq_is_full(lsk)) { + chtls_reqsk_free(oreq); + add_to_reap_list(child); + } else { + refcount_set(&oreq->rsk_refcnt, 1); + inet_csk_reqsk_queue_add(lsk, oreq, child); + lsk->sk_data_ready(lsk); + } +} + +static void bl_add_pass_open_to_parent(struct sock *lsk, struct sk_buff *skb) +{ + struct sock *child = skb->sk; + + skb->sk = NULL; + add_pass_open_to_parent(child, lsk, BLOG_SKB_CB(skb)->cdev); + kfree_skb(skb); +} + +static int chtls_pass_establish(struct chtls_dev *cdev, struct sk_buff *skb) +{ + struct cpl_pass_establish *req = cplhdr(skb) + RSS_HDR; + struct chtls_sock *csk; + struct sock *lsk, *sk; + unsigned int hwtid; + + hwtid = GET_TID(req); + sk = lookup_tid(cdev->tids, hwtid); + if (!sk) + return (CPL_RET_UNKNOWN_TID | CPL_RET_BUF_DONE); + + bh_lock_sock(sk); + if (unlikely(sock_owned_by_user(sk))) { + kfree_skb(skb); + } else { + unsigned int stid; + void *data; + + csk = sk->sk_user_data; + csk->wr_max_credits = 64; + csk->wr_credits = 64; + csk->wr_unacked = 0; + make_established(sk, ntohl(req->snd_isn), ntohs(req->tcp_opt)); + stid = PASS_OPEN_TID_G(ntohl(req->tos_stid)); + sk->sk_state_change(sk); + if (unlikely(sk->sk_socket)) + sk_wake_async(sk, 0, POLL_OUT); + + data = lookup_stid(cdev->tids, stid); + lsk = ((struct listen_ctx *)data)->lsk; + + bh_lock_sock(lsk); + if (unlikely(skb_queue_empty(&csk->listen_ctx->synq))) { + /* removed from synq */ + bh_unlock_sock(lsk); + kfree_skb(skb); + goto unlock; + } + + if (likely(!sock_owned_by_user(lsk))) { + kfree_skb(skb); + add_pass_open_to_parent(sk, lsk, cdev); + } else { + skb->sk = sk; + BLOG_SKB_CB(skb)->cdev = cdev; + BLOG_SKB_CB(skb)->backlog_rcv = + bl_add_pass_open_to_parent; + __sk_add_backlog(lsk, skb); + } + bh_unlock_sock(lsk); + } +unlock: + bh_unlock_sock(sk); + return 0; +} + +/* + * Handle receipt of an urgent pointer. + */ +static void handle_urg_ptr(struct sock *sk, u32 urg_seq) +{ + struct tcp_sock *tp = tcp_sk(sk); + + urg_seq--; + if (tp->urg_data && !after(urg_seq, tp->urg_seq)) + return; /* duplicate pointer */ + + sk_send_sigurg(sk); + if (tp->urg_seq == tp->copied_seq && tp->urg_data && + !sock_flag(sk, SOCK_URGINLINE) && + tp->copied_seq != tp->rcv_nxt) { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + + tp->copied_seq++; + if (skb && tp->copied_seq - ULP_SKB_CB(skb)->seq >= skb->len) + chtls_free_skb(sk, skb); + } + + tp->urg_data = TCP_URG_NOTYET; + tp->urg_seq = urg_seq; +} + +static void check_sk_callbacks(struct chtls_sock *csk) +{ + struct sock *sk = csk->sk; + + if (unlikely(sk->sk_user_data && + !csk_flag_nochk(csk, CSK_CALLBACKS_CHKD))) + csk_set_flag(csk, CSK_CALLBACKS_CHKD); +} + +/* + * Handles Rx data that arrives in a state where the socket isn't accepting + * new data. + */ +static void handle_excess_rx(struct sock *sk, struct sk_buff *skb) +{ + if (!csk_flag(sk, CSK_ABORT_SHUTDOWN)) + chtls_abort_conn(sk, skb); + + kfree_skb(skb); +} + +static void chtls_recv_data(struct sock *sk, struct sk_buff *skb) +{ + struct cpl_rx_data *hdr = cplhdr(skb) + RSS_HDR; + struct chtls_sock *csk; + struct tcp_sock *tp; + + csk = rcu_dereference_sk_user_data(sk); + tp = tcp_sk(sk); + + if (unlikely(sk->sk_shutdown & RCV_SHUTDOWN)) { + handle_excess_rx(sk, skb); + return; + } + + ULP_SKB_CB(skb)->seq = ntohl(hdr->seq); + ULP_SKB_CB(skb)->psh = hdr->psh; + skb_ulp_mode(skb) = ULP_MODE_NONE; + + skb_reset_transport_header(skb); + __skb_pull(skb, sizeof(*hdr) + RSS_HDR); + if (!skb->data_len) + __skb_trim(skb, ntohs(hdr->len)); + + if (unlikely(hdr->urg)) + handle_urg_ptr(sk, tp->rcv_nxt + ntohs(hdr->urg)); + if (unlikely(tp->urg_data == TCP_URG_NOTYET && + tp->urg_seq - tp->rcv_nxt < skb->len)) + tp->urg_data = TCP_URG_VALID | + skb->data[tp->urg_seq - tp->rcv_nxt]; + + if (unlikely(hdr->dack_mode != csk->delack_mode)) { + csk->delack_mode = hdr->dack_mode; + csk->delack_seq = tp->rcv_nxt; + } + + tcp_hdr(skb)->fin = 0; + tp->rcv_nxt += skb->len; + + __skb_queue_tail(&sk->sk_receive_queue, skb); + + if (!sock_flag(sk, SOCK_DEAD)) { + check_sk_callbacks(csk); + sk->sk_data_ready(sk); + } +} + +static int chtls_rx_data(struct chtls_dev *cdev, struct sk_buff *skb) +{ + struct cpl_rx_data *req = cplhdr(skb) + RSS_HDR; + unsigned int hwtid = GET_TID(req); + struct sock *sk; + + sk = lookup_tid(cdev->tids, hwtid); + skb_dst_set(skb, NULL); + process_cpl_msg(chtls_recv_data, sk, skb); + return 0; +} + +static void chtls_recv_pdu(struct sock *sk, struct sk_buff *skb) +{ + struct cpl_tls_data *hdr = cplhdr(skb); + struct chtls_sock *csk; + struct chtls_hws *tlsk; + struct tcp_sock *tp; + + csk = rcu_dereference_sk_user_data(sk); + tlsk = &csk->tlshws; + tp = tcp_sk(sk); + + if (unlikely(sk->sk_shutdown & RCV_SHUTDOWN)) { + handle_excess_rx(sk, skb); + return; + } + + ULP_SKB_CB(skb)->seq = ntohl(hdr->seq); + ULP_SKB_CB(skb)->flags = 0; + skb_ulp_mode(skb) = ULP_MODE_TLS; + + skb_reset_transport_header(skb); + __skb_pull(skb, sizeof(*hdr)); + if (!skb->data_len) + __skb_trim(skb, + CPL_TLS_DATA_LENGTH_G(ntohl(hdr->length_pkd))); + + if (unlikely(tp->urg_data == TCP_URG_NOTYET && tp->urg_seq - + tp->rcv_nxt < skb->len)) + tp->urg_data = TCP_URG_VALID | + skb->data[tp->urg_seq - tp->rcv_nxt]; + + tcp_hdr(skb)->fin = 0; + tlsk->pldlen = CPL_TLS_DATA_LENGTH_G(ntohl(hdr->length_pkd)); + __skb_queue_tail(&tlsk->sk_recv_queue, skb); +} + +static int chtls_rx_pdu(struct chtls_dev *cdev, struct sk_buff *skb) +{ + struct cpl_tls_data *req = cplhdr(skb); + unsigned int hwtid = GET_TID(req); + struct sock *sk; + + sk = lookup_tid(cdev->tids, hwtid); + skb_dst_set(skb, NULL); + process_cpl_msg(chtls_recv_pdu, sk, skb); + return 0; +} + +static void chtls_set_hdrlen(struct sk_buff *skb, unsigned int nlen) +{ + struct tlsrx_cmp_hdr *tls_cmp_hdr = cplhdr(skb); + + skb->hdr_len = ntohs((__force __be16)tls_cmp_hdr->length); + tls_cmp_hdr->length = ntohs((__force __be16)nlen); +} + +static void chtls_rx_hdr(struct sock *sk, struct sk_buff *skb) +{ + struct cpl_rx_tls_cmp *cmp_cpl = cplhdr(skb); + struct sk_buff *skb_rec; + struct chtls_sock *csk; + struct chtls_hws *tlsk; + struct tcp_sock *tp; + + csk = rcu_dereference_sk_user_data(sk); + tlsk = &csk->tlshws; + tp = tcp_sk(sk); + + ULP_SKB_CB(skb)->seq = ntohl(cmp_cpl->seq); + ULP_SKB_CB(skb)->flags = 0; + + skb_reset_transport_header(skb); + __skb_pull(skb, sizeof(*cmp_cpl)); + if (!skb->data_len) + __skb_trim(skb, CPL_RX_TLS_CMP_LENGTH_G + (ntohl(cmp_cpl->pdulength_length))); + + tp->rcv_nxt += + CPL_RX_TLS_CMP_PDULENGTH_G(ntohl(cmp_cpl->pdulength_length)); + + skb_rec = __skb_dequeue(&tlsk->sk_recv_queue); + if (!skb_rec) { + ULP_SKB_CB(skb)->flags |= ULPCB_FLAG_TLS_ND; + __skb_queue_tail(&sk->sk_receive_queue, skb); + } else { + chtls_set_hdrlen(skb, tlsk->pldlen); + tlsk->pldlen = 0; + __skb_queue_tail(&sk->sk_receive_queue, skb); + __skb_queue_tail(&sk->sk_receive_queue, skb_rec); + } + + if (!sock_flag(sk, SOCK_DEAD)) { + check_sk_callbacks(csk); + sk->sk_data_ready(sk); + } +} + +static int chtls_rx_cmp(struct chtls_dev *cdev, struct sk_buff *skb) +{ + struct cpl_rx_tls_cmp *req = cplhdr(skb); + unsigned int hwtid = GET_TID(req); + struct sock *sk; + + sk = lookup_tid(cdev->tids, hwtid); + skb_dst_set(skb, NULL); + process_cpl_msg(chtls_rx_hdr, sk, skb); + + return 0; +} + +static void chtls_timewait(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->rcv_nxt++; + tp->rx_opt.ts_recent_stamp = get_seconds(); + tp->srtt_us = 0; + tcp_time_wait(sk, TCP_TIME_WAIT, 0); +} + +static void chtls_peer_close(struct sock *sk, struct sk_buff *skb) +{ + struct chtls_sock *csk = rcu_dereference_sk_user_data(sk); + + sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(sk, SOCK_DONE); + + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + tcp_set_state(sk, TCP_CLOSE_WAIT); + break; + case TCP_FIN_WAIT1: + tcp_set_state(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + chtls_release_resources(sk); + if (csk_flag_nochk(csk, CSK_ABORT_RPL_PENDING)) + chtls_conn_done(sk); + else + chtls_timewait(sk); + break; + default: + pr_info("cpl_peer_close in bad state %d\n", sk->sk_state); + } + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + /* Do not send POLL_HUP for half duplex close. */ + + if ((sk->sk_shutdown & SEND_SHUTDOWN) || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + else + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + } +} + +static void chtls_close_con_rpl(struct sock *sk, struct sk_buff *skb) +{ + struct cpl_close_con_rpl *rpl = cplhdr(skb) + RSS_HDR; + struct chtls_sock *csk; + struct tcp_sock *tp; + + csk = rcu_dereference_sk_user_data(sk); + tp = tcp_sk(sk); + + tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */ + + switch (sk->sk_state) { + case TCP_CLOSING: + chtls_release_resources(sk); + if (csk_flag_nochk(csk, CSK_ABORT_RPL_PENDING)) + chtls_conn_done(sk); + else + chtls_timewait(sk); + break; + case TCP_LAST_ACK: + chtls_release_resources(sk); + chtls_conn_done(sk); + break; + case TCP_FIN_WAIT1: + tcp_set_state(sk, TCP_FIN_WAIT2); + sk->sk_shutdown |= SEND_SHUTDOWN; + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + else if (tcp_sk(sk)->linger2 < 0 && + !csk_flag_nochk(csk, CSK_ABORT_SHUTDOWN)) + chtls_abort_conn(sk, skb); + break; + default: + pr_info("close_con_rpl in bad state %d\n", sk->sk_state); + } + kfree_skb(skb); +} + +static struct sk_buff *get_cpl_skb(struct sk_buff *skb, + size_t len, gfp_t gfp) +{ + if (likely(!skb_is_nonlinear(skb) && !skb_cloned(skb))) { + WARN_ONCE(skb->len < len, "skb alloc error"); + __skb_trim(skb, len); + skb_get(skb); + } else { + skb = alloc_skb(len, gfp); + if (skb) + __skb_put(skb, len); + } + return skb; +} + +static void set_abort_rpl_wr(struct sk_buff *skb, unsigned int tid, + int cmd) +{ + struct cpl_abort_rpl *rpl = cplhdr(skb); + + INIT_TP_WR_CPL(rpl, CPL_ABORT_RPL, tid); + rpl->cmd = cmd; +} + +static void send_defer_abort_rpl(struct chtls_dev *cdev, struct sk_buff *skb) +{ + struct cpl_abort_req_rss *req = cplhdr(skb); + struct sk_buff *reply_skb; + + reply_skb = alloc_skb(sizeof(struct cpl_abort_rpl), + GFP_KERNEL | __GFP_NOFAIL); + __skb_put(reply_skb, sizeof(struct cpl_abort_rpl)); + set_abort_rpl_wr(reply_skb, GET_TID(req), + (req->status & CPL_ABORT_NO_RST)); + set_wr_txq(reply_skb, CPL_PRIORITY_DATA, req->status >> 1); + cxgb4_ofld_send(cdev->lldi->ports[0], reply_skb); + kfree_skb(skb); +} + +static void send_abort_rpl(struct sock *sk, struct sk_buff *skb, + struct chtls_dev *cdev, int status, int queue) +{ + struct cpl_abort_req_rss *req = cplhdr(skb); + struct sk_buff *reply_skb; + struct chtls_sock *csk; + + csk = rcu_dereference_sk_user_data(sk); + + reply_skb = alloc_skb(sizeof(struct cpl_abort_rpl), + GFP_KERNEL); + + if (!reply_skb) { + req->status = (queue << 1); + send_defer_abort_rpl(cdev, skb); + return; + } + + set_abort_rpl_wr(reply_skb, GET_TID(req), status); + kfree_skb(skb); + + set_wr_txq(reply_skb, CPL_PRIORITY_DATA, queue); + if (csk_conn_inline(csk)) { + struct l2t_entry *e = csk->l2t_entry; + + if (e && sk->sk_state != TCP_SYN_RECV) { + cxgb4_l2t_send(csk->egress_dev, reply_skb, e); + return; + } + } + cxgb4_ofld_send(cdev->lldi->ports[0], reply_skb); +} + +/* + * Add an skb to the deferred skb queue for processing from process context. + */ +static void t4_defer_reply(struct sk_buff *skb, struct chtls_dev *cdev, + defer_handler_t handler) +{ + DEFERRED_SKB_CB(skb)->handler = handler; + spin_lock_bh(&cdev->deferq.lock); + __skb_queue_tail(&cdev->deferq, skb); + if (skb_queue_len(&cdev->deferq) == 1) + schedule_work(&cdev->deferq_task); + spin_unlock_bh(&cdev->deferq.lock); +} + +static void chtls_send_abort_rpl(struct sock *sk, struct sk_buff *skb, + struct chtls_dev *cdev, + int status, int queue) +{ + struct cpl_abort_req_rss *req = cplhdr(skb) + RSS_HDR; + struct sk_buff *reply_skb; + struct chtls_sock *csk; + unsigned int tid; + + csk = rcu_dereference_sk_user_data(sk); + tid = GET_TID(req); + + reply_skb = get_cpl_skb(skb, sizeof(struct cpl_abort_rpl), gfp_any()); + if (!reply_skb) { + req->status = (queue << 1) | status; + t4_defer_reply(skb, cdev, send_defer_abort_rpl); + return; + } + + set_abort_rpl_wr(reply_skb, tid, status); + set_wr_txq(reply_skb, CPL_PRIORITY_DATA, queue); + if (csk_conn_inline(csk)) { + struct l2t_entry *e = csk->l2t_entry; + + if (e && sk->sk_state != TCP_SYN_RECV) { + cxgb4_l2t_send(csk->egress_dev, reply_skb, e); + return; + } + } + cxgb4_ofld_send(cdev->lldi->ports[0], reply_skb); + kfree_skb(skb); +} + +/* + * This is run from a listener's backlog to abort a child connection in + * SYN_RCV state (i.e., one on the listener's SYN queue). + */ +static void bl_abort_syn_rcv(struct sock *lsk, struct sk_buff *skb) +{ + struct chtls_sock *csk; + struct sock *child; + int queue; + + child = skb->sk; + csk = rcu_dereference_sk_user_data(child); + queue = csk->txq_idx; + + skb->sk = NULL; + do_abort_syn_rcv(child, lsk); + send_abort_rpl(child, skb, BLOG_SKB_CB(skb)->cdev, + CPL_ABORT_NO_RST, queue); +} + +static int abort_syn_rcv(struct sock *sk, struct sk_buff *skb) +{ + const struct request_sock *oreq; + struct listen_ctx *listen_ctx; + struct chtls_sock *csk; + struct chtls_dev *cdev; + struct sock *psk; + void *ctx; + + csk = sk->sk_user_data; + oreq = csk->passive_reap_next; + cdev = csk->cdev; + + if (!oreq) + return -1; + + ctx = lookup_stid(cdev->tids, oreq->ts_recent); + if (!ctx) + return -1; + + listen_ctx = (struct listen_ctx *)ctx; + psk = listen_ctx->lsk; + + bh_lock_sock(psk); + if (!sock_owned_by_user(psk)) { + int queue = csk->txq_idx; + + do_abort_syn_rcv(sk, psk); + send_abort_rpl(sk, skb, cdev, CPL_ABORT_NO_RST, queue); + } else { + skb->sk = sk; + BLOG_SKB_CB(skb)->backlog_rcv = bl_abort_syn_rcv; + __sk_add_backlog(psk, skb); + } + bh_unlock_sock(psk); + return 0; +} + +static void chtls_abort_req_rss(struct sock *sk, struct sk_buff *skb) +{ + const struct cpl_abort_req_rss *req = cplhdr(skb) + RSS_HDR; + struct chtls_sock *csk = sk->sk_user_data; + int rst_status = CPL_ABORT_NO_RST; + int queue = csk->txq_idx; + + if (is_neg_adv(req->status)) { + if (sk->sk_state == TCP_SYN_RECV) + chtls_set_tcb_tflag(sk, 0, 0); + + kfree_skb(skb); + return; + } + + csk_reset_flag(csk, CSK_ABORT_REQ_RCVD); + + if (!csk_flag_nochk(csk, CSK_ABORT_SHUTDOWN) && + !csk_flag_nochk(csk, CSK_TX_DATA_SENT)) { + struct tcp_sock *tp = tcp_sk(sk); + + if (send_tx_flowc_wr(sk, 0, tp->snd_nxt, tp->rcv_nxt) < 0) + WARN_ONCE(1, "send_tx_flowc error"); + csk_set_flag(csk, CSK_TX_DATA_SENT); + } + + csk_set_flag(csk, CSK_ABORT_SHUTDOWN); + + if (!csk_flag_nochk(csk, CSK_ABORT_RPL_PENDING)) { + sk->sk_err = ETIMEDOUT; + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + if (sk->sk_state == TCP_SYN_RECV && !abort_syn_rcv(sk, skb)) + return; + + chtls_release_resources(sk); + chtls_conn_done(sk); + } + + chtls_send_abort_rpl(sk, skb, csk->cdev, rst_status, queue); +} + +static void chtls_abort_rpl_rss(struct sock *sk, struct sk_buff *skb) +{ + struct cpl_abort_rpl_rss *rpl = cplhdr(skb) + RSS_HDR; + struct chtls_sock *csk; + struct chtls_dev *cdev; + + csk = rcu_dereference_sk_user_data(sk); + cdev = csk->cdev; + + if (csk_flag_nochk(csk, CSK_ABORT_RPL_PENDING)) { + csk_reset_flag(csk, CSK_ABORT_RPL_PENDING); + if (!csk_flag_nochk(csk, CSK_ABORT_REQ_RCVD)) { + if (sk->sk_state == TCP_SYN_SENT) { + cxgb4_remove_tid(cdev->tids, + csk->port_id, + GET_TID(rpl), + sk->sk_family); + sock_put(sk); + } + chtls_release_resources(sk); + chtls_conn_done(sk); + } + } + kfree_skb(skb); +} + +static int chtls_conn_cpl(struct chtls_dev *cdev, struct sk_buff *skb) +{ + struct cpl_peer_close *req = cplhdr(skb) + RSS_HDR; + void (*fn)(struct sock *sk, struct sk_buff *skb); + unsigned int hwtid = GET_TID(req); + struct sock *sk; + u8 opcode; + + opcode = ((const struct rss_header *)cplhdr(skb))->opcode; + + sk = lookup_tid(cdev->tids, hwtid); + if (!sk) + goto rel_skb; + + switch (opcode) { + case CPL_PEER_CLOSE: + fn = chtls_peer_close; + break; + case CPL_CLOSE_CON_RPL: + fn = chtls_close_con_rpl; + break; + case CPL_ABORT_REQ_RSS: + fn = chtls_abort_req_rss; + break; + case CPL_ABORT_RPL_RSS: + fn = chtls_abort_rpl_rss; + break; + default: + goto rel_skb; + } + + process_cpl_msg(fn, sk, skb); + return 0; + +rel_skb: + kfree_skb(skb); + return 0; +} + +static struct sk_buff *dequeue_wr(struct sock *sk) +{ + struct chtls_sock *csk = rcu_dereference_sk_user_data(sk); + struct sk_buff *skb = csk->wr_skb_head; + + if (likely(skb)) { + /* Don't bother clearing the tail */ + csk->wr_skb_head = WR_SKB_CB(skb)->next_wr; + WR_SKB_CB(skb)->next_wr = NULL; + } + return skb; +} + +static void chtls_rx_ack(struct sock *sk, struct sk_buff *skb) +{ + struct cpl_fw4_ack *hdr = cplhdr(skb) + RSS_HDR; + struct chtls_sock *csk = sk->sk_user_data; + struct tcp_sock *tp = tcp_sk(sk); + u32 credits = hdr->credits; + u32 snd_una; + + snd_una = ntohl(hdr->snd_una); + csk->wr_credits += credits; + + if (csk->wr_unacked > csk->wr_max_credits - csk->wr_credits) + csk->wr_unacked = csk->wr_max_credits - csk->wr_credits; + + while (credits) { + struct sk_buff *pskb = csk->wr_skb_head; + u32 csum; + + if (unlikely(!pskb)) { + if (csk->wr_nondata) + csk->wr_nondata -= credits; + break; + } + csum = (__force u32)pskb->csum; + if (unlikely(credits < csum)) { + pskb->csum = (__force __wsum)(csum - credits); + break; + } + dequeue_wr(sk); + credits -= csum; + kfree_skb(pskb); + } + if (hdr->seq_vld & CPL_FW4_ACK_FLAGS_SEQVAL) { + if (unlikely(before(snd_una, tp->snd_una))) { + kfree_skb(skb); + return; + } + + if (tp->snd_una != snd_una) { + tp->snd_una = snd_una; + tp->rcv_tstamp = tcp_time_stamp(tp); + if (tp->snd_una == tp->snd_nxt && + !csk_flag_nochk(csk, CSK_TX_FAILOVER)) + csk_reset_flag(csk, CSK_TX_WAIT_IDLE); + } + } + + if (hdr->seq_vld & CPL_FW4_ACK_FLAGS_CH) { + unsigned int fclen16 = roundup(failover_flowc_wr_len, 16); + + csk->wr_credits -= fclen16; + csk_reset_flag(csk, CSK_TX_WAIT_IDLE); + csk_reset_flag(csk, CSK_TX_FAILOVER); + } + if (skb_queue_len(&csk->txq) && chtls_push_frames(csk, 0)) + sk->sk_write_space(sk); + + kfree_skb(skb); +} + +static int chtls_wr_ack(struct chtls_dev *cdev, struct sk_buff *skb) +{ + struct cpl_fw4_ack *rpl = cplhdr(skb) + RSS_HDR; + unsigned int hwtid = GET_TID(rpl); + struct sock *sk; + + sk = lookup_tid(cdev->tids, hwtid); + process_cpl_msg(chtls_rx_ack, sk, skb); + + return 0; +} + +chtls_handler_func chtls_handlers[NUM_CPL_CMDS] = { + [CPL_PASS_OPEN_RPL] = chtls_pass_open_rpl, + [CPL_CLOSE_LISTSRV_RPL] = chtls_close_listsrv_rpl, + [CPL_PASS_ACCEPT_REQ] = chtls_pass_accept_req, + [CPL_PASS_ESTABLISH] = chtls_pass_establish, + [CPL_RX_DATA] = chtls_rx_data, + [CPL_TLS_DATA] = chtls_rx_pdu, + [CPL_RX_TLS_CMP] = chtls_rx_cmp, + [CPL_PEER_CLOSE] = chtls_conn_cpl, + [CPL_CLOSE_CON_RPL] = chtls_conn_cpl, + [CPL_ABORT_REQ_RSS] = chtls_conn_cpl, + [CPL_ABORT_RPL_RSS] = chtls_conn_cpl, + [CPL_FW4_ACK] = chtls_wr_ack, +}; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e7e36433cdb5..57b5468b5139 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -332,6 +332,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcp_update_metrics(sk); tcp_done(sk); } +EXPORT_SYMBOL(tcp_time_wait); void tcp_twsk_destructor(struct sock *sk) { -- cgit v1.2.3-70-g09d2 From ec1258903a13d0255695a07280ae5e32303aa6c5 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Fri, 30 Mar 2018 20:34:33 +0300 Subject: ip6_gre: remove redundant 'tunnel' setting in ip6erspan_tap_init() 'tunnel' was already set at the start of ip6erspan_tap_init(). Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support") Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 22e86557aca4..f8a103bdbd60 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1762,7 +1762,6 @@ static int ip6erspan_tap_init(struct net_device *dev) dev->mtu -= 8; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - tunnel = netdev_priv(dev); ip6gre_tnl_link_config(tunnel, 1); return 0; -- cgit v1.2.3-70-g09d2 From 694aba690de062cf27b28a5e56e7a5a7185b0a1c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 13:16:25 -0700 Subject: ipv4: factorize sk_wmem_alloc updates done by __ip_append_data() While testing my inet defrag changes, I found that the senders could spend ~20% of cpu cycles in skb_set_owner_w() updating sk->sk_wmem_alloc for every fragment they cook. The solution to this problem is to use alloc_skb() instead of sock_wmalloc() and manually perform a single sk_wmem_alloc change. Similar change for IPv6 is provided in following patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 66340ab750e6..94cacae76aca 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -876,6 +876,7 @@ static int __ip_append_data(struct sock *sk, unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; + unsigned int wmem_alloc_delta = 0; u32 tskey = 0; skb = skb_peek_tail(queue); @@ -971,11 +972,10 @@ alloc_new_skb: (flags & MSG_DONTWAIT), &err); } else { skb = NULL; - if (refcount_read(&sk->sk_wmem_alloc) <= + if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) - skb = sock_wmalloc(sk, - alloclen + hh_len + 15, 1, - sk->sk_allocation); + skb = alloc_skb(alloclen + hh_len + 15, + sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; } @@ -1033,6 +1033,11 @@ alloc_new_skb: /* * Put the packet on the pending queue. */ + if (!skb->destructor) { + skb->destructor = sock_wfree; + skb->sk = sk; + wmem_alloc_delta += skb->truesize; + } __skb_queue_tail(queue, skb); continue; } @@ -1079,12 +1084,13 @@ alloc_new_skb: skb->len += copy; skb->data_len += copy; skb->truesize += copy; - refcount_add(copy, &sk->sk_wmem_alloc); + wmem_alloc_delta += copy; } offset += copy; length -= copy; } + refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return 0; error_efault: @@ -1092,6 +1098,7 @@ error_efault: error: cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); + refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return err; } -- cgit v1.2.3-70-g09d2 From 1f4c6eb24029689a40dceae561e31ff6926d7f0d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 13:16:26 -0700 Subject: ipv6: factorize sk_wmem_alloc updates done by __ip6_append_data() While testing my inet defrag changes, I found that the senders could spend ~20% of cpu cycles in skb_set_owner_w() updating sk->sk_wmem_alloc for every fragment they cook, competing with TX completion of prior skbs possibly happening on another cpus. The solution to this problem is to use alloc_skb() instead of sock_wmalloc() and manually perform a single sk_wmem_alloc change. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2c7f09c3c39e..323d7a354ffb 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1259,6 +1259,7 @@ static int __ip6_append_data(struct sock *sk, struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; + unsigned int wmem_alloc_delta = 0; skb = skb_peek_tail(queue); if (!skb) { @@ -1411,11 +1412,10 @@ alloc_new_skb: (flags & MSG_DONTWAIT), &err); } else { skb = NULL; - if (refcount_read(&sk->sk_wmem_alloc) <= + if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) - skb = sock_wmalloc(sk, - alloclen + hh_len, 1, - sk->sk_allocation); + skb = alloc_skb(alloclen + hh_len, + sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; } @@ -1474,6 +1474,11 @@ alloc_new_skb: /* * Put the packet on the pending queue */ + if (!skb->destructor) { + skb->destructor = sock_wfree; + skb->sk = sk; + wmem_alloc_delta += skb->truesize; + } __skb_queue_tail(queue, skb); continue; } @@ -1520,12 +1525,13 @@ alloc_new_skb: skb->len += copy; skb->data_len += copy; skb->truesize += copy; - refcount_add(copy, &sk->sk_wmem_alloc); + wmem_alloc_delta += copy; } offset += copy; length -= copy; } + refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return 0; error_efault: @@ -1533,6 +1539,7 @@ error_efault: error: cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); + refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return err; } -- cgit v1.2.3-70-g09d2 From 9ea471320e1302be0fac67c14a7ab7982609fea7 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 30 Mar 2018 16:05:06 -0500 Subject: Bluetooth: Mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Marcel Holtmann --- net/bluetooth/mgmt.c | 1 + net/bluetooth/rfcomm/sock.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 6e9fc86d8daf..8a80d48d89c4 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4801,6 +4801,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, case MGMT_LTK_P256_DEBUG: authenticated = 0x00; type = SMP_LTK_P256_DEBUG; + /* fall through */ default: continue; } diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 93a3b219db09..d606e9212291 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -221,6 +221,7 @@ static void __rfcomm_sock_close(struct sock *sk) case BT_CONFIG: case BT_CONNECTED: rfcomm_dlc_close(d, 0); + /* fall through */ default: sock_set_flag(sk, SOCK_ZAPPED); -- cgit v1.2.3-70-g09d2 From ec1d8ccb07deaf30fd0508af6755364ac47dc08d Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 30 Mar 2018 09:44:00 +0800 Subject: vlan: also check phy_driver ts_info for vlan's real device Just like function ethtool_get_ts_info(), we should also consider the phy_driver ts_info call back. For example, driver dp83640. Fixes: 37dd9255b2f6 ("vlan: Pass ethtool get_ts_info queries to real device.") Acked-by: Richard Cochran Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index f7e83f6d2e64..236452ebbd9e 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -665,8 +666,11 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev, { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); const struct ethtool_ops *ops = vlan->real_dev->ethtool_ops; + struct phy_device *phydev = vlan->real_dev->phydev; - if (ops->get_ts_info) { + if (phydev && phydev->drv && phydev->drv->ts_info) { + return phydev->drv->ts_info(phydev, info); + } else if (ops->get_ts_info) { return ops->get_ts_info(vlan->real_dev, info); } else { info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE | -- cgit v1.2.3-70-g09d2 From da01ec4ee5e69933641e6017d6461d0ce9dbfac6 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 30 Mar 2018 10:11:21 +0800 Subject: net: sched: do not emit messages while holding spinlock move messages emitting out of sch_tree_lock to avoid holding this lock too long. Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 1ea9846cc6ce..2a4ab7caf553 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1337,6 +1337,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_opt *hopt; u64 rate64, ceil64; + int warn = 0; /* extract all subattrs from opt attr */ if (!opt) @@ -1499,13 +1500,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, cl->quantum = min_t(u64, quantum, INT_MAX); if (!hopt->quantum && cl->quantum < 1000) { - pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n", - cl->common.classid); + warn = -1; cl->quantum = 1000; } if (!hopt->quantum && cl->quantum > 200000) { - pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n", - cl->common.classid); + warn = 1; cl->quantum = 200000; } if (hopt->quantum) @@ -1519,6 +1518,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, sch_tree_unlock(sch); + if (warn) + pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n", + cl->common.classid, (warn == -1 ? "small" : "big")); + qdisc_class_hash_grow(sch, &q->clhash); *arg = (unsigned long)cl; -- cgit v1.2.3-70-g09d2 From 6174a30df1b902e1fedbd728f5343937e83e64e6 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 1 Apr 2018 22:40:35 +0800 Subject: route: check sysctl_fib_multipath_use_neigh earlier than hash Prior to this patch, when one packet is hashed into path [1] (hash <= nh_upper_bound) and it's neigh is dead, it will try path [2]. However, if path [2]'s neigh is alive but it's hash > nh_upper_bound, it will not return this alive path. This packet will never be sent even if path [2] is alive. 3.3.3.1/24: nexthop via 1.1.1.254 dev eth1 weight 1 <--[1] (dead neigh) nexthop via 2.2.2.254 dev eth2 weight 1 <--[2] With sysctl_fib_multipath_use_neigh set is supposed to find an available path respecting to the l3/l4 hash. But if there is no available route with this hash, it should at least return an alive route even with other hash. This patch is to fix it by processing fib_multipath_use_neigh earlier than the hash check, so that it will at least return an alive route if there is when fib_multipath_use_neigh is enabled. It's also compatible with before when there are alive routes with the l3/l4 hash. Fixes: a6db4494d218 ("net: ipv4: Consider failed nexthops in multipath routes") Reported-by: Jianlin Shi Signed-off-by: Xin Long Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e7c602c600ac..c27122f01b87 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1746,18 +1746,20 @@ void fib_select_multipath(struct fib_result *res, int hash) bool first = false; for_nexthops(fi) { + if (net->ipv4.sysctl_fib_multipath_use_neigh) { + if (!fib_good_nh(nh)) + continue; + if (!first) { + res->nh_sel = nhsel; + first = true; + } + } + if (hash > atomic_read(&nh->nh_upper_bound)) continue; - if (!net->ipv4.sysctl_fib_multipath_use_neigh || - fib_good_nh(nh)) { - res->nh_sel = nhsel; - return; - } - if (!first) { - res->nh_sel = nhsel; - first = true; - } + res->nh_sel = nhsel; + return; } endfor_nexthops(fi); } #endif -- cgit v1.2.3-70-g09d2 From 6e00f7dd5e4edc2443f030b226f66fe4f1267667 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 1 Apr 2018 21:57:59 -0700 Subject: ipv6: frags: fix /proc/sys/net/ipv6/ip6frag_low_thresh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I forgot to change ip6frag_low_thresh proc_handler from proc_dointvec_minmax to proc_doulongvec_minmax Fixes: 3e67f106f619 ("inet: frags: break the 2GB limit for frags storage") Signed-off-by: Eric Dumazet Reported-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- net/ipv6/reassembly.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 7b52efc63f6a..70e4a578b2fb 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -564,7 +564,7 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = { .data = &init_net.ipv6.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, .extra2 = &init_net.ipv6.frags.high_thresh }, -- cgit v1.2.3-70-g09d2