Diffstat (limited to 'net/core')
-rw-r--r--  net/core/dev.c               26
-rw-r--r--  net/core/devlink.c           44
-rw-r--r--  net/core/dst_cache.c          4
-rw-r--r--  net/core/ethtool.c           64
-rw-r--r--  net/core/fib_notifier.c       1
-rw-r--r--  net/core/fib_rules.c        105
-rw-r--r--  net/core/filter.c           277
-rw-r--r--  net/core/flow_dissector.c    16
-rw-r--r--  net/core/net-procfs.c         2
-rw-r--r--  net/core/net_namespace.c    139
-rw-r--r--  net/core/pktgen.c            16
-rw-r--r--  net/core/rtnetlink.c         11
-rw-r--r--  net/core/skbuff.c            10
-rw-r--r--  net/core/sock.c              90
-rw-r--r--  net/core/sock_diag.c          1
-rw-r--r--  net/core/sysctl_net_core.c   13
16 files changed, 685 insertions, 134 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 12be20535714..f9c28f44286c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2378,7 +2378,7 @@ EXPORT_SYMBOL(netdev_set_num_tc);
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
- * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
+ * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
*/
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
@@ -4359,6 +4359,9 @@ int netdev_rx_handler_register(struct net_device *dev,
if (netdev_is_rx_handler_busy(dev))
return -EBUSY;
+ if (dev->priv_flags & IFF_NO_RX_HANDLER)
+ return -EINVAL;
+
/* Note: rx_handler_data must be set before rx_handler */
rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
rcu_assign_pointer(dev->rx_handler, rx_handler);
@@ -7554,6 +7557,19 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
}
}
+ /* LRO/HW-GRO features cannot be combined with RX-FCS */
+ if (features & NETIF_F_RXFCS) {
+ if (features & NETIF_F_LRO) {
+ netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
+ features &= ~NETIF_F_LRO;
+ }
+
+ if (features & NETIF_F_GRO_HW) {
+ netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
+ features &= ~NETIF_F_GRO_HW;
+ }
+ }
+
return features;
}
@@ -8010,7 +8026,8 @@ int register_netdev(struct net_device *dev)
{
int err;
- rtnl_lock();
+ if (rtnl_lock_killable())
+ return -EINTR;
err = register_netdevice(dev);
rtnl_unlock();
return err;
@@ -8153,8 +8170,9 @@ void netdev_run_todo(void)
BUG_ON(!list_empty(&dev->ptype_specific));
WARN_ON(rcu_access_pointer(dev->ip_ptr));
WARN_ON(rcu_access_pointer(dev->ip6_ptr));
+#if IS_ENABLED(CONFIG_DECNET)
WARN_ON(dev->dn_ptr);
-
+#endif
if (dev->priv_destructor)
dev->priv_destructor(dev);
if (dev->needs_free_netdev)
@@ -8852,6 +8870,7 @@ static void __net_exit netdev_exit(struct net *net)
static struct pernet_operations __net_initdata netdev_net_ops = {
.init = netdev_init,
.exit = netdev_exit,
+ .async = true,
};
static void __net_exit default_device_exit(struct net *net)
@@ -8952,6 +8971,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
static struct pernet_operations __net_initdata default_device_ops = {
.exit = default_device_exit,
.exit_batch = default_device_exit_batch,
+ .async = true,
};
/*
diff --git a/net/core/devlink.c b/net/core/devlink.c
index effd4848c2b4..9236e421bd62 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2331,6 +2331,32 @@ out:
resource->size_valid = size_valid;
}
+static int
+devlink_resource_validate_size(struct devlink_resource *resource, u64 size,
+ struct netlink_ext_ack *extack)
+{
+ u64 reminder;
+ int err = 0;
+
+ if (size > resource->size_params.size_max) {
+ NL_SET_ERR_MSG_MOD(extack, "Size larger than maximum");
+ err = -EINVAL;
+ }
+
+ if (size < resource->size_params.size_min) {
+ NL_SET_ERR_MSG_MOD(extack, "Size smaller than minimum");
+ err = -EINVAL;
+ }
+
+ div64_u64_rem(size, resource->size_params.size_granularity, &reminder);
+ if (reminder) {
+ NL_SET_ERR_MSG_MOD(extack, "Wrong granularity");
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
struct genl_info *info)
{
@@ -2349,12 +2375,8 @@ static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
if (!resource)
return -EINVAL;
- if (!resource->resource_ops->size_validate)
- return -EINVAL;
-
size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
- err = resource->resource_ops->size_validate(devlink, size,
- info->extack);
+ err = devlink_resource_validate_size(resource, size, info->extack);
if (err)
return err;
@@ -2714,22 +2736,22 @@ static const struct genl_ops devlink_nl_ops[] = {
.cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
.doit = devlink_nl_cmd_dpipe_table_get,
.policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET,
.doit = devlink_nl_cmd_dpipe_entries_get,
.policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_DPIPE_HEADERS_GET,
.doit = devlink_nl_cmd_dpipe_headers_get,
.policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
@@ -2749,8 +2771,8 @@ static const struct genl_ops devlink_nl_ops[] = {
.cmd = DEVLINK_CMD_RESOURCE_DUMP,
.doit = devlink_nl_cmd_resource_dump,
.policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
},
{
.cmd = DEVLINK_CMD_RELOAD,
@@ -3144,7 +3166,6 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
*/
int devlink_resource_register(struct devlink *devlink,
const char *resource_name,
- bool top_hierarchy,
u64 resource_size,
u64 resource_id,
u64 parent_resource_id,
@@ -3153,8 +3174,11 @@ int devlink_resource_register(struct devlink *devlink,
{
struct devlink_resource *resource;
struct list_head *resource_list;
+ bool top_hierarchy;
int err = 0;
+ top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP;
+
mutex_lock(&devlink->lock);
resource = devlink_resource_find(devlink, NULL, resource_id);
if (resource) {
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 554d36449231..64cef977484a 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -107,7 +107,7 @@ EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
#if IS_ENABLED(CONFIG_IPV6)
void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
- const struct in6_addr *addr)
+ const struct in6_addr *saddr)
{
struct dst_cache_pcpu *idst;
@@ -117,7 +117,7 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
idst = this_cpu_ptr(dst_cache->cache);
dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst,
rt6_get_cookie((struct rt6_info *)dst));
- idst->in6_saddr = *addr;
+ idst->in6_saddr = *saddr;
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 3f89c76d5c24..157cd9efa4be 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1022,6 +1022,15 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
if (copy_from_user(&info, useraddr, info_size))
return -EFAULT;
+ /* If FLOW_RSS was requested then user-space must be using the
+ * new definition, as FLOW_RSS is newer.
+ */
+ if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) {
+ info_size = sizeof(info);
+ if (copy_from_user(&info, useraddr, info_size))
+ return -EFAULT;
+ }
+
if (info.cmd == ETHTOOL_GRXCLSRLALL) {
if (info.rule_cnt > 0) {
if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
@@ -1251,9 +1260,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
user_key_size = rxfh.key_size;
/* Check that reserved fields are 0 for now */
- if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
- rxfh.rsvd8[2] || rxfh.rsvd32)
+ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32)
return -EINVAL;
+ /* Most drivers don't handle rss_context, check it's 0 as well */
+ if (rxfh.rss_context && !ops->get_rxfh_context)
+ return -EOPNOTSUPP;
rxfh.indir_size = dev_indir_size;
rxfh.key_size = dev_key_size;
@@ -1276,7 +1287,12 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
if (user_key_size)
hkey = rss_config + indir_bytes;
- ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc);
+ if (rxfh.rss_context)
+ ret = dev->ethtool_ops->get_rxfh_context(dev, indir, hkey,
+ &dev_hfunc,
+ rxfh.rss_context);
+ else
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc);
if (ret)
goto out;
@@ -1306,6 +1322,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
u8 *hkey = NULL;
u8 *rss_config;
u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
+ bool delete = false;
if (!ops->get_rxnfc || !ops->set_rxfh)
return -EOPNOTSUPP;
@@ -1319,9 +1336,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
return -EFAULT;
/* Check that reserved fields are 0 for now */
- if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
- rxfh.rsvd8[2] || rxfh.rsvd32)
+ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32)
return -EINVAL;
+ /* Most drivers don't handle rss_context, check it's 0 as well */
+ if (rxfh.rss_context && !ops->set_rxfh_context)
+ return -EOPNOTSUPP;
/* If either indir, hash key or function is valid, proceed further.
* Must request at least one change: indir size, hash key or function.
@@ -1346,7 +1365,8 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
if (ret)
goto out;
- /* rxfh.indir_size == 0 means reset the indir table to default.
+ /* rxfh.indir_size == 0 means reset the indir table to default (master
+ * context) or delete the context (other RSS contexts).
* rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged.
*/
if (rxfh.indir_size &&
@@ -1359,9 +1379,13 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
if (ret)
goto out;
} else if (rxfh.indir_size == 0) {
- indir = (u32 *)rss_config;
- for (i = 0; i < dev_indir_size; i++)
- indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ if (rxfh.rss_context == 0) {
+ indir = (u32 *)rss_config;
+ for (i = 0; i < dev_indir_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ } else {
+ delete = true;
+ }
}
if (rxfh.key_size) {
@@ -1374,15 +1398,25 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
}
}
- ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
+ if (rxfh.rss_context)
+ ret = ops->set_rxfh_context(dev, indir, hkey, rxfh.hfunc,
+ &rxfh.rss_context, delete);
+ else
+ ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
if (ret)
goto out;
- /* indicate whether rxfh was set to default */
- if (rxfh.indir_size == 0)
- dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
- else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
- dev->priv_flags |= IFF_RXFH_CONFIGURED;
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context),
+ &rxfh.rss_context, sizeof(rxfh.rss_context)))
+ ret = -EFAULT;
+
+ if (!rxfh.rss_context) {
+ /* indicate whether rxfh was set to default */
+ if (rxfh.indir_size == 0)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
+ }
out:
kfree(rss_config);
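
For orientation, a minimal driver-side sketch of the two context-aware callbacks that the hunks above start dispatching to. The callback shapes are inferred from the ethtool_get_rxfh()/ethtool_set_rxfh() call sites in this diff; everything prefixed mydrv_, the priv/ctx storage, and the ETH_RXFH_CONTEXT_ALLOC allocation sentinel are assumptions, not part of this patch.

/* Hedged sketch: signatures follow the ops->get_rxfh_context()/set_rxfh_context()
 * call sites above; mydrv_* helpers and struct mydrv_priv are hypothetical.
 * Hooked up through .get_rxfh_context/.set_rxfh_context in struct ethtool_ops.
 */
static int mydrv_get_rxfh_context(struct net_device *dev, u32 *indir, u8 *key,
				  u8 *hfunc, u32 rss_context)
{
	struct mydrv_priv *priv = netdev_priv(dev);
	struct mydrv_rss_ctx *ctx = mydrv_find_rss_ctx(priv, rss_context);

	if (!ctx)
		return -ENOENT;
	if (hfunc)
		*hfunc = ETH_RSS_HASH_TOP;
	if (indir)
		memcpy(indir, ctx->indir, sizeof(ctx->indir));
	if (key)
		memcpy(key, ctx->key, sizeof(ctx->key));
	return 0;
}

static int mydrv_set_rxfh_context(struct net_device *dev, const u32 *indir,
				  const u8 *key, const u8 hfunc,
				  u32 *rss_context, bool delete)
{
	struct mydrv_priv *priv = netdev_priv(dev);

	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
		return -EOPNOTSUPP;
	if (delete)	/* user space sent indir_size == 0 for a non-default context */
		return mydrv_free_rss_ctx(priv, *rss_context);
	if (*rss_context == ETH_RXFH_CONTEXT_ALLOC)	/* assumed allocation sentinel */
		return mydrv_alloc_rss_ctx(priv, indir, key, rss_context);
	return mydrv_update_rss_ctx(priv, *rss_context, indir, key);
}
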
diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
index 0c048bdeb016..5ace0705a3f9 100644
--- a/net/core/fib_notifier.c
+++ b/net/core/fib_notifier.c
@@ -171,6 +171,7 @@ static void __net_exit fib_notifier_net_exit(struct net *net)
static struct pernet_operations fib_notifier_net_ops = {
.init = fib_notifier_net_init,
.exit = fib_notifier_net_exit,
+ .async = true,
};
static int __init fib_notifier_init(void)
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 98e1066c3d55..f6f04fc0f629 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -33,6 +33,10 @@ bool fib_rule_matchall(const struct fib_rule *rule)
if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) ||
!uid_eq(rule->uid_range.end, fib_kuid_range_unset.end))
return false;
+ if (fib_rule_port_range_set(&rule->sport_range))
+ return false;
+ if (fib_rule_port_range_set(&rule->dport_range))
+ return false;
return true;
}
EXPORT_SYMBOL_GPL(fib_rule_matchall);
@@ -51,6 +55,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->pref = pref;
r->table = table;
r->flags = flags;
+ r->proto = RTPROT_KERNEL;
r->fr_net = ops->fro_net;
r->uid_range = fib_kuid_range_unset;
@@ -220,6 +225,26 @@ static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
}
+static int nla_get_port_range(struct nlattr *pattr,
+ struct fib_rule_port_range *port_range)
+{
+ const struct fib_rule_port_range *pr = nla_data(pattr);
+
+ if (!fib_rule_port_range_valid(pr))
+ return -EINVAL;
+
+ port_range->start = pr->start;
+ port_range->end = pr->end;
+
+ return 0;
+}
+
+static int nla_put_port_range(struct sk_buff *skb, int attrtype,
+ struct fib_rule_port_range *range)
+{
+ return nla_put(skb, attrtype, sizeof(*range), range);
+}
+
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags,
struct fib_lookup_arg *arg)
@@ -424,6 +449,17 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
!uid_eq(r->uid_range.end, rule->uid_range.end))
continue;
+ if (r->ip_proto != rule->ip_proto)
+ continue;
+
+ if (!fib_rule_port_range_compare(&r->sport_range,
+ &rule->sport_range))
+ continue;
+
+ if (!fib_rule_port_range_compare(&r->dport_range,
+ &rule->dport_range))
+ continue;
+
if (!ops->compare(r, frh, tb))
continue;
return 1;
@@ -469,6 +505,9 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY])
: fib_default_rule_pref(ops);
+ rule->proto = tb[FRA_PROTOCOL] ?
+ nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;
+
if (tb[FRA_IIFNAME]) {
struct net_device *dev;
@@ -565,6 +604,23 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
rule->uid_range = fib_kuid_range_unset;
}
+ if (tb[FRA_IP_PROTO])
+ rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
+
+ if (tb[FRA_SPORT_RANGE]) {
+ err = nla_get_port_range(tb[FRA_SPORT_RANGE],
+ &rule->sport_range);
+ if (err)
+ goto errout_free;
+ }
+
+ if (tb[FRA_DPORT_RANGE]) {
+ err = nla_get_port_range(tb[FRA_DPORT_RANGE],
+ &rule->dport_range);
+ if (err)
+ goto errout_free;
+ }
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
rule_exists(ops, frh, tb, rule)) {
err = -EEXIST;
@@ -630,6 +686,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ struct fib_rule_port_range sprange = {0, 0};
+ struct fib_rule_port_range dprange = {0, 0};
struct fib_rules_ops *ops = NULL;
struct fib_rule *rule, *r;
struct nlattr *tb[FRA_MAX+1];
@@ -663,7 +721,25 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
range = fib_kuid_range_unset;
}
+ if (tb[FRA_SPORT_RANGE]) {
+ err = nla_get_port_range(tb[FRA_SPORT_RANGE],
+ &sprange);
+ if (err)
+ goto errout;
+ }
+
+ if (tb[FRA_DPORT_RANGE]) {
+ err = nla_get_port_range(tb[FRA_DPORT_RANGE],
+ &dprange);
+ if (err)
+ goto errout;
+ }
+
list_for_each_entry(rule, &ops->rules_list, list) {
+ if (tb[FRA_PROTOCOL] &&
+ (rule->proto != nla_get_u8(tb[FRA_PROTOCOL])))
+ continue;
+
if (frh->action && (frh->action != rule->action))
continue;
@@ -704,6 +780,18 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
!uid_eq(rule->uid_range.end, range.end)))
continue;
+ if (tb[FRA_IP_PROTO] &&
+ (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO])))
+ continue;
+
+ if (fib_rule_port_range_set(&sprange) &&
+ !fib_rule_port_range_compare(&rule->sport_range, &sprange))
+ continue;
+
+ if (fib_rule_port_range_set(&dprange) &&
+ !fib_rule_port_range_compare(&rule->dport_range, &dprange))
+ continue;
+
if (!ops->compare(rule, frh, tb))
continue;
@@ -781,7 +869,11 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(4) /* FRA_FWMARK */
+ nla_total_size(4) /* FRA_FWMASK */
+ nla_total_size_64bit(8) /* FRA_TUN_ID */
- + nla_total_size(sizeof(struct fib_kuid_range));
+ + nla_total_size(sizeof(struct fib_kuid_range))
+ + nla_total_size(1) /* FRA_PROTOCOL */
+ + nla_total_size(1) /* FRA_IP_PROTO */
+ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */
+ + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -812,6 +904,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
frh->action = rule->action;
frh->flags = rule->flags;
+ if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto))
+ goto nla_put_failure;
+
if (rule->action == FR_ACT_GOTO &&
rcu_access_pointer(rule->ctarget) == NULL)
frh->flags |= FIB_RULE_UNRESOLVED;
@@ -843,7 +938,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
(rule->l3mdev &&
nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) ||
(uid_range_set(&rule->uid_range) &&
- nla_put_uid_range(skb, &rule->uid_range)))
+ nla_put_uid_range(skb, &rule->uid_range)) ||
+ (fib_rule_port_range_set(&rule->sport_range) &&
+ nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) ||
+ (fib_rule_port_range_set(&rule->dport_range) &&
+ nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) ||
+ (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
@@ -1030,6 +1130,7 @@ static void __net_exit fib_rules_net_exit(struct net *net)
static struct pernet_operations fib_rules_net_ops = {
.init = fib_rules_net_init,
.exit = fib_rules_net_exit,
+ .async = true,
};
static int __init fib_rules_init(void)
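
To make the new rule-match attributes concrete, here is a rough user-space sketch that attaches an L4 source-port range and an IP protocol to a rule message. FRA_IP_PROTO, FRA_SPORT_RANGE and the {start, end} layout of struct fib_rule_port_range mirror the kernel side above; the use of libmnl and the surrounding message construction are illustrative assumptions.

#include <netinet/in.h>
#include <linux/fib_rules.h>
#include <libmnl/libmnl.h>	/* netlink attribute helpers; any nlattr builder works */

/* Hedged sketch: extend an RTM_NEWRULE message (already carrying its
 * struct fib_rule_hdr) so the rule only matches TCP with sport 1000-2000.
 */
static void rule_add_port_match(struct nlmsghdr *nlh)
{
	struct fib_rule_port_range sport = {
		.start	= 1000,
		.end	= 2000,
	};

	mnl_attr_put_u8(nlh, FRA_IP_PROTO, IPPROTO_TCP);
	mnl_attr_put(nlh, FRA_SPORT_RANGE, sizeof(sport), &sport);
}
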
diff --git a/net/core/filter.c b/net/core/filter.c
index 48aa7c7320db..00c711c5f1a2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1890,6 +1890,202 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
.arg4_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
+ struct bpf_map *, map, u32, key, u64, flags)
+{
+ /* If user passes invalid input drop the packet. */
+ if (unlikely(flags))
+ return SK_DROP;
+
+ msg->key = key;
+ msg->flags = flags;
+ msg->map = map;
+
+ return SK_PASS;
+}
+
+struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
+{
+ struct sock *sk = NULL;
+
+ if (msg->map) {
+ sk = __sock_map_lookup_elem(msg->map, msg->key);
+
+ msg->key = 0;
+ msg->map = NULL;
+ }
+
+ return sk;
+}
+
+static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
+ .func = bpf_msg_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
+{
+ msg->apply_bytes = bytes;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
+ .func = bpf_msg_apply_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes)
+{
+ msg->cork_bytes = bytes;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
+ .func = bpf_msg_cork_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_pull_data,
+ struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+{
+ unsigned int len = 0, offset = 0, copy = 0;
+ struct scatterlist *sg = msg->sg_data;
+ int first_sg, last_sg, i, shift;
+ unsigned char *p, *to, *from;
+ int bytes = end - start;
+ struct page *page;
+
+ if (unlikely(flags || end <= start))
+ return -EINVAL;
+
+ /* First find the starting scatterlist element */
+ i = msg->sg_start;
+ do {
+ len = sg[i].length;
+ offset += len;
+ if (start < offset + len)
+ break;
+ i++;
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ } while (i != msg->sg_end);
+
+ if (unlikely(start >= offset + len))
+ return -EINVAL;
+
+ if (!msg->sg_copy[i] && bytes <= len)
+ goto out;
+
+ first_sg = i;
+
+ /* At this point we need to linearize multiple scatterlist
+ * elements or a single shared page. Either way we need to
+ * copy into a linear buffer exclusively owned by BPF. Then
+ * place the buffer in the scatterlist and fixup the original
+ * entries by removing the entries now in the linear buffer
+ * and shifting the remaining entries. For now we do not try
+ * to copy partial entries to avoid complexity of running out
+ * of sg_entry slots. The downside is reading a single byte
+ * will copy the entire sg entry.
+ */
+ do {
+ copy += sg[i].length;
+ i++;
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ if (bytes < copy)
+ break;
+ } while (i != msg->sg_end);
+ last_sg = i;
+
+ if (unlikely(copy < end - start))
+ return -EINVAL;
+
+ page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy));
+ if (unlikely(!page))
+ return -ENOMEM;
+ p = page_address(page);
+ offset = 0;
+
+ i = first_sg;
+ do {
+ from = sg_virt(&sg[i]);
+ len = sg[i].length;
+ to = p + offset;
+
+ memcpy(to, from, len);
+ offset += len;
+ sg[i].length = 0;
+ put_page(sg_page(&sg[i]));
+
+ i++;
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ } while (i != last_sg);
+
+ sg[first_sg].length = copy;
+ sg_set_page(&sg[first_sg], page, copy, 0);
+
+ /* To repair sg ring we need to shift entries. If we only
+ * had a single entry though we can just replace it and
+ * be done. Otherwise walk the ring and shift the entries.
+ */
+ shift = last_sg - first_sg - 1;
+ if (!shift)
+ goto out;
+
+ i = first_sg + 1;
+ do {
+ int move_from;
+
+ if (i + shift >= MAX_SKB_FRAGS)
+ move_from = i + shift - MAX_SKB_FRAGS;
+ else
+ move_from = i + shift;
+
+ if (move_from == msg->sg_end)
+ break;
+
+ sg[i] = sg[move_from];
+ sg[move_from].length = 0;
+ sg[move_from].page_link = 0;
+ sg[move_from].offset = 0;
+
+ i++;
+ if (i == MAX_SKB_FRAGS)
+ i = 0;
+ } while (1);
+ msg->sg_end -= shift;
+ if (msg->sg_end < 0)
+ msg->sg_end += MAX_SKB_FRAGS;
+out:
+ msg->data = sg_virt(&sg[i]) + start - offset;
+ msg->data_end = msg->data + bytes;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_pull_data_proto = {
+ .func = bpf_msg_pull_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
return task_get_classid(skb);
@@ -2855,7 +3051,8 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_l3_csum_replace ||
func == bpf_l4_csum_replace ||
func == bpf_xdp_adjust_head ||
- func == bpf_xdp_adjust_meta)
+ func == bpf_xdp_adjust_meta ||
+ func == bpf_msg_pull_data)
return true;
return false;
@@ -3015,7 +3212,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
struct ip_tunnel_info *info;
if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
- BPF_F_DONT_FRAGMENT)))
+ BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER)))
return -EINVAL;
if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
switch (size) {
@@ -3049,6 +3246,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
if (flags & BPF_F_ZERO_CSUM_TX)
info->key.tun_flags &= ~TUNNEL_CSUM;
+ if (flags & BPF_F_SEQ_NUMBER)
+ info->key.tun_flags |= TUNNEL_SEQ;
info->key.tun_id = cpu_to_be64(from->tunnel_id);
info->key.tos = from->tunnel_tos;
@@ -3613,6 +3812,22 @@ static const struct bpf_func_proto *
}
}
+static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_msg_redirect_map:
+ return &bpf_msg_redirect_map_proto;
+ case BPF_FUNC_msg_apply_bytes:
+ return &bpf_msg_apply_bytes_proto;
+ case BPF_FUNC_msg_cork_bytes:
+ return &bpf_msg_cork_bytes_proto;
+ case BPF_FUNC_msg_pull_data:
+ return &bpf_msg_pull_data_proto;
+ default:
+ return bpf_base_func_proto(func_id);
+ }
+}
+
static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -4002,6 +4217,32 @@ static bool sk_skb_is_valid_access(int off, int size,
return bpf_skb_is_valid_access(off, size, type, info);
}
+static bool sk_msg_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (type == BPF_WRITE)
+ return false;
+
+ switch (off) {
+ case offsetof(struct sk_msg_md, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct sk_msg_md, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ if (off < 0 || off >= sizeof(struct sk_msg_md))
+ return false;
+ if (off % size != 0)
+ return false;
+ if (size != sizeof(__u64))
+ return false;
+
+ return true;
+}
+
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -4800,6 +5041,29 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
return insn - insn_buf;
}
+static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct sk_msg_md, data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, data));
+ break;
+ case offsetof(struct sk_msg_md, data_end):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_buff, data_end));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
const struct bpf_verifier_ops sk_filter_verifier_ops = {
.get_func_proto = sk_filter_func_proto,
.is_valid_access = sk_filter_is_valid_access,
@@ -4890,6 +5154,15 @@ const struct bpf_verifier_ops sk_skb_verifier_ops = {
const struct bpf_prog_ops sk_skb_prog_ops = {
};
+const struct bpf_verifier_ops sk_msg_verifier_ops = {
+ .get_func_proto = sk_msg_func_proto,
+ .is_valid_access = sk_msg_is_valid_access,
+ .convert_ctx_access = sk_msg_convert_ctx_access,
+};
+
+const struct bpf_prog_ops sk_msg_prog_ops = {
+};
+
int sk_detach_filter(struct sock *sk)
{
int ret = -ENOENT;
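
For context, a rough BPF-side sketch of how the new sk_msg helpers combine. The helper names, ctx fields and SK_PASS/SK_DROP verdicts come from the hunks above, whereas the SEC() conventions, the sockmap layout and the toy policy (cork a 4-byte header, apply the verdict to it, redirect on its first byte) are assumptions for illustration only.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>	/* SEC()/map macros; libbpf conventions assumed */

struct {
	__uint(type, BPF_MAP_TYPE_SOCKMAP);
	__uint(max_entries, 16);
	__type(key, int);
	__type(value, int);
} sock_map SEC(".maps");

SEC("sk_msg")
int msg_policy(struct sk_msg_md *msg)
{
	void *data = (void *)(long)msg->data;
	void *data_end = (void *)(long)msg->data_end;

	/* Cork until at least a 4-byte application header has been queued. */
	if (data + 4 > data_end) {
		bpf_msg_cork_bytes(msg, 4);
		return SK_PASS;
	}

	/* The verdict below covers exactly these 4 bytes. */
	bpf_msg_apply_bytes(msg, 4);

	/* Redirect to the socket keyed by the header's first byte. */
	return bpf_msg_redirect_map(msg, &sock_map, *(__u8 *)data, 0);
}

char _license[] SEC("license") = "GPL";
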
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 559db9ea8d86..d29f09bc5ff9 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1341,22 +1341,6 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
}
EXPORT_SYMBOL(__get_hash_from_flowi6);
-__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys)
-{
- memset(keys, 0, sizeof(*keys));
-
- keys->addrs.v4addrs.src = fl4->saddr;
- keys->addrs.v4addrs.dst = fl4->daddr;
- keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
- keys->ports.src = fl4->fl4_sport;
- keys->ports.dst = fl4->fl4_dport;
- keys->keyid.keyid = fl4->fl4_gre_key;
- keys->basic.ip_proto = fl4->flowi4_proto;
-
- return flow_hash_from_keys(keys);
-}
-EXPORT_SYMBOL(__get_hash_from_flowi4);
-
static const struct flow_dissector_key flow_keys_dissector_keys[] = {
{
.key_id = FLOW_DISSECTOR_KEY_CONTROL,
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index e010bb800d7b..65b51e778782 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -349,6 +349,7 @@ static void __net_exit dev_proc_net_exit(struct net *net)
static struct pernet_operations __net_initdata dev_proc_ops = {
.init = dev_proc_net_init,
.exit = dev_proc_net_exit,
+ .async = true,
};
static int dev_mc_seq_show(struct seq_file *seq, void *v)
@@ -405,6 +406,7 @@ static void __net_exit dev_mc_net_exit(struct net *net)
static struct pernet_operations __net_initdata dev_mc_net_ops = {
.init = dev_mc_net_init,
.exit = dev_mc_net_exit,
+ .async = true,
};
int __init dev_proc_init(void)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 3cad5f51afd3..95ba2c53bd9a 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -29,7 +29,6 @@
static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;
-DEFINE_MUTEX(net_mutex);
LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);
@@ -41,6 +40,12 @@ struct net init_net = {
EXPORT_SYMBOL(init_net);
static bool init_net_initialized;
+static unsigned nr_sync_pernet_ops;
+/*
+ * net_sem: protects: pernet_list, net_generic_ids, nr_sync_pernet_ops,
+ * init_net_initialized and first_device pointer.
+ */
+DECLARE_RWSEM(net_sem);
#define MIN_PERNET_OPS_ID \
((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
@@ -65,11 +70,10 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
{
struct net_generic *ng, *old_ng;
- BUG_ON(!mutex_is_locked(&net_mutex));
BUG_ON(id < MIN_PERNET_OPS_ID);
old_ng = rcu_dereference_protected(net->gen,
- lockdep_is_held(&net_mutex));
+ lockdep_is_held(&net_sem));
if (old_ng->s.len > id) {
old_ng->ptr[id] = data;
return 0;
@@ -286,7 +290,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
*/
static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
{
- /* Must be called with net_mutex held */
+ /* Must be called with net_sem held */
const struct pernet_operations *ops, *saved_ops;
int error = 0;
LIST_HEAD(net_exit_list);
@@ -297,12 +301,16 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
net->user_ns = user_ns;
idr_init(&net->netns_ids);
spin_lock_init(&net->nsid_lock);
+ mutex_init(&net->ipv4.ra_mutex);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
if (error < 0)
goto out_undo;
}
+ rtnl_lock();
+ list_add_tail_rcu(&net->list, &net_namespace_list);
+ rtnl_unlock();
out:
return error;
@@ -331,6 +339,7 @@ static int __net_init net_defaults_init_net(struct net *net)
static struct pernet_operations net_defaults_ops = {
.init = net_defaults_init_net,
+ .async = true,
};
static __init int net_defaults_init(void)
@@ -354,7 +363,7 @@ static void dec_net_namespaces(struct ucounts *ucounts)
dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
}
-static struct kmem_cache *net_cachep;
+static struct kmem_cache *net_cachep __ro_after_init;
static struct workqueue_struct *netns_wq;
static struct net *net_alloc(void)
@@ -397,6 +406,7 @@ struct net *copy_net_ns(unsigned long flags,
{
struct ucounts *ucounts;
struct net *net;
+ unsigned write;
int rv;
if (!(flags & CLONE_NEWNET))
@@ -408,32 +418,38 @@ struct net *copy_net_ns(unsigned long flags,
net = net_alloc();
if (!net) {
- dec_net_namespaces(ucounts);
- return ERR_PTR(-ENOMEM);
+ rv = -ENOMEM;
+ goto dec_ucounts;
}
-
+ refcount_set(&net->passive, 1);
+ net->ucounts = ucounts;
get_user_ns(user_ns);
+again:
+ write = READ_ONCE(nr_sync_pernet_ops);
+ if (write)
+ rv = down_write_killable(&net_sem);
+ else
+ rv = down_read_killable(&net_sem);
+ if (rv < 0)
+ goto put_userns;
- rv = mutex_lock_killable(&net_mutex);
- if (rv < 0) {
- net_free(net);
- dec_net_namespaces(ucounts);
- put_user_ns(user_ns);
- return ERR_PTR(rv);
+ if (!write && unlikely(READ_ONCE(nr_sync_pernet_ops))) {
+ up_read(&net_sem);
+ goto again;
}
-
- net->ucounts = ucounts;
rv = setup_net(net, user_ns);
- if (rv == 0) {
- rtnl_lock();
- list_add_tail_rcu(&net->list, &net_namespace_list);
- rtnl_unlock();
- }
- mutex_unlock(&net_mutex);
+
+ if (write)
+ up_write(&net_sem);
+ else
+ up_read(&net_sem);
+
if (rv < 0) {
- dec_net_namespaces(ucounts);
+put_userns:
put_user_ns(user_ns);
net_drop_ns(net);
+dec_ucounts:
+ dec_net_namespaces(ucounts);
return ERR_PTR(rv);
}
return net;
@@ -466,26 +482,33 @@ static void unhash_nsid(struct net *net, struct net *last)
spin_unlock_bh(&net->nsid_lock);
}
-static DEFINE_SPINLOCK(cleanup_list_lock);
-static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */
+static LLIST_HEAD(cleanup_list);
static void cleanup_net(struct work_struct *work)
{
const struct pernet_operations *ops;
struct net *net, *tmp, *last;
- struct list_head net_kill_list;
+ struct llist_node *net_kill_list;
LIST_HEAD(net_exit_list);
+ unsigned write;
/* Atomically snapshot the list of namespaces to cleanup */
- spin_lock_irq(&cleanup_list_lock);
- list_replace_init(&cleanup_list, &net_kill_list);
- spin_unlock_irq(&cleanup_list_lock);
+ net_kill_list = llist_del_all(&cleanup_list);
+again:
+ write = READ_ONCE(nr_sync_pernet_ops);
+ if (write)
+ down_write(&net_sem);
+ else
+ down_read(&net_sem);
- mutex_lock(&net_mutex);
+ if (!write && unlikely(READ_ONCE(nr_sync_pernet_ops))) {
+ up_read(&net_sem);
+ goto again;
+ }
/* Don't let anyone else find us. */
rtnl_lock();
- list_for_each_entry(net, &net_kill_list, cleanup_list)
+ llist_for_each_entry(net, net_kill_list, cleanup_list)
list_del_rcu(&net->list);
/* Cache last net. After we unlock rtnl, no one new net
* added to net_namespace_list can assign nsid pointer
@@ -500,7 +523,7 @@ static void cleanup_net(struct work_struct *work)
last = list_last_entry(&net_namespace_list, struct net, list);
rtnl_unlock();
- list_for_each_entry(net, &net_kill_list, cleanup_list) {
+ llist_for_each_entry(net, net_kill_list, cleanup_list) {
unhash_nsid(net, last);
list_add_tail(&net->exit_list, &net_exit_list);
}
@@ -520,7 +543,10 @@ static void cleanup_net(struct work_struct *work)
list_for_each_entry_reverse(ops, &pernet_list, list)
ops_free_list(ops, &net_exit_list);
- mutex_unlock(&net_mutex);
+ if (write)
+ up_write(&net_sem);
+ else
+ up_read(&net_sem);
/* Ensure there are no outstanding rcu callbacks using this
* network namespace.
@@ -547,8 +573,8 @@ static void cleanup_net(struct work_struct *work)
*/
void net_ns_barrier(void)
{
- mutex_lock(&net_mutex);
- mutex_unlock(&net_mutex);
+ down_write(&net_sem);
+ up_write(&net_sem);
}
EXPORT_SYMBOL(net_ns_barrier);
@@ -557,13 +583,8 @@ static DECLARE_WORK(net_cleanup_work, cleanup_net);
void __put_net(struct net *net)
{
/* Cleanup the network namespace in process context */
- unsigned long flags;
-
- spin_lock_irqsave(&cleanup_list_lock, flags);
- list_add(&net->cleanup_list, &cleanup_list);
- spin_unlock_irqrestore(&cleanup_list_lock, flags);
-
- queue_work(netns_wq, &net_cleanup_work);
+ if (llist_add(&net->cleanup_list, &cleanup_list))
+ queue_work(netns_wq, &net_cleanup_work);
}
EXPORT_SYMBOL_GPL(__put_net);
@@ -633,6 +654,7 @@ static __net_exit void net_ns_net_exit(struct net *net)
static struct pernet_operations __net_initdata net_ns_ops = {
.init = net_ns_net_init,
.exit = net_ns_net_exit,
+ .async = true,
};
static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
@@ -861,7 +883,7 @@ static int __init net_ns_init(void)
#ifdef CONFIG_NET_NS
net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
SMP_CACHE_BYTES,
- SLAB_PANIC, NULL);
+ SLAB_PANIC|SLAB_ACCOUNT, NULL);
/* Create workqueue for cleanup */
netns_wq = create_singlethread_workqueue("netns");
@@ -875,17 +897,12 @@ static int __init net_ns_init(void)
rcu_assign_pointer(init_net.gen, ng);
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
if (setup_net(&init_net, &init_user_ns))
panic("Could not setup the initial network namespace");
init_net_initialized = true;
-
- rtnl_lock();
- list_add_tail_rcu(&init_net.list, &net_namespace_list);
- rtnl_unlock();
-
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
register_pernet_subsys(&net_ns_ops);
@@ -989,6 +1006,9 @@ again:
rcu_barrier();
if (ops->id)
ida_remove(&net_generic_ids, *ops->id);
+ } else if (!ops->async) {
+ pr_info_once("Pernet operations %ps are sync.\n", ops);
+ nr_sync_pernet_ops++;
}
return error;
@@ -996,7 +1016,8 @@ again:
static void unregister_pernet_operations(struct pernet_operations *ops)
{
-
+ if (!ops->async)
+ BUG_ON(nr_sync_pernet_ops-- == 0);
__unregister_pernet_operations(ops);
rcu_barrier();
if (ops->id)
@@ -1025,9 +1046,9 @@ static void unregister_pernet_operations(struct pernet_operations *ops)
int register_pernet_subsys(struct pernet_operations *ops)
{
int error;
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
error = register_pernet_operations(first_device, ops);
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
return error;
}
EXPORT_SYMBOL_GPL(register_pernet_subsys);
@@ -1043,9 +1064,9 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys);
*/
void unregister_pernet_subsys(struct pernet_operations *ops)
{
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
unregister_pernet_operations(ops);
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
}
EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
@@ -1071,11 +1092,11 @@ EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
int register_pernet_device(struct pernet_operations *ops)
{
int error;
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
error = register_pernet_operations(&pernet_list, ops);
if (!error && (first_device == &pernet_list))
first_device = &ops->list;
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
return error;
}
EXPORT_SYMBOL_GPL(register_pernet_device);
@@ -1091,11 +1112,11 @@ EXPORT_SYMBOL_GPL(register_pernet_device);
*/
void unregister_pernet_device(struct pernet_operations *ops)
{
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
if (&ops->list == first_device)
first_device = first_device->next;
unregister_pernet_operations(ops);
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
}
EXPORT_SYMBOL_GPL(unregister_pernet_device);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index b8ab5c829511..545cf08cd558 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -906,13 +906,14 @@ static ssize_t pktgen_if_write(struct file *file,
i += len;
if (debug) {
- size_t copy = min_t(size_t, count, 1023);
- char tb[copy + 1];
- if (copy_from_user(tb, user_buffer, copy))
- return -EFAULT;
- tb[copy] = 0;
- pr_debug("%s,%lu buffer -:%s:-\n",
- name, (unsigned long)count, tb);
+ size_t copy = min_t(size_t, count + 1, 1024);
+ char *tp = strndup_user(user_buffer, copy);
+
+ if (IS_ERR(tp))
+ return PTR_ERR(tp);
+
+ pr_debug("%s,%zu buffer -:%s:-\n", name, count, tp);
+ kfree(tp);
}
if (!strcmp(name, "min_pkt_size")) {
@@ -3851,6 +3852,7 @@ static struct pernet_operations pg_net_ops = {
.exit = pg_net_exit,
.id = &pg_net_id,
.size = sizeof(struct pktgen_net),
+ .async = true,
};
static int __init pg_init(void)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bc290413a49d..87079eaa871b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -75,6 +75,12 @@ void rtnl_lock(void)
}
EXPORT_SYMBOL(rtnl_lock);
+int rtnl_lock_killable(void)
+{
+ return mutex_lock_killable(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_lock_killable);
+
static struct sk_buff *defer_kfree_skb_list;
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
{
@@ -454,11 +460,11 @@ static void rtnl_lock_unregistering_all(void)
void rtnl_link_unregister(struct rtnl_link_ops *ops)
{
/* Close the race with cleanup_net() */
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
rtnl_lock_unregistering_all();
__rtnl_link_unregister(ops);
rtnl_unlock();
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
}
EXPORT_SYMBOL_GPL(rtnl_link_unregister);
@@ -4724,6 +4730,7 @@ static void __net_exit rtnetlink_net_exit(struct net *net)
static struct pernet_operations rtnetlink_net_ops = {
.init = rtnetlink_net_init,
.exit = rtnetlink_net_exit,
+ .async = true,
};
void __init rtnetlink_init(void)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1e7acdc30732..46cb22215ff4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -77,8 +77,8 @@
#include <linux/capability.h>
#include <linux/user_namespace.h>
-struct kmem_cache *skbuff_head_cache __read_mostly;
-static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+struct kmem_cache *skbuff_head_cache __ro_after_init;
+static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);
@@ -890,7 +890,7 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
}
EXPORT_SYMBOL_GPL(skb_morph);
-static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
+int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
unsigned long max_pg, num_pg, new_pg, old_pg;
struct user_struct *user;
@@ -919,14 +919,16 @@ static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
return 0;
}
+EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
-static void mm_unaccount_pinned_pages(struct mmpin *mmp)
+void mm_unaccount_pinned_pages(struct mmpin *mmp)
{
if (mmp->user) {
atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
free_uid(mmp->user);
}
}
+EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
{
diff --git a/net/core/sock.c b/net/core/sock.c
index 85b0b64e7f9d..e689496dfd8a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1049,16 +1049,18 @@ set_rcvbuf:
break;
case SO_ZEROCOPY:
- if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
+ if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
+ if (sk->sk_protocol != IPPROTO_TCP)
+ ret = -ENOTSUPP;
+ } else if (sk->sk_family != PF_RDS) {
ret = -ENOTSUPP;
- else if (sk->sk_protocol != IPPROTO_TCP)
- ret = -ENOTSUPP;
- else if (sk->sk_state != TCP_CLOSE)
- ret = -EBUSY;
- else if (val < 0 || val > 1)
- ret = -EINVAL;
- else
- sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
+ }
+ if (!ret) {
+ if (val < 0 || val > 1)
+ ret = -EINVAL;
+ else
+ sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
+ }
break;
default:
@@ -1274,7 +1276,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
{
char address[128];
- if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
+ lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
+ if (lv < 0)
return -ENOTCONN;
if (lv < len)
return -EINVAL;
@@ -1773,7 +1776,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
u32 max_segs = 1;
sk_dst_set(sk, dst);
- sk->sk_route_caps = dst->dev->features;
+ sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
sk->sk_route_caps &= ~sk->sk_route_nocaps;
@@ -2234,6 +2237,67 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
}
EXPORT_SYMBOL(sk_page_frag_refill);
+int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
+ int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
+ int first_coalesce)
+{
+ int sg_curr = *sg_curr_index, use = 0, rc = 0;
+ unsigned int size = *sg_curr_size;
+ struct page_frag *pfrag;
+ struct scatterlist *sge;
+
+ len -= size;
+ pfrag = sk_page_frag(sk);
+
+ while (len > 0) {
+ unsigned int orig_offset;
+
+ if (!sk_page_frag_refill(sk, pfrag)) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ use = min_t(int, len, pfrag->size - pfrag->offset);
+
+ if (!sk_wmem_schedule(sk, use)) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ sk_mem_charge(sk, use);
+ size += use;
+ orig_offset = pfrag->offset;
+ pfrag->offset += use;
+
+ sge = sg + sg_curr - 1;
+ if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page &&
+ sg->offset + sg->length == orig_offset) {
+ sg->length += use;
+ } else {
+ sge = sg + sg_curr;
+ sg_unmark_end(sge);
+ sg_set_page(sge, pfrag->page, use, orig_offset);
+ get_page(pfrag->page);
+ sg_curr++;
+
+ if (sg_curr == MAX_SKB_FRAGS)
+ sg_curr = 0;
+
+ if (sg_curr == sg_start) {
+ rc = -ENOSPC;
+ break;
+ }
+ }
+
+ len -= use;
+ }
+out:
+ *sg_curr_size = size;
+ *sg_curr_index = sg_curr;
+ return rc;
+}
+EXPORT_SYMBOL(sk_alloc_sg);
+
static void __lock_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
@@ -2497,7 +2561,7 @@ int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
EXPORT_SYMBOL(sock_no_accept);
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
- int *len, int peer)
+ int peer)
{
return -EOPNOTSUPP;
}
@@ -3111,6 +3175,7 @@ static void __net_exit sock_inuse_exit_net(struct net *net)
static struct pernet_operations net_inuse_ops = {
.init = sock_inuse_init_net,
.exit = sock_inuse_exit_net,
+ .async = true,
};
static __init int net_inuse_init(void)
@@ -3405,6 +3470,7 @@ static __net_exit void proto_exit_net(struct net *net)
static __net_initdata struct pernet_operations proto_net_ops = {
.init = proto_init_net,
.exit = proto_exit_net,
+ .async = true,
};
static int __init proto_init(void)
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index c37b5be7c5e4..a3392a8f9276 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -324,6 +324,7 @@ static void __net_exit diag_net_exit(struct net *net)
static struct pernet_operations diag_net_ops = {
.init = diag_net_init,
.exit = diag_net_exit,
+ .async = true,
};
static int __init sock_diag_init(void)
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index f2d0462611c3..4f47f92459cc 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -32,6 +32,9 @@ static int max_skb_frags = MAX_SKB_FRAGS;
static int net_msg_warn; /* Unused, but still a sysctl */
+int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
+EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
+
#ifdef CONFIG_RPS
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -513,6 +516,15 @@ static struct ctl_table net_core_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
},
+ {
+ .procname = "fb_tunnels_only_for_init_net",
+ .data = &sysctl_fb_tunnels_only_for_init_net,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
{ }
};
@@ -572,6 +584,7 @@ static __net_exit void sysctl_core_net_exit(struct net *net)
static __net_initdata struct pernet_operations sysctl_core_ops = {
.init = sysctl_core_net_init,
.exit = sysctl_core_net_exit,
+ .async = true,
};
static __init int sysctl_core_init(void)