summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/ip-sysctl.rst29
-rw-r--r--include/net/inet_hashtables.h6
-rw-r--r--include/net/netns/ipv4.h1
-rw-r--r--net/dccp/proto.c2
-rw-r--r--net/ipv4/inet_hashtables.c47
-rw-r--r--net/ipv4/sysctl_net_ipv4.c39
-rw-r--r--net/ipv4/tcp.c1
-rw-r--r--net/ipv4/tcp_ipv4.c38
-rw-r--r--net/ipv4/tcp_minisocks.c9
9 files changed, 164 insertions, 8 deletions
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index a759872a2883..e7b3fa7bb3f7 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -1040,6 +1040,35 @@ tcp_challenge_ack_limit - INTEGER
TCP stack implements per TCP socket limits anyway.
Default: INT_MAX (unlimited)
+tcp_ehash_entries - INTEGER
+ Show the number of hash buckets for TCP sockets in the current
+ networking namespace.
+
+ A negative value means the networking namespace does not own its
+ hash buckets and shares the initial networking namespace's one.
+
+tcp_child_ehash_entries - INTEGER
+ Control the number of hash buckets for TCP sockets in the child
+ networking namespace, which must be set before clone() or unshare().
+
+ If the value is not 0, the kernel uses a value rounded up to 2^n
+ as the actual hash bucket size. 0 is a special value, meaning
+ the child networking namespace will share the initial networking
+ namespace's hash buckets.
+
+ Note that the child will use the global one in case the kernel
+ fails to allocate enough memory. In addition, the global hash
+ buckets are spread over available NUMA nodes, but the allocation
+ of the child hash table depends on the current process's NUMA
+ policy, which could result in performance differences.
+
+ Note also that the default value of tcp_max_tw_buckets and
+ tcp_max_syn_backlog depend on the hash bucket size.
+
+ Possible values: 0, 2^n (n: 0 - 24 (16Mi))
+
+ Default: 0
+
UDP variables
=============
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 520dd894b73d..9121ccab1fa1 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -168,6 +168,8 @@ struct inet_hashinfo {
/* The 2nd listener table hashed by local port and address */
unsigned int lhash2_mask;
struct inet_listen_hashbucket *lhash2;
+
+ bool pernet;
};
static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk)
@@ -214,6 +216,10 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
hashinfo->ehash_locks = NULL;
}
+struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
+ unsigned int ehash_entries);
+void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo);
+
struct inet_bind_bucket *
inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 2c7df93e3403..1b8004679445 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -171,6 +171,7 @@ struct netns_ipv4 {
int sysctl_tcp_pacing_ca_ratio;
int sysctl_tcp_wmem[3];
int sysctl_tcp_rmem[3];
+ unsigned int sysctl_tcp_child_ehash_entries;
unsigned long sysctl_tcp_comp_sack_delay_ns;
unsigned long sysctl_tcp_comp_sack_slack_ns;
int sysctl_max_syn_backlog;
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 7cd4a6cc99fc..c548ca3e9b0e 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1197,6 +1197,8 @@ static int __init dccp_init(void)
INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
}
+ dccp_hashinfo.pernet = false;
+
rc = dccp_mib_init();
if (rc)
goto out_free_dccp_bhash2;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index c440de998910..74e64aad5114 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -1145,3 +1145,50 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
return 0;
}
EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
+
+struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
+ unsigned int ehash_entries)
+{
+ struct inet_hashinfo *new_hashinfo;
+ int i;
+
+ new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL);
+ if (!new_hashinfo)
+ goto err;
+
+ new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket),
+ GFP_KERNEL_ACCOUNT);
+ if (!new_hashinfo->ehash)
+ goto free_hashinfo;
+
+ new_hashinfo->ehash_mask = ehash_entries - 1;
+
+ if (inet_ehash_locks_alloc(new_hashinfo))
+ goto free_ehash;
+
+ for (i = 0; i < ehash_entries; i++)
+ INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i);
+
+ new_hashinfo->pernet = true;
+
+ return new_hashinfo;
+
+free_ehash:
+ vfree(new_hashinfo->ehash);
+free_hashinfo:
+ kfree(new_hashinfo);
+err:
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc);
+
+void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
+{
+ if (!hashinfo->pernet)
+ return;
+
+ inet_ehash_locks_free(hashinfo);
+ vfree(hashinfo->ehash);
+ kfree(hashinfo);
+}
+EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4d7c110c772f..9b8a6db7a66b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -39,6 +39,7 @@ static u32 u32_max_div_HZ = UINT_MAX / HZ;
static int one_day_secs = 24 * 3600;
static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
+static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -382,6 +383,29 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl,
return ret;
}
+static int proc_tcp_ehash_entries(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_tcp_child_ehash_entries);
+ struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
+ int tcp_ehash_entries;
+ struct ctl_table tbl;
+
+ tcp_ehash_entries = hinfo->ehash_mask + 1;
+
+ /* A negative number indicates that the child netns
+ * shares the global ehash.
+ */
+ if (!net_eq(net, &init_net) && !hinfo->pernet)
+ tcp_ehash_entries *= -1;
+
+ tbl.data = &tcp_ehash_entries;
+ tbl.maxlen = sizeof(int);
+
+ return proc_dointvec(&tbl, write, buffer, lenp, ppos);
+}
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
void *buffer, size_t *lenp,
@@ -1321,6 +1345,21 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = SYSCTL_ONE,
},
{
+ .procname = "tcp_ehash_entries",
+ .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries,
+ .mode = 0444,
+ .proc_handler = proc_tcp_ehash_entries,
+ },
+ {
+ .procname = "tcp_child_ehash_entries",
+ .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_child_ehash_entries_max,
+ },
+ {
.procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min,
.maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8230be00ecca..829beee3fa32 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4790,6 +4790,7 @@ void __init tcp_init(void)
INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
}
+ tcp_hashinfo.pernet = false;
cnt = tcp_hashinfo.ehash_mask + 1;
sysctl_tcp_max_orphans = cnt / 2;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 73e6854ea662..6376ad915765 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3110,10 +3110,38 @@ static void __net_exit tcp_sk_exit(struct net *net)
net->ipv4.tcp_congestion_control->owner);
}
-static int __net_init tcp_sk_init(struct net *net)
+static void __net_init tcp_set_hashinfo(struct net *net)
{
- int cnt;
+ struct inet_hashinfo *hinfo;
+ unsigned int ehash_entries;
+ struct net *old_net;
+
+ if (net_eq(net, &init_net))
+ goto fallback;
+
+ old_net = current->nsproxy->net_ns;
+ ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
+ if (!ehash_entries)
+ goto fallback;
+
+ ehash_entries = roundup_pow_of_two(ehash_entries);
+ hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
+ if (!hinfo) {
+ pr_warn("Failed to allocate TCP ehash (entries: %u) "
+ "for a netns, fallback to the global one\n",
+ ehash_entries);
+fallback:
+ hinfo = &tcp_hashinfo;
+ ehash_entries = tcp_hashinfo.ehash_mask + 1;
+ }
+
+ net->ipv4.tcp_death_row.hashinfo = hinfo;
+ net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
+ net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
+}
+static int __net_init tcp_sk_init(struct net *net)
+{
net->ipv4.sysctl_tcp_ecn = 2;
net->ipv4.sysctl_tcp_ecn_fallback = 1;
@@ -3140,11 +3168,8 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
- cnt = tcp_hashinfo.ehash_mask + 1;
- net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
- net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
+ tcp_set_hashinfo(net);
- net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
net->ipv4.sysctl_tcp_sack = 1;
net->ipv4.sysctl_tcp_window_scaling = 1;
net->ipv4.sysctl_tcp_timestamps = 1;
@@ -3209,6 +3234,7 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
tcp_twsk_purge(net_exit_list, AF_INET);
list_for_each_entry(net, net_exit_list, exit_list) {
+ inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
tcp_fastopen_ctx_destroy(net);
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 98c576d4b671..442838ab0253 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -349,6 +349,7 @@ EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
void tcp_twsk_purge(struct list_head *net_exit_list, int family)
{
+ bool purged_once = false;
struct net *net;
list_for_each_entry(net, net_exit_list, exit_list) {
@@ -356,8 +357,12 @@ void tcp_twsk_purge(struct list_head *net_exit_list, int family)
if (refcount_read(&net->ipv4.tcp_death_row.tw_refcount) == 1)
continue;
- inet_twsk_purge(&tcp_hashinfo, family);
- break;
+ if (net->ipv4.tcp_death_row.hashinfo->pernet) {
+ inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family);
+ } else if (!purged_once) {
+ inet_twsk_purge(&tcp_hashinfo, family);
+ purged_once = true;
+ }
}
}
EXPORT_SYMBOL_GPL(tcp_twsk_purge);