From ff1c08e1f74b6864854c39be48aa799a6a2e4d2b Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Tue, 29 Oct 2019 16:43:07 +0100 Subject: bpf: Change size to u64 for bpf_map_{area_alloc, charge_init}() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functions bpf_map_area_alloc() and bpf_map_charge_init() prior this commit passed the size parameter as size_t. In this commit this is changed to u64. All users of these functions avoid size_t overflows on 32-bit systems, by explicitly using u64 when calculating the allocation size and memory charge cost. However, since the result was narrowed by the size_t when passing size and cost to the functions, the overflow handling was in vain. Instead of changing all call sites to size_t and handle overflow at the call site, the parameter is changed to u64 and checked in the functions above. Fixes: d407bd25a204 ("bpf: don't trigger OOM killer under pressure with map alloc") Fixes: c85d69135a91 ("bpf: move memory size checks to bpf_map_charge_init()") Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20191029154307.23053-1-bjorn.topel@gmail.com --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5b9d22338606..3bf3835d0e86 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -656,11 +656,11 @@ void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); -int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size); +int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size); void bpf_map_charge_finish(struct bpf_map_memory *mem); void bpf_map_charge_move(struct bpf_map_memory *dst, struct bpf_map_memory *src); -void *bpf_map_area_alloc(size_t size, int numa_node); +void *bpf_map_area_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); -- cgit v1.2.3-70-g09d2 From 797060ec427c83ce4a64a0278a1e6077dfed683a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 1 Nov 2019 22:21:54 -0400 Subject: radix tree: Remove radix_tree_iter_find This API is unsafe to use under the RCU lock. With no in-tree users remaining, remove it to prevent future bugs. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/radix-tree.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index b5116013f27e..63e62372443a 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -315,24 +315,6 @@ radix_tree_iter_lookup(const struct radix_tree_root *root, return radix_tree_next_chunk(root, iter, RADIX_TREE_ITER_CONTIG); } -/** - * radix_tree_iter_find - find a present entry - * @root: radix tree root - * @iter: iterator state - * @index: start location - * - * This function returns the slot containing the entry with the lowest index - * which is at least @index. If @index is larger than any present entry, this - * function returns NULL. The @iter is updated to describe the entry found. - */ -static inline void __rcu ** -radix_tree_iter_find(const struct radix_tree_root *root, - struct radix_tree_iter *iter, unsigned long index) -{ - radix_tree_iter_init(iter, index); - return radix_tree_next_chunk(root, iter, 0); -} - /** * radix_tree_iter_retry - retry this chunk of the iteration * @iter: iterator state -- cgit v1.2.3-70-g09d2 From f6341c5af4e6e15041be39976d16deca789555fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 3 Nov 2019 06:36:43 -0500 Subject: idr: Fix integer overflow in idr_for_each_entry If there is an entry at INT_MAX then idr_for_each_entry() will increment id after handling it. This is undefined behaviour, and is caught by UBSAN. Adding 1U to id forces the operation to be carried out as an unsigned addition which (when assigned to id) will result in INT_MIN. Since there is never an entry stored at INT_MIN, idr_get_next() will return NULL, ending the loop as expected. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/idr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/idr.h b/include/linux/idr.h index ee7abae143d3..dc09bd646bcb 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -185,7 +185,7 @@ static inline void idr_preload_end(void) * is convenient for a "not found" value. */ #define idr_for_each_entry(idr, entry, id) \ - for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; ++id) + for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; id += 1U) /** * idr_for_each_entry_ul() - Iterate over an IDR's elements of a given type. -- cgit v1.2.3-70-g09d2 From 250367c59e6ba0d79d702a059712d66edacd4a1a Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 31 Oct 2019 11:06:24 +0100 Subject: netfilter: nf_tables: Align nft_expr private data to 64-bit Invoking the following commands on a 32-bit architecture with strict alignment requirements (such as an ARMv7-based Raspberry Pi) results in an alignment exception: # nft add table ip test-ip4 # nft add chain ip test-ip4 output { type filter hook output priority 0; } # nft add rule ip test-ip4 output quota 1025 bytes Alignment trap: not handling instruction e1b26f9f at [<7f4473f8>] Unhandled fault: alignment exception (0x001) at 0xb832e824 Internal error: : 1 [#1] PREEMPT SMP ARM Hardware name: BCM2835 [<7f4473fc>] (nft_quota_do_init [nft_quota]) [<7f447448>] (nft_quota_init [nft_quota]) [<7f4260d0>] (nf_tables_newrule [nf_tables]) [<7f4168dc>] (nfnetlink_rcv_batch [nfnetlink]) [<7f416bd0>] (nfnetlink_rcv [nfnetlink]) [<8078b334>] (netlink_unicast) [<8078b664>] (netlink_sendmsg) [<8071b47c>] (sock_sendmsg) [<8071bd18>] (___sys_sendmsg) [<8071ce3c>] (__sys_sendmsg) [<8071ce94>] (sys_sendmsg) The reason is that nft_quota_do_init() calls atomic64_set() on an atomic64_t which is only aligned to 32-bit, not 64-bit, because it succeeds struct nft_expr in memory which only contains a 32-bit pointer. Fix by aligning the nft_expr private data to 64-bit. Fixes: 96518518cc41 ("netfilter: add nftables") Signed-off-by: Lukas Wunner Cc: stable@vger.kernel.org # v3.13+ Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 001d294edf57..2d0275f13bbf 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -820,7 +820,8 @@ struct nft_expr_ops { */ struct nft_expr { const struct nft_expr_ops *ops; - unsigned char data[]; + unsigned char data[] + __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_expr_priv(const struct nft_expr *expr) -- cgit v1.2.3-70-g09d2 From 3926a3a025d443f6b7a58a2c0c33e7d77c1ca935 Mon Sep 17 00:00:00 2001 From: Yegor Yefremov Date: Thu, 19 Sep 2019 15:53:04 +0200 Subject: can: don't use deprecated license identifiers The "GPL-2.0" license identifier changed to "GPL-2.0-only" in SPDX v3.0. Signed-off-by: Yegor Yefremov Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can.h | 2 +- include/uapi/linux/can/bcm.h | 2 +- include/uapi/linux/can/error.h | 2 +- include/uapi/linux/can/gw.h | 2 +- include/uapi/linux/can/j1939.h | 2 +- include/uapi/linux/can/netlink.h | 2 +- include/uapi/linux/can/raw.h | 2 +- include/uapi/linux/can/vxcan.h | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/can.h b/include/uapi/linux/can.h index 1e988fdeba34..6a6d2c7655ff 100644 --- a/include/uapi/linux/can.h +++ b/include/uapi/linux/can.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: ((GPL-2.0-only WITH Linux-syscall-note) OR BSD-3-Clause) */ /* * linux/can.h * diff --git a/include/uapi/linux/can/bcm.h b/include/uapi/linux/can/bcm.h index 0fb328d93148..dd2b925b09ac 100644 --- a/include/uapi/linux/can/bcm.h +++ b/include/uapi/linux/can/bcm.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: ((GPL-2.0-only WITH Linux-syscall-note) OR BSD-3-Clause) */ /* * linux/can/bcm.h * diff --git a/include/uapi/linux/can/error.h b/include/uapi/linux/can/error.h index bfc4b5d22a5e..34633283de64 100644 --- a/include/uapi/linux/can/error.h +++ b/include/uapi/linux/can/error.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: ((GPL-2.0-only WITH Linux-syscall-note) OR BSD-3-Clause) */ /* * linux/can/error.h * diff --git a/include/uapi/linux/can/gw.h b/include/uapi/linux/can/gw.h index 3aea5388c8e4..c2190bbe21d8 100644 --- a/include/uapi/linux/can/gw.h +++ b/include/uapi/linux/can/gw.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: ((GPL-2.0-only WITH Linux-syscall-note) OR BSD-3-Clause) */ /* * linux/can/gw.h * diff --git a/include/uapi/linux/can/j1939.h b/include/uapi/linux/can/j1939.h index c32325342d30..df6e821075c1 100644 --- a/include/uapi/linux/can/j1939.h +++ b/include/uapi/linux/can/j1939.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * j1939.h * diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index 1bc70d3a4d39..6f598b73839e 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* * linux/can/netlink.h * diff --git a/include/uapi/linux/can/raw.h b/include/uapi/linux/can/raw.h index be3b36e7ff61..6a11d308eb5c 100644 --- a/include/uapi/linux/can/raw.h +++ b/include/uapi/linux/can/raw.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* SPDX-License-Identifier: ((GPL-2.0-only WITH Linux-syscall-note) OR BSD-3-Clause) */ /* * linux/can/raw.h * diff --git a/include/uapi/linux/can/vxcan.h b/include/uapi/linux/can/vxcan.h index 066812d118a2..4fa9d8777a07 100644 --- a/include/uapi/linux/can/vxcan.h +++ b/include/uapi/linux/can/vxcan.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ #ifndef _UAPI_CAN_VXCAN_H #define _UAPI_CAN_VXCAN_H -- cgit v1.2.3-70-g09d2 From fa729c4df558936b4a1a7b3e2234011f44ede28b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 31 Oct 2019 12:36:08 +0100 Subject: clone3: validate stack arguments Validate the stack arguments and setup the stack depening on whether or not it is growing down or up. Legacy clone() required userspace to know in which direction the stack is growing and pass down the stack pointer appropriately. To make things more confusing microblaze uses a variant of the clone() syscall selected by CONFIG_CLONE_BACKWARDS3 that takes an additional stack_size argument. IA64 has a separate clone2() syscall which also takes an additional stack_size argument. Finally, parisc has a stack that is growing upwards. Userspace therefore has a lot nasty code like the following: #define __STACK_SIZE (8 * 1024 * 1024) pid_t sys_clone(int (*fn)(void *), void *arg, int flags, int *pidfd) { pid_t ret; void *stack; stack = malloc(__STACK_SIZE); if (!stack) return -ENOMEM; #ifdef __ia64__ ret = __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg, pidfd); #elif defined(__parisc__) /* stack grows up */ ret = clone(fn, stack, flags | SIGCHLD, arg, pidfd); #else ret = clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg, pidfd); #endif return ret; } or even crazier variants such as [3]. With clone3() we have the ability to validate the stack. We can check that when stack_size is passed, the stack pointer is valid and the other way around. We can also check that the memory area userspace gave us is fine to use via access_ok(). Furthermore, we probably should not require userspace to know in which direction the stack is growing. It is easy for us to do this in the kernel and I couldn't find the original reasoning behind exposing this detail to userspace. /* Intentional user visible API change */ clone3() was released with 5.3. Currently, it is not documented and very unclear to userspace how the stack and stack_size argument have to be passed. After talking to glibc folks we concluded that trying to change clone3() to setup the stack instead of requiring userspace to do this is the right course of action. Note, that this is an explicit change in user visible behavior we introduce with this patch. If it breaks someone's use-case we will revert! (And then e.g. place the new behavior under an appropriate flag.) Breaking someone's use-case is very unlikely though. First, neither glibc nor musl currently expose a wrapper for clone3(). Second, there is no real motivation for anyone to use clone3() directly since it does not provide features that legacy clone doesn't. New features for clone3() will first happen in v5.5 which is why v5.4 is still a good time to try and make that change now and backport it to v5.3. Searches on [4] did not reveal any packages calling clone3(). [1]: https://lore.kernel.org/r/CAG48ez3q=BeNcuVTKBN79kJui4vC6nw0Bfq6xc-i0neheT17TA@mail.gmail.com [2]: https://lore.kernel.org/r/20191028172143.4vnnjpdljfnexaq5@wittgenstein [3]: https://github.com/systemd/systemd/blob/5238e9575906297608ff802a27e2ff9effa3b338/src/basic/raw-clone.h#L31 [4]: https://codesearch.debian.net Fixes: 7f192e3cd316 ("fork: add clone3") Cc: Kees Cook Cc: Jann Horn Cc: David Howells Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Florian Weimer Cc: Peter Zijlstra Cc: linux-api@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: # 5.3 Cc: GNU C Library Signed-off-by: Christian Brauner Acked-by: Arnd Bergmann Acked-by: Aleksa Sarai Link: https://lore.kernel.org/r/20191031113608.20713-1-christian.brauner@ubuntu.com --- include/uapi/linux/sched.h | 4 ++++ kernel/fork.c | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 99335e1f4a27..25b4fa00bad1 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -51,6 +51,10 @@ * sent when the child exits. * @stack: Specify the location of the stack for the * child process. + * Note, @stack is expected to point to the + * lowest address. The stack direction will be + * determined by the kernel and set up + * appropriately based on @stack_size. * @stack_size: The size of the stack for the child process. * @tls: If CLONE_SETTLS is set, the tls descriptor * is set to tls. diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..55af6931c6ec 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2561,7 +2561,35 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, return 0; } -static bool clone3_args_valid(const struct kernel_clone_args *kargs) +/** + * clone3_stack_valid - check and prepare stack + * @kargs: kernel clone args + * + * Verify that the stack arguments userspace gave us are sane. + * In addition, set the stack direction for userspace since it's easy for us to + * determine. + */ +static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) +{ + if (kargs->stack == 0) { + if (kargs->stack_size > 0) + return false; + } else { + if (kargs->stack_size == 0) + return false; + + if (!access_ok((void __user *)kargs->stack, kargs->stack_size)) + return false; + +#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64) + kargs->stack += kargs->stack_size; +#endif + } + + return true; +} + +static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* * All lower bits of the flag word are taken. @@ -2581,6 +2609,9 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs) kargs->exit_signal) return false; + if (!clone3_stack_valid(kargs)) + return false; + return true; } -- cgit v1.2.3-70-g09d2 From 0d6eeb1fd625272bd60d25f2d5e116cf582fc7dc Mon Sep 17 00:00:00 2001 From: Charles Machalow Date: Mon, 4 Nov 2019 22:15:10 -0800 Subject: nvme: change nvme_passthru_cmd64 to explicitly mark rsvd Changing nvme_passthru_cmd64 to add a field: rsvd2. This field is an explicit marker for the padding space added on certain platforms as a result of the enlargement of the result field from 32 bit to 64 bits in size, and fixes differences in struct size when using compat ioctl for 32-bit binaries on 64-bit architecture. Fixes: 65e68edce0db ("nvme: allow 64-bit results in passthru commands") Reviewed-by: Christoph Hellwig Signed-off-by: Charles Machalow [changelog] Signed-off-by: Keith Busch --- include/uapi/linux/nvme_ioctl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index e168dc59e9a0..d99b5a772698 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -63,6 +63,7 @@ struct nvme_passthru_cmd64 { __u32 cdw14; __u32 cdw15; __u32 timeout_ms; + __u32 rsvd2; __u64 result; }; -- cgit v1.2.3-70-g09d2 From 1899bb325149e481de31a4f32b59ea6f24e176ea Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Fri, 1 Nov 2019 21:56:42 -0700 Subject: bonding: fix state transition issue in link monitoring Since de77ecd4ef02 ("bonding: improve link-status update in mii-monitoring"), the bonding driver has utilized two separate variables to indicate the next link state a particular slave should transition to. Each is used to communicate to a different portion of the link state change commit logic; one to the bond_miimon_commit function itself, and another to the state transition logic. Unfortunately, the two variables can become unsynchronized, resulting in incorrect link state transitions within bonding. This can cause slaves to become stuck in an incorrect link state until a subsequent carrier state transition. The issue occurs when a special case in bond_slave_netdev_event sets slave->link directly to BOND_LINK_FAIL. On the next pass through bond_miimon_inspect after the slave goes carrier up, the BOND_LINK_FAIL case will set the proposed next state (link_new_state) to BOND_LINK_UP, but the new_link to BOND_LINK_DOWN. The setting of the final link state from new_link comes after that from link_new_state, and so the slave will end up incorrectly in _DOWN state. Resolve this by combining the two variables into one. Reported-by: Aleksei Zakharov Reported-by: Sha Zhang Cc: Mahesh Bandewar Fixes: de77ecd4ef02 ("bonding: improve link-status update in mii-monitoring") Signed-off-by: Jay Vosburgh Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 44 ++++++++++++++++++++--------------------- include/net/bonding.h | 3 +-- 2 files changed, 23 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 480f9459b402..62f65573eb04 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2083,8 +2083,7 @@ static int bond_miimon_inspect(struct bonding *bond) ignore_updelay = !rcu_dereference(bond->curr_active_slave); bond_for_each_slave_rcu(bond, slave, iter) { - slave->new_link = BOND_LINK_NOCHANGE; - slave->link_new_state = slave->link; + bond_propose_link_state(slave, BOND_LINK_NOCHANGE); link_state = bond_check_dev_link(bond, slave->dev, 0); @@ -2118,7 +2117,7 @@ static int bond_miimon_inspect(struct bonding *bond) } if (slave->delay <= 0) { - slave->new_link = BOND_LINK_DOWN; + bond_propose_link_state(slave, BOND_LINK_DOWN); commit++; continue; } @@ -2155,7 +2154,7 @@ static int bond_miimon_inspect(struct bonding *bond) slave->delay = 0; if (slave->delay <= 0) { - slave->new_link = BOND_LINK_UP; + bond_propose_link_state(slave, BOND_LINK_UP); commit++; ignore_updelay = false; continue; @@ -2193,7 +2192,7 @@ static void bond_miimon_commit(struct bonding *bond) struct slave *slave, *primary; bond_for_each_slave(bond, slave, iter) { - switch (slave->new_link) { + switch (slave->link_new_state) { case BOND_LINK_NOCHANGE: /* For 802.3ad mode, check current slave speed and * duplex again in case its port was disabled after @@ -2265,8 +2264,8 @@ static void bond_miimon_commit(struct bonding *bond) default: slave_err(bond->dev, slave->dev, "invalid new link %d on slave\n", - slave->new_link); - slave->new_link = BOND_LINK_NOCHANGE; + slave->link_new_state); + bond_propose_link_state(slave, BOND_LINK_NOCHANGE); continue; } @@ -2674,13 +2673,13 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) bond_for_each_slave_rcu(bond, slave, iter) { unsigned long trans_start = dev_trans_start(slave->dev); - slave->new_link = BOND_LINK_NOCHANGE; + bond_propose_link_state(slave, BOND_LINK_NOCHANGE); if (slave->link != BOND_LINK_UP) { if (bond_time_in_interval(bond, trans_start, 1) && bond_time_in_interval(bond, slave->last_rx, 1)) { - slave->new_link = BOND_LINK_UP; + bond_propose_link_state(slave, BOND_LINK_UP); slave_state_changed = 1; /* primary_slave has no meaning in round-robin @@ -2705,7 +2704,7 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) if (!bond_time_in_interval(bond, trans_start, 2) || !bond_time_in_interval(bond, slave->last_rx, 2)) { - slave->new_link = BOND_LINK_DOWN; + bond_propose_link_state(slave, BOND_LINK_DOWN); slave_state_changed = 1; if (slave->link_failure_count < UINT_MAX) @@ -2736,8 +2735,8 @@ static void bond_loadbalance_arp_mon(struct bonding *bond) goto re_arm; bond_for_each_slave(bond, slave, iter) { - if (slave->new_link != BOND_LINK_NOCHANGE) - slave->link = slave->new_link; + if (slave->link_new_state != BOND_LINK_NOCHANGE) + slave->link = slave->link_new_state; } if (slave_state_changed) { @@ -2760,9 +2759,9 @@ re_arm: } /* Called to inspect slaves for active-backup mode ARP monitor link state - * changes. Sets new_link in slaves to specify what action should take - * place for the slave. Returns 0 if no changes are found, >0 if changes - * to link states must be committed. + * changes. Sets proposed link state in slaves to specify what action + * should take place for the slave. Returns 0 if no changes are found, >0 + * if changes to link states must be committed. * * Called with rcu_read_lock held. */ @@ -2774,12 +2773,12 @@ static int bond_ab_arp_inspect(struct bonding *bond) int commit = 0; bond_for_each_slave_rcu(bond, slave, iter) { - slave->new_link = BOND_LINK_NOCHANGE; + bond_propose_link_state(slave, BOND_LINK_NOCHANGE); last_rx = slave_last_rx(bond, slave); if (slave->link != BOND_LINK_UP) { if (bond_time_in_interval(bond, last_rx, 1)) { - slave->new_link = BOND_LINK_UP; + bond_propose_link_state(slave, BOND_LINK_UP); commit++; } continue; @@ -2807,7 +2806,7 @@ static int bond_ab_arp_inspect(struct bonding *bond) if (!bond_is_active_slave(slave) && !rcu_access_pointer(bond->current_arp_slave) && !bond_time_in_interval(bond, last_rx, 3)) { - slave->new_link = BOND_LINK_DOWN; + bond_propose_link_state(slave, BOND_LINK_DOWN); commit++; } @@ -2820,7 +2819,7 @@ static int bond_ab_arp_inspect(struct bonding *bond) if (bond_is_active_slave(slave) && (!bond_time_in_interval(bond, trans_start, 2) || !bond_time_in_interval(bond, last_rx, 2))) { - slave->new_link = BOND_LINK_DOWN; + bond_propose_link_state(slave, BOND_LINK_DOWN); commit++; } } @@ -2840,7 +2839,7 @@ static void bond_ab_arp_commit(struct bonding *bond) struct slave *slave; bond_for_each_slave(bond, slave, iter) { - switch (slave->new_link) { + switch (slave->link_new_state) { case BOND_LINK_NOCHANGE: continue; @@ -2890,8 +2889,9 @@ static void bond_ab_arp_commit(struct bonding *bond) continue; default: - slave_err(bond->dev, slave->dev, "impossible: new_link %d on slave\n", - slave->new_link); + slave_err(bond->dev, slave->dev, + "impossible: link_new_state %d on slave\n", + slave->link_new_state); continue; } diff --git a/include/net/bonding.h b/include/net/bonding.h index 1afc125014da..3d56b026bb9e 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -159,7 +159,6 @@ struct slave { unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS]; s8 link; /* one of BOND_LINK_XXXX */ s8 link_new_state; /* one of BOND_LINK_XXXX */ - s8 new_link; u8 backup:1, /* indicates backup slave. Value corresponds with BOND_STATE_ACTIVE and BOND_STATE_BACKUP */ inactive:1, /* indicates inactive slave */ @@ -549,7 +548,7 @@ static inline void bond_propose_link_state(struct slave *slave, int state) static inline void bond_commit_link_state(struct slave *slave, bool notify) { - if (slave->link == slave->link_new_state) + if (slave->link_new_state == BOND_LINK_NOCHANGE) return; slave->link = slave->link_new_state; -- cgit v1.2.3-70-g09d2 From 59eb87cb52c9f7164804bc8639c4d03ba9b0c169 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Sat, 2 Nov 2019 14:17:47 +0000 Subject: net: sched: prevent duplicate flower rules from tcf_proto destroy race When a new filter is added to cls_api, the function tcf_chain_tp_insert_unique() looks up the protocol/priority/chain to determine if the tcf_proto is duplicated in the chain's hashtable. It then creates a new entry or continues with an existing one. In cls_flower, this allows the function fl_ht_insert_unque to determine if a filter is a duplicate and reject appropriately, meaning that the duplicate will not be passed to drivers via the offload hooks. However, when a tcf_proto is destroyed it is removed from its chain before a hardware remove hook is hit. This can lead to a race whereby the driver has not received the remove message but duplicate flows can be accepted. This, in turn, can lead to the offload driver receiving incorrect duplicate flows and out of order add/delete messages. Prevent duplicates by utilising an approach suggested by Vlad Buslov. A hash table per block stores each unique chain/protocol/prio being destroyed. This entry is only removed when the full destroy (and hardware offload) has completed. If a new flow is being added with the same identiers as a tc_proto being detroyed, then the add request is replayed until the destroy is complete. Fixes: 8b64678e0af8 ("net: sched: refactor tp insert/delete for concurrent execution") Signed-off-by: John Hurley Signed-off-by: Vlad Buslov Reviewed-by: Simon Horman Reported-by: Louis Peens Signed-off-by: David S. Miller --- include/net/sch_generic.h | 4 +++ net/sched/cls_api.c | 83 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 83 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 637548d54b3e..d80acda231ae 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -362,6 +363,7 @@ struct tcf_proto { bool deleting; refcount_t refcnt; struct rcu_head rcu; + struct hlist_node destroy_ht_node; }; struct qdisc_skb_cb { @@ -414,6 +416,8 @@ struct tcf_block { struct list_head filter_chain_list; } chain0; struct rcu_head rcu; + DECLARE_HASHTABLE(proto_destroy_ht, 7); + struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. */ }; #ifdef CONFIG_PROVE_LOCKING diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8717c0b26c90..20d60b8fcb70 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,62 @@ static LIST_HEAD(tcf_proto_base); /* Protects list of registered TC modules. It is pure SMP lock. */ static DEFINE_RWLOCK(cls_mod_lock); +static u32 destroy_obj_hashfn(const struct tcf_proto *tp) +{ + return jhash_3words(tp->chain->index, tp->prio, + (__force __u32)tp->protocol, 0); +} + +static void tcf_proto_signal_destroying(struct tcf_chain *chain, + struct tcf_proto *tp) +{ + struct tcf_block *block = chain->block; + + mutex_lock(&block->proto_destroy_lock); + hash_add_rcu(block->proto_destroy_ht, &tp->destroy_ht_node, + destroy_obj_hashfn(tp)); + mutex_unlock(&block->proto_destroy_lock); +} + +static bool tcf_proto_cmp(const struct tcf_proto *tp1, + const struct tcf_proto *tp2) +{ + return tp1->chain->index == tp2->chain->index && + tp1->prio == tp2->prio && + tp1->protocol == tp2->protocol; +} + +static bool tcf_proto_exists_destroying(struct tcf_chain *chain, + struct tcf_proto *tp) +{ + u32 hash = destroy_obj_hashfn(tp); + struct tcf_proto *iter; + bool found = false; + + rcu_read_lock(); + hash_for_each_possible_rcu(chain->block->proto_destroy_ht, iter, + destroy_ht_node, hash) { + if (tcf_proto_cmp(tp, iter)) { + found = true; + break; + } + } + rcu_read_unlock(); + + return found; +} + +static void +tcf_proto_signal_destroyed(struct tcf_chain *chain, struct tcf_proto *tp) +{ + struct tcf_block *block = chain->block; + + mutex_lock(&block->proto_destroy_lock); + if (hash_hashed(&tp->destroy_ht_node)) + hash_del_rcu(&tp->destroy_ht_node); + mutex_unlock(&block->proto_destroy_lock); +} + /* Find classifier type by string name */ static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind) @@ -234,9 +291,11 @@ static void tcf_proto_get(struct tcf_proto *tp) static void tcf_chain_put(struct tcf_chain *chain); static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held, - struct netlink_ext_ack *extack) + bool sig_destroy, struct netlink_ext_ack *extack) { tp->ops->destroy(tp, rtnl_held, extack); + if (sig_destroy) + tcf_proto_signal_destroyed(tp->chain, tp); tcf_chain_put(tp->chain); module_put(tp->ops->owner); kfree_rcu(tp, rcu); @@ -246,7 +305,7 @@ static void tcf_proto_put(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { if (refcount_dec_and_test(&tp->refcnt)) - tcf_proto_destroy(tp, rtnl_held, extack); + tcf_proto_destroy(tp, rtnl_held, true, extack); } static int walker_check_empty(struct tcf_proto *tp, void *fh, @@ -370,6 +429,7 @@ static bool tcf_chain_detach(struct tcf_chain *chain) static void tcf_block_destroy(struct tcf_block *block) { mutex_destroy(&block->lock); + mutex_destroy(&block->proto_destroy_lock); kfree_rcu(block, rcu); } @@ -545,6 +605,12 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held) mutex_lock(&chain->filter_chain_lock); tp = tcf_chain_dereference(chain->filter_chain, chain); + while (tp) { + tp_next = rcu_dereference_protected(tp->next, 1); + tcf_proto_signal_destroying(chain, tp); + tp = tp_next; + } + tp = tcf_chain_dereference(chain->filter_chain, chain); RCU_INIT_POINTER(chain->filter_chain, NULL); tcf_chain0_head_change(chain, NULL); chain->flushing = true; @@ -844,6 +910,7 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, return ERR_PTR(-ENOMEM); } mutex_init(&block->lock); + mutex_init(&block->proto_destroy_lock); init_rwsem(&block->cb_lock); flow_block_init(&block->flow_block); INIT_LIST_HEAD(&block->chain_list); @@ -1621,6 +1688,12 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain, mutex_lock(&chain->filter_chain_lock); + if (tcf_proto_exists_destroying(chain, tp_new)) { + mutex_unlock(&chain->filter_chain_lock); + tcf_proto_destroy(tp_new, rtnl_held, false, NULL); + return ERR_PTR(-EAGAIN); + } + tp = tcf_chain_tp_find(chain, &chain_info, protocol, prio, false); if (!tp) @@ -1628,10 +1701,10 @@ static struct tcf_proto *tcf_chain_tp_insert_unique(struct tcf_chain *chain, mutex_unlock(&chain->filter_chain_lock); if (tp) { - tcf_proto_destroy(tp_new, rtnl_held, NULL); + tcf_proto_destroy(tp_new, rtnl_held, false, NULL); tp_new = tp; } else if (err) { - tcf_proto_destroy(tp_new, rtnl_held, NULL); + tcf_proto_destroy(tp_new, rtnl_held, false, NULL); tp_new = ERR_PTR(err); } @@ -1669,6 +1742,7 @@ static void tcf_chain_tp_delete_empty(struct tcf_chain *chain, return; } + tcf_proto_signal_destroying(chain, tp); next = tcf_chain_dereference(chain_info.next, chain); if (tp == chain->filter_chain) tcf_chain0_head_change(chain, next); @@ -2188,6 +2262,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, err = -EINVAL; goto errout_locked; } else if (t->tcm_handle == 0) { + tcf_proto_signal_destroying(chain, tp); tcf_chain_tp_remove(chain, &chain_info, tp); mutex_unlock(&chain->filter_chain_lock); -- cgit v1.2.3-70-g09d2 From 683916f6a84023407761d843048f1aea486b2612 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 4 Nov 2019 15:36:57 -0800 Subject: net/tls: fix sk_msg trim on fallback to copy mode sk_msg_trim() tries to only update curr pointer if it falls into the trimmed region. The logic, however, does not take into the account pointer wrapping that sk_msg_iter_var_prev() does nor (as John points out) the fact that msg->sg is a ring buffer. This means that when the message was trimmed completely, the new curr pointer would have the value of MAX_MSG_FRAGS - 1, which is neither smaller than any other value, nor would it actually be correct. Special case the trimming to 0 length a little bit and rework the comparison between curr and end to take into account wrapping. This bug caused the TLS code to not copy all of the message, if zero copy filled in fewer sg entries than memcopy would need. Big thanks to Alexander Potapenko for the non-KMSAN reproducer. v2: - take into account that msg->sg is a ring buffer (John). Link: https://lore.kernel.org/netdev/20191030160542.30295-1-jakub.kicinski@netronome.com/ (v1) Fixes: d829e9c4112b ("tls: convert to generic sk_msg interface") Reported-by: syzbot+f8495bff23a879a6d0bd@syzkaller.appspotmail.com Reported-by: syzbot+6f50c99e8f6194bf363f@syzkaller.appspotmail.com Co-developed-by: John Fastabend Signed-off-by: Jakub Kicinski Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/skmsg.h | 9 ++++++--- net/core/skmsg.c | 20 +++++++++++++++----- 2 files changed, 21 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e4b3fb4bb77c..ce7055259877 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -139,6 +139,11 @@ static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes) } } +static inline u32 sk_msg_iter_dist(u32 start, u32 end) +{ + return end >= start ? end - start : end + (MAX_MSG_FRAGS - start); +} + #define sk_msg_iter_var_prev(var) \ do { \ if (var == 0) \ @@ -198,9 +203,7 @@ static inline u32 sk_msg_elem_used(const struct sk_msg *msg) if (sk_msg_full(msg)) return MAX_MSG_FRAGS; - return msg->sg.end >= msg->sg.start ? - msg->sg.end - msg->sg.start : - msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start); + return sk_msg_iter_dist(msg->sg.start, msg->sg.end); } static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index cf390e0aa73d..ad31e4e53d0a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -270,18 +270,28 @@ void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len) msg->sg.data[i].length -= trim; sk_mem_uncharge(sk, trim); + /* Adjust copybreak if it falls into the trimmed part of last buf */ + if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length) + msg->sg.copybreak = msg->sg.data[i].length; out: - /* If we trim data before curr pointer update copybreak and current - * so that any future copy operations start at new copy location. + sk_msg_iter_var_next(i); + msg->sg.end = i; + + /* If we trim data a full sg elem before curr pointer update + * copybreak and current so that any future copy operations + * start at new copy location. * However trimed data that has not yet been used in a copy op * does not require an update. */ - if (msg->sg.curr >= i) { + if (!msg->sg.size) { + msg->sg.curr = msg->sg.start; + msg->sg.copybreak = 0; + } else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >= + sk_msg_iter_dist(msg->sg.start, msg->sg.end)) { + sk_msg_iter_var_prev(i); msg->sg.curr = i; msg->sg.copybreak = msg->sg.data[i].length; } - sk_msg_iter_var_next(i); - msg->sg.end = i; } EXPORT_SYMBOL_GPL(sk_msg_trim); -- cgit v1.2.3-70-g09d2 From f75359f3ac855940c5718af10ba089b8977bf339 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Nov 2019 21:38:43 -0800 Subject: net: prevent load/store tearing on sk->sk_stamp Add a couple of READ_ONCE() and WRITE_ONCE() to prevent load-tearing and store-tearing in sock_read_timestamp() and sock_write_timestamp() This might prevent another KCSAN report. Fixes: 3a0ed3e96197 ("sock: Make sock->sk_stamp thread-safe") Signed-off-by: Eric Dumazet Cc: Deepa Dinamani Acked-by: Deepa Dinamani Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 8f9adcfac41b..718e62fbe869 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2342,7 +2342,7 @@ static inline ktime_t sock_read_timestamp(struct sock *sk) return kt; #else - return sk->sk_stamp; + return READ_ONCE(sk->sk_stamp); #endif } @@ -2353,7 +2353,7 @@ static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) sk->sk_stamp = kt; write_sequnlock(&sk->sk_stamp_seq); #else - sk->sk_stamp = kt; + WRITE_ONCE(sk->sk_stamp, kt); #endif } -- cgit v1.2.3-70-g09d2 From 169226f7e0d275c1879551f37484ef6683579a5c Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 5 Nov 2019 21:16:30 -0800 Subject: mm: thp: handle page cache THP correctly in PageTransCompoundMap We have a usecase to use tmpfs as QEMU memory backend and we would like to take the advantage of THP as well. But, our test shows the EPT is not PMD mapped even though the underlying THP are PMD mapped on host. The number showed by /sys/kernel/debug/kvm/largepage is much less than the number of PMD mapped shmem pages as the below: 7f2778200000-7f2878200000 rw-s 00000000 00:14 262232 /dev/shm/qemu_back_mem.mem.Hz2hSf (deleted) Size: 4194304 kB [snip] AnonHugePages: 0 kB ShmemPmdMapped: 579584 kB [snip] Locked: 0 kB cat /sys/kernel/debug/kvm/largepages 12 And some benchmarks do worse than with anonymous THPs. By digging into the code we figured out that commit 127393fbe597 ("mm: thp: kvm: fix memory corruption in KVM with THP enabled") checks if there is a single PTE mapping on the page for anonymous THP when setting up EPT map. But the _mapcount < 0 check doesn't work for page cache THP since every subpage of page cache THP would get _mapcount inc'ed once it is PMD mapped, so PageTransCompoundMap() always returns false for page cache THP. This would prevent KVM from setting up PMD mapped EPT entry. So we need handle page cache THP correctly. However, when page cache THP's PMD gets split, kernel just remove the map instead of setting up PTE map like what anonymous THP does. Before KVM calls get_user_pages() the subpages may get PTE mapped even though it is still a THP since the page cache THP may be mapped by other processes at the mean time. Checking its _mapcount and whether the THP has PTE mapped or not. Although this may report some false negative cases (PTE mapped by other processes), it looks not trivial to make this accurate. With this fix /sys/kernel/debug/kvm/largepage would show reasonable pages are PMD mapped by EPT as the below: 7fbeaee00000-7fbfaee00000 rw-s 00000000 00:14 275464 /dev/shm/qemu_back_mem.mem.SKUvat (deleted) Size: 4194304 kB [snip] AnonHugePages: 0 kB ShmemPmdMapped: 557056 kB [snip] Locked: 0 kB cat /sys/kernel/debug/kvm/largepages 271 And the benchmarks are as same as anonymous THPs. [yang.shi@linux.alibaba.com: v4] Link: http://lkml.kernel.org/r/1571865575-42913-1-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1571769577-89735-1-git-send-email-yang.shi@linux.alibaba.com Fixes: dd78fedde4b9 ("rmap: support file thp") Signed-off-by: Yang Shi Reported-by: Gang Deng Tested-by: Gang Deng Suggested-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Matthew Wilcox Cc: [4.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 ----- include/linux/mm_types.h | 5 +++++ include/linux/page-flags.h | 20 ++++++++++++++++++-- 3 files changed, 23 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index cc292273e6ba..a2adf95b3f9c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -695,11 +695,6 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) extern void kvfree(const void *addr); -static inline atomic_t *compound_mapcount_ptr(struct page *page) -{ - return &page[1].compound_mapcount; -} - static inline int compound_mapcount(struct page *page) { VM_BUG_ON_PAGE(!PageCompound(page), page); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2222fa795284..270aa8fd2800 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -221,6 +221,11 @@ struct page { #endif } _struct_page_alignment; +static inline atomic_t *compound_mapcount_ptr(struct page *page) +{ + return &page[1].compound_mapcount; +} + /* * Used for sizing the vmemmap region on some architectures */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f91cb8898ff0..1bf83c8fcaa7 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -622,12 +622,28 @@ static inline int PageTransCompound(struct page *page) * * Unlike PageTransCompound, this is safe to be called only while * split_huge_pmd() cannot run from under us, like if protected by the - * MMU notifier, otherwise it may result in page->_mapcount < 0 false + * MMU notifier, otherwise it may result in page->_mapcount check false * positives. + * + * We have to treat page cache THP differently since every subpage of it + * would get _mapcount inc'ed once it is PMD mapped. But, it may be PTE + * mapped in the current process so comparing subpage's _mapcount to + * compound_mapcount to filter out PTE mapped case. */ static inline int PageTransCompoundMap(struct page *page) { - return PageTransCompound(page) && atomic_read(&page->_mapcount) < 0; + struct page *head; + + if (!PageTransCompound(page)) + return 0; + + if (PageAnon(page)) + return atomic_read(&page->_mapcount) < 0; + + head = compound_head(page); + /* File THP is PMD mapped and not PTE mapped */ + return atomic_read(&page->_mapcount) == + atomic_read(compound_mapcount_ptr(head)); } /* -- cgit v1.2.3-70-g09d2 From 86de88cfeb7cf33c7bbd18360e041c7d4e651bba Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 4 Nov 2019 09:37:36 -0800 Subject: drm/atomic: fix self-refresh helpers crtc state dereference drm_self_refresh_helper_update_avg_times() was incorrectly accessing the new incoming state after drm_atomic_helper_commit_hw_done(). But this state might have already been superceeded by an !nonblock atomic update resulting in dereferencing an already free'd crtc_state. TODO I *think* this will more or less do the right thing.. althought I'm not 100% sure if, for example, we enter psr in a nonblock commit, and then leave psr in a !nonblock commit that overtakes the completion of the nonblock commit. Not sure if this sort of scenario can happen in practice. But not crashing is better than crashing, so I guess we should either take this patch or rever the self-refresh helpers until Sean can figure out a better solution. Fixes: d4da4e33341c ("drm: Measure Self Refresh Entry/Exit times to avoid thrashing") Cc: Sean Paul Signed-off-by: Rob Clark [seanpaul fixed up some checkpatch warns] Signed-off-by: Sean Paul Link: https://patchwork.freedesktop.org/patch/msgid/20191104173737.142558-1-robdclark@gmail.com --- drivers/gpu/drm/drm_atomic_helper.c | 15 ++++++++++++++- drivers/gpu/drm/drm_self_refresh_helper.c | 18 +++++++++++------- include/drm/drm_self_refresh_helper.h | 3 ++- 3 files changed, 27 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index 3ef2ac52ce94..2dd2cd87cdbb 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -1581,8 +1581,11 @@ static void commit_tail(struct drm_atomic_state *old_state) { struct drm_device *dev = old_state->dev; const struct drm_mode_config_helper_funcs *funcs; + struct drm_crtc_state *new_crtc_state; + struct drm_crtc *crtc; ktime_t start; s64 commit_time_ms; + unsigned int i, new_self_refresh_mask = 0; funcs = dev->mode_config.helper_private; @@ -1602,6 +1605,15 @@ static void commit_tail(struct drm_atomic_state *old_state) drm_atomic_helper_wait_for_dependencies(old_state); + /* + * We cannot safely access new_crtc_state after + * drm_atomic_helper_commit_hw_done() so figure out which crtc's have + * self-refresh active beforehand: + */ + for_each_new_crtc_in_state(old_state, crtc, new_crtc_state, i) + if (new_crtc_state->self_refresh_active) + new_self_refresh_mask |= BIT(i); + if (funcs && funcs->atomic_commit_tail) funcs->atomic_commit_tail(old_state); else @@ -1610,7 +1622,8 @@ static void commit_tail(struct drm_atomic_state *old_state) commit_time_ms = ktime_ms_delta(ktime_get(), start); if (commit_time_ms > 0) drm_self_refresh_helper_update_avg_times(old_state, - (unsigned long)commit_time_ms); + (unsigned long)commit_time_ms, + new_self_refresh_mask); drm_atomic_helper_commit_cleanup_done(old_state); diff --git a/drivers/gpu/drm/drm_self_refresh_helper.c b/drivers/gpu/drm/drm_self_refresh_helper.c index 68f4765a5896..dd33fec5aabd 100644 --- a/drivers/gpu/drm/drm_self_refresh_helper.c +++ b/drivers/gpu/drm/drm_self_refresh_helper.c @@ -133,29 +133,33 @@ out_drop_locks: * drm_self_refresh_helper_update_avg_times - Updates a crtc's SR time averages * @state: the state which has just been applied to hardware * @commit_time_ms: the amount of time in ms that this commit took to complete + * @new_self_refresh_mask: bitmask of crtc's that have self_refresh_active in + * new state * * Called after &drm_mode_config_funcs.atomic_commit_tail, this function will * update the average entry/exit self refresh times on self refresh transitions. * These averages will be used when calculating how long to delay before * entering self refresh mode after activity. */ -void drm_self_refresh_helper_update_avg_times(struct drm_atomic_state *state, - unsigned int commit_time_ms) +void +drm_self_refresh_helper_update_avg_times(struct drm_atomic_state *state, + unsigned int commit_time_ms, + unsigned int new_self_refresh_mask) { struct drm_crtc *crtc; - struct drm_crtc_state *old_crtc_state, *new_crtc_state; + struct drm_crtc_state *old_crtc_state; int i; - for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, - new_crtc_state, i) { + for_each_old_crtc_in_state(state, crtc, old_crtc_state, i) { + bool new_self_refresh_active = new_self_refresh_mask & BIT(i); struct drm_self_refresh_data *sr_data = crtc->self_refresh_data; struct ewma_psr_time *time; if (old_crtc_state->self_refresh_active == - new_crtc_state->self_refresh_active) + new_self_refresh_active) continue; - if (new_crtc_state->self_refresh_active) + if (new_self_refresh_active) time = &sr_data->entry_avg_ms; else time = &sr_data->exit_avg_ms; diff --git a/include/drm/drm_self_refresh_helper.h b/include/drm/drm_self_refresh_helper.h index 5b79d253fb46..520235c20708 100644 --- a/include/drm/drm_self_refresh_helper.h +++ b/include/drm/drm_self_refresh_helper.h @@ -13,7 +13,8 @@ struct drm_crtc; void drm_self_refresh_helper_alter_state(struct drm_atomic_state *state); void drm_self_refresh_helper_update_avg_times(struct drm_atomic_state *state, - unsigned int commit_time_ms); + unsigned int commit_time_ms, + unsigned int new_self_refresh_mask); int drm_self_refresh_helper_init(struct drm_crtc *crtc); void drm_self_refresh_helper_cleanup(struct drm_crtc *crtc); -- cgit v1.2.3-70-g09d2 From 105401b659b7eb9cb42d6b5b75d5c049ad4b3dca Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 1 Nov 2019 10:37:54 -0500 Subject: drm/shmem: Add docbook comments for drm_gem_shmem_object madvise fields Add missing docbook comments to madvise fields in struct drm_gem_shmem_object which fixes these warnings: include/drm/drm_gem_shmem_helper.h:87: warning: Function parameter or member 'madv' not described in 'drm_gem_shmem_object' include/drm/drm_gem_shmem_helper.h:87: warning: Function parameter or member 'madv_list' not described in 'drm_gem_shmem_object' Fixes: 17acb9f35ed7 ("drm/shmem: Add madvise state and purge helpers") Reported-by: Sean Paul Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: David Airlie Cc: Daniel Vetter Signed-off-by: Rob Herring Reviewed-by: Sean Paul Link: https://patchwork.freedesktop.org/patch/msgid/20191101153754.22803-1-robh@kernel.org --- include/drm/drm_gem_shmem_helper.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/drm/drm_gem_shmem_helper.h b/include/drm/drm_gem_shmem_helper.h index 01f514521687..7865e6b5d36c 100644 --- a/include/drm/drm_gem_shmem_helper.h +++ b/include/drm/drm_gem_shmem_helper.h @@ -44,7 +44,20 @@ struct drm_gem_shmem_object { */ unsigned int pages_use_count; + /** + * @madv: State for madvise + * + * 0 is active/inuse. + * A negative value is the object is purged. + * Positive values are driver specific and not used by the helpers. + */ int madv; + + /** + * @madv_list: List entry for madvise tracking + * + * Typically used by drivers to track purgeable objects + */ struct list_head madv_list; /** -- cgit v1.2.3-70-g09d2 From 79ffe6087e9145d2377385cac48d0d6a6b4225a5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 5 Nov 2019 14:24:35 -0800 Subject: net/tls: add a TX lock TLS TX needs to release and re-acquire the socket lock if send buffer fills up. TLS SW TX path currently depends on only allowing one thread to enter the function by the abuse of sk_write_pending. If another writer is already waiting for memory no new ones are allowed in. This has two problems: - writers don't wake other threads up when they leave the kernel; meaning that this scheme works for single extra thread (second application thread or delayed work) because memory becoming available will send a wake up request, but as Mallesham and Pooja report with larger number of threads it leads to threads being put to sleep indefinitely; - the delayed work does not get _scheduled_ but it may _run_ when other writers are present leading to crashes as writers don't expect state to change under their feet (same records get pushed and freed multiple times); it's hard to reliably bail from the work, however, because the mere presence of a writer does not guarantee that the writer will push pending records before exiting. Ensuring wakeups always happen will make the code basically open code a mutex. Just use a mutex. The TLS HW TX path does not have any locking (not even the sk_write_pending hack), yet it uses a per-socket sg_tx_data array to push records. Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Reported-by: Mallesham Jatharakonda Reported-by: Pooja Trivedi Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls.h | 5 +++++ net/tls/tls_device.c | 6 ++++++ net/tls/tls_main.c | 2 ++ net/tls/tls_sw.c | 21 +++++++-------------- 4 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index c664e6dba0d1..794e297483ea 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -269,6 +270,10 @@ struct tls_context { bool in_tcp_sendpages; bool pending_open_record_frags; + + struct mutex tx_lock; /* protects partially_sent_* fields and + * per-type TX fields + */ unsigned long flags; /* cache cold stuff */ diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 5a3715ddc592..683d00837693 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -523,8 +523,10 @@ last_record: int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { unsigned char record_type = TLS_RECORD_TYPE_DATA; + struct tls_context *tls_ctx = tls_get_ctx(sk); int rc; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); if (unlikely(msg->msg_controllen)) { @@ -538,12 +540,14 @@ int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) out: release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); return rc; } int tls_device_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { + struct tls_context *tls_ctx = tls_get_ctx(sk); struct iov_iter msg_iter; char *kaddr = kmap(page); struct kvec iov; @@ -552,6 +556,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, if (flags & MSG_SENDPAGE_NOTLAST) flags |= MSG_MORE; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); if (flags & MSG_OOB) { @@ -568,6 +573,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page, out: release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); return rc; } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index ac88877dcade..0775ae40fcfb 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -267,6 +267,7 @@ void tls_ctx_free(struct sock *sk, struct tls_context *ctx) memzero_explicit(&ctx->crypto_send, sizeof(ctx->crypto_send)); memzero_explicit(&ctx->crypto_recv, sizeof(ctx->crypto_recv)); + mutex_destroy(&ctx->tx_lock); if (sk) kfree_rcu(ctx, rcu); @@ -612,6 +613,7 @@ static struct tls_context *create_ctx(struct sock *sk) if (!ctx) return NULL; + mutex_init(&ctx->tx_lock); rcu_assign_pointer(icsk->icsk_ulp_data, ctx); ctx->sk_proto = sk->sk_prot; return ctx; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index e155b792df0b..446f23c1f3ce 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -897,15 +897,9 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -ENOTSUPP; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); - /* Wait till there is any pending write on socket */ - if (unlikely(sk->sk_write_pending)) { - ret = wait_on_pending_writer(sk, &timeo); - if (unlikely(ret)) - goto send_end; - } - if (unlikely(msg->msg_controllen)) { ret = tls_proccess_cmsg(sk, msg, &record_type); if (ret) { @@ -1091,6 +1085,7 @@ send_end: ret = sk_stream_error(sk, msg->msg_flags, ret); release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); return copied ? copied : ret; } @@ -1114,13 +1109,6 @@ static int tls_sw_do_sendpage(struct sock *sk, struct page *page, eor = !(flags & (MSG_MORE | MSG_SENDPAGE_NOTLAST)); sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - /* Wait till there is any pending write on socket */ - if (unlikely(sk->sk_write_pending)) { - ret = wait_on_pending_writer(sk, &timeo); - if (unlikely(ret)) - goto sendpage_end; - } - /* Call the sk_stream functions to manage the sndbuf mem. */ while (size > 0) { size_t copy, required_size; @@ -1219,15 +1207,18 @@ sendpage_end: int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags) { + struct tls_context *tls_ctx = tls_get_ctx(sk); int ret; if (flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_SENDPAGE_NOTLAST | MSG_SENDPAGE_NOPOLICY)) return -ENOTSUPP; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); ret = tls_sw_do_sendpage(sk, page, offset, size, flags); release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); return ret; } @@ -2170,9 +2161,11 @@ static void tx_work_handler(struct work_struct *work) if (!test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) return; + mutex_lock(&tls_ctx->tx_lock); lock_sock(sk); tls_tx_records(sk, -1); release_sock(sk); + mutex_unlock(&tls_ctx->tx_lock); } void tls_sw_write_space(struct sock *sk, struct tls_context *ctx) -- cgit v1.2.3-70-g09d2 From 71e67c3bd127cfe7863f54e4b087eba1cc8f9a7a Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 5 Nov 2019 16:57:50 +0100 Subject: net/fq_impl: Switch to kvmalloc() for memory allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The FQ implementation used by mac80211 allocates memory using kmalloc(), which can fail; and Johannes reported that this actually happens in practice. To avoid this, switch the allocation to kvmalloc() instead; this also brings fq_impl in line with all the FQ qdiscs. Fixes: 557fc4a09803 ("fq: add fair queuing framework") Reported-by: Johannes Berg Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20191105155750.547379-1-toke@redhat.com Signed-off-by: Johannes Berg --- include/net/fq_impl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index 107c0d700ed6..38a9a3d1222b 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -313,7 +313,7 @@ static int fq_init(struct fq *fq, int flows_cnt) fq->limit = 8192; fq->memory_limit = 16 << 20; /* 16 MBytes */ - fq->flows = kcalloc(fq->flows_cnt, sizeof(fq->flows[0]), GFP_KERNEL); + fq->flows = kvcalloc(fq->flows_cnt, sizeof(fq->flows[0]), GFP_KERNEL); if (!fq->flows) return -ENOMEM; @@ -331,7 +331,7 @@ static void fq_reset(struct fq *fq, for (i = 0; i < fq->flows_cnt; i++) fq_flow_reset(fq, &fq->flows[i], free_func); - kfree(fq->flows); + kvfree(fq->flows); fq->flows = NULL; } -- cgit v1.2.3-70-g09d2 From 1b53d64435d56902fc234ff2507142d971a09687 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2019 20:08:19 -0800 Subject: net: fix data-race in neigh_event_send() KCSAN reported the following data-race [1] The fix will also prevent the compiler from optimizing out the condition. [1] BUG: KCSAN: data-race in neigh_resolve_output / neigh_resolve_output write to 0xffff8880a41dba78 of 8 bytes by interrupt on cpu 1: neigh_event_send include/net/neighbour.h:443 [inline] neigh_resolve_output+0x78/0x480 net/core/neighbour.c:1474 neigh_output include/net/neighbour.h:511 [inline] ip_finish_output2+0x4af/0xe40 net/ipv4/ip_output.c:228 __ip_finish_output net/ipv4/ip_output.c:308 [inline] __ip_finish_output+0x23a/0x490 net/ipv4/ip_output.c:290 ip_finish_output+0x41/0x160 net/ipv4/ip_output.c:318 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip_output+0xdf/0x210 net/ipv4/ip_output.c:432 dst_output include/net/dst.h:436 [inline] ip_local_out+0x74/0x90 net/ipv4/ip_output.c:125 __ip_queue_xmit+0x3a8/0xa40 net/ipv4/ip_output.c:532 ip_queue_xmit+0x45/0x60 include/net/ip.h:237 __tcp_transmit_skb+0xe81/0x1d60 net/ipv4/tcp_output.c:1169 tcp_transmit_skb net/ipv4/tcp_output.c:1185 [inline] __tcp_retransmit_skb+0x4bd/0x15f0 net/ipv4/tcp_output.c:2976 tcp_retransmit_skb+0x36/0x1a0 net/ipv4/tcp_output.c:2999 tcp_retransmit_timer+0x719/0x16d0 net/ipv4/tcp_timer.c:515 tcp_write_timer_handler+0x42d/0x510 net/ipv4/tcp_timer.c:598 tcp_write_timer+0xd1/0xf0 net/ipv4/tcp_timer.c:618 read to 0xffff8880a41dba78 of 8 bytes by interrupt on cpu 0: neigh_event_send include/net/neighbour.h:442 [inline] neigh_resolve_output+0x57/0x480 net/core/neighbour.c:1474 neigh_output include/net/neighbour.h:511 [inline] ip_finish_output2+0x4af/0xe40 net/ipv4/ip_output.c:228 __ip_finish_output net/ipv4/ip_output.c:308 [inline] __ip_finish_output+0x23a/0x490 net/ipv4/ip_output.c:290 ip_finish_output+0x41/0x160 net/ipv4/ip_output.c:318 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip_output+0xdf/0x210 net/ipv4/ip_output.c:432 dst_output include/net/dst.h:436 [inline] ip_local_out+0x74/0x90 net/ipv4/ip_output.c:125 __ip_queue_xmit+0x3a8/0xa40 net/ipv4/ip_output.c:532 ip_queue_xmit+0x45/0x60 include/net/ip.h:237 __tcp_transmit_skb+0xe81/0x1d60 net/ipv4/tcp_output.c:1169 tcp_transmit_skb net/ipv4/tcp_output.c:1185 [inline] __tcp_retransmit_skb+0x4bd/0x15f0 net/ipv4/tcp_output.c:2976 tcp_retransmit_skb+0x36/0x1a0 net/ipv4/tcp_output.c:2999 tcp_retransmit_timer+0x719/0x16d0 net/ipv4/tcp_timer.c:515 tcp_write_timer_handler+0x42d/0x510 net/ipv4/tcp_timer.c:598 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.4.0-rc3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/neighbour.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 50a67bd6a434..b8452cc0e059 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -439,8 +439,8 @@ static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { unsigned long now = jiffies; - if (neigh->used != now) - neigh->used = now; + if (READ_ONCE(neigh->used) != now) + WRITE_ONCE(neigh->used, now); if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) return __neigh_event_send(neigh, skb); return 0; -- cgit v1.2.3-70-g09d2