54 files changed, 5946 insertions, 1186 deletions
diff --git a/lib/Kconfig b/lib/Kconfig
index d79909dc01ec..5d644f180fe5 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -457,9 +457,6 @@ config NLATTR
 config GENERIC_ATOMIC64
        bool
 
-config ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
-	def_bool y if GENERIC_ATOMIC64
-
 config LRU_CACHE
 	tristate
 
@@ -550,4 +547,10 @@ config STACKDEPOT
 	bool
 	select STACKTRACE
 
+config SBITMAP
+	bool
+
+config PARMAN
+	tristate "parman"
+
 endmenu
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cab7405f48d2..66fb4389f05c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -13,7 +13,22 @@ config PRINTK_TIME
 	  be included, not that the timestamp is recorded.
 
 	  The behavior is also controlled by the kernel command line
-	  parameter printk.time=1. See Documentation/kernel-parameters.txt
+	  parameter printk.time=1. See Documentation/admin-guide/kernel-parameters.rst
+
+config CONSOLE_LOGLEVEL_DEFAULT
+	int "Default console loglevel (1-15)"
+	range 1 15
+	default "7"
+	help
+	  Default loglevel to determine what will be printed on the console.
+
+	  Setting a default here is equivalent to passing in loglevel=<x> in
+	  the kernel bootargs. loglevel=<x> continues to override whatever
+	  value is specified here as well.
+
+	  Note: This does not affect the log level of un-prefixed printk()
+	  usage in the kernel. That is controlled by the MESSAGE_LOGLEVEL_DEFAULT
+	  option.
 
 config MESSAGE_LOGLEVEL_DEFAULT
 	int "Default message log level (1-7)"
@@ -26,6 +41,10 @@ config MESSAGE_LOGLEVEL_DEFAULT
 	  that are auditing their logs closely may want to set it to a lower
 	  priority.
 
+	  Note: This does not affect what message level gets printed on the console
+	  by default. To change that, use loglevel=<x> in the kernel bootargs,
+	  or pick a different CONSOLE_LOGLEVEL_DEFAULT configuration value.
+
 config BOOT_PRINTK_DELAY
 	bool "Delay each boot printk message by N milliseconds"
 	depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY
@@ -145,7 +164,7 @@ config DEBUG_INFO_REDUCED
 
 config DEBUG_INFO_SPLIT
 	bool "Produce split debuginfo in .dwo files"
-	depends on DEBUG_INFO
+	depends on DEBUG_INFO && !FRV
 	help
 	  Generate debug info into separate .dwo files. This significantly
 	  reduces the build directory size for builds with DEBUG_INFO,
@@ -175,8 +194,8 @@ config GDB_SCRIPTS
 	  build directory. If you load vmlinux into gdb, the helper
 	  scripts will be automatically imported by gdb as well, and
 	  additional functions are available to analyze a Linux kernel
-	  instance. See Documentation/gdb-kernel-debugging.txt for further
-	  details.
+	  instance. See Documentation/dev-tools/gdb-kernel-debugging.rst
+	  for further details.
 
 config ENABLE_WARN_DEPRECATED
 	bool "Enable __deprecated logic"
@@ -198,6 +217,7 @@ config FRAME_WARN
 	int "Warn for stack frames larger than (needs gcc 4.4)"
 	range 0 8192
 	default 0 if KASAN
+	default 2048 if GCC_PLUGIN_LATENT_ENTROPY
 	default 1024 if !64BIT
 	default 2048 if 64BIT
 	help
@@ -305,7 +325,7 @@ config DEBUG_SECTION_MISMATCH
 	    a larger kernel).
 	  - Run the section mismatch analysis for each module/built-in.o file.
 	    When we run the section mismatch analysis on vmlinux.o, we
-	    lose valueble information about where the mismatch was
+	    lose valuable information about where the mismatch was
 	    introduced.
 	    Running the analysis for each module/built-in.o file
 	    tells where the mismatch happens much closer to the
@@ -396,6 +416,16 @@ config MAGIC_SYSRQ_DEFAULT_ENABLE
 	  This may be set to 1 or 0 to enable or disable them all, or
 	  to a bitmask as described in Documentation/sysrq.txt.
 
+config MAGIC_SYSRQ_SERIAL
+	bool "Enable magic SysRq key over serial"
+	depends on MAGIC_SYSRQ
+	default y
+	help
+	  Many embedded boards have a disconnected TTL level serial which can
+	  generate some garbage that can lead to spurious false sysrq detects.
+	  This option allows you to decide whether you want to enable the
+	  magic SysRq key.
+
 config DEBUG_KERNEL
 	bool "Kernel debugging"
 	help
@@ -522,7 +552,7 @@ config DEBUG_KMEMLEAK
 	  difference being that the orphan objects are not freed but
 	  only shown in /sys/kernel/debug/kmemleak. Enabling this
 	  feature will introduce an overhead to memory
-	  allocations. See Documentation/kmemleak.txt for more
+	  allocations. See Documentation/dev-tools/kmemleak.rst for more
 	  details.
 
 	  Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances
@@ -602,9 +632,12 @@ config DEBUG_VM_PGFLAGS
 
 	  If unsure, say N.
 
+config ARCH_HAS_DEBUG_VIRTUAL
+	bool
+
 config DEBUG_VIRTUAL
 	bool "Debug VM translations"
-	depends on DEBUG_KERNEL && X86
+	depends on DEBUG_KERNEL && ARCH_HAS_DEBUG_VIRTUAL
 	help
 	  Enable some costly sanity checks in virtual to page code. This can
 	  catch mistakes with virt_to_page() and friends.
@@ -696,6 +729,19 @@ source "lib/Kconfig.kmemcheck"
 
 source "lib/Kconfig.kasan"
 
+config DEBUG_REFCOUNT
+	bool "Verbose refcount checks"
+	help
+	  Say Y here if you want reference counters (refcount_t and kref) to
+	  generate WARNs on dubious usage. Without this refcount_t will still
+	  be a saturating counter and avoid Use-After-Free by turning it into
+	  a resource leak Denial-Of-Service.
+
+	  Use of this option will increase kernel text size but will alert the
+	  admin of potential abuse.
+
+	  If in doubt, say "N".
+
 endmenu # "Memory Debugging"
 
 config ARCH_HAS_KCOV
@@ -719,7 +765,7 @@ config KCOV
 	  different machines and across reboots. If you need stable PC values,
 	  disable RANDOMIZE_BASE.
 
-	  For more details, see Documentation/kcov.txt.
+	  For more details, see Documentation/dev-tools/kcov.rst.
 
 config KCOV_INSTRUMENT_ALL
 	bool "Instrument all code by default"
@@ -960,20 +1006,6 @@ config DEBUG_TIMEKEEPING
 
 	  If unsure, say N.
 
-config TIMER_STATS
-	bool "Collect kernel timers statistics"
-	depends on DEBUG_KERNEL && PROC_FS
-	help
-	  If you say Y here, additional code will be inserted into the
-	  timer routines to collect statistics about kernel timers being
-	  reprogrammed. The statistics can be read from /proc/timer_stats.
-	  The statistics collection is started by writing 1 to /proc/timer_stats,
-	  writing 0 stops it. This feature is useful to collect information
-	  about timer usage patterns in kernel and userspace. This feature
-	  is lightweight if enabled in the kernel config but not activated
-	  (it defaults to deactivated on bootup and will only be activated
-	  if some application like powertop activates it explicitly).
-
 config DEBUG_PREEMPT
 	bool "Debug preemptible kernel"
 	depends on DEBUG_KERNEL && PREEMPT && TRACE_IRQFLAGS_SUPPORT
@@ -1084,6 +1116,9 @@ config PROVE_LOCKING
 
 	 For more details, see Documentation/locking/lockdep-design.txt.
 
+config PROVE_LOCKING_SMALL
+	bool
+
 config LOCKDEP
 	bool
 	depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
@@ -1157,6 +1192,18 @@ config LOCK_TORTURE_TEST
 	  Say M if you want these torture tests to build as a module.
 	  Say N if you are unsure.
 
+config WW_MUTEX_SELFTEST
+	tristate "Wait/wound mutex selftests"
+	help
+	  This option provides a kernel module that runs tests on the
+	  on the struct ww_mutex locking API.
+
+	  It is recommended to enable DEBUG_WW_MUTEX_SLOWPATH in conjunction
+	  with this test harness.
+
+	  Say M if you want these self tests to build as a module.
+	  Say N if you are unsure.
+
 endmenu # lock debugging
 
 config TRACE_IRQFLAGS
@@ -1214,7 +1261,7 @@ config DEBUG_BUGVERBOSE
 
 config DEBUG_LIST
 	bool "Debug linked list manipulation"
-	depends on DEBUG_KERNEL
+	depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION
 	help
 	  Enable this to turn on extended checks in the linked-list
 	  walking routines.
@@ -1427,10 +1474,12 @@ config RCU_CPU_STALL_TIMEOUT
 config RCU_TRACE
 	bool "Enable tracing for RCU"
 	depends on DEBUG_KERNEL
+	default y if TREE_RCU
 	select TRACE_CLOCK
 	help
 	  This option provides tracing in RCU which presents stats
-	  in debugfs for debugging RCU implementation.
+	  in debugfs for debugging RCU implementation.  It also enables
+	  additional tracepoints for ftrace-style event tracing.
 
 	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
@@ -1514,30 +1563,6 @@ config NOTIFIER_ERROR_INJECTION
 
 	  Say N if unsure.
 
-config CPU_NOTIFIER_ERROR_INJECT
-	tristate "CPU notifier error injection module"
-	depends on HOTPLUG_CPU && NOTIFIER_ERROR_INJECTION
-	help
-	  This option provides a kernel module that can be used to test
-	  the error handling of the cpu notifiers by injecting artificial
-	  errors to CPU notifier chain callbacks.  It is controlled through
-	  debugfs interface under /sys/kernel/debug/notifier-error-inject/cpu
-
-	  If the notifier call chain should be failed with some events
-	  notified, write the error code to "actions/<notifier event>/error".
-
-	  Example: Inject CPU offline error (-1 == -EPERM)
-
-	  # cd /sys/kernel/debug/notifier-error-inject/cpu
-	  # echo -1 > actions/CPU_DOWN_PREPARE/error
-	  # echo 0 > /sys/devices/system/cpu/cpu1/online
-	  bash: echo: write error: Operation not permitted
-
-	  To compile this code as a module, choose M here: the module will
-	  be called cpu-notifier-error-inject.
-
-	  If unsure, say N.
-
 config PM_NOTIFIER_ERROR_INJECT
 	tristate "PM notifier error injection module"
 	depends on PM && NOTIFIER_ERROR_INJECTION
@@ -1819,13 +1844,23 @@ config TEST_HASH
 	tristate "Perform selftest on hash functions"
 	default n
 	help
-	  Enable this option to test the kernel's integer (<linux/hash,h>)
-	  and string (<linux/stringhash.h>) hash functions on boot
-	  (or module load).
+	  Enable this option to test the kernel's integer (<linux/hash.h>),
+	  string (<linux/stringhash.h>), and siphash (<linux/siphash.h>)
+	  hash functions on boot (or module load).
 
 	  This is intended to help people writing architecture-specific
 	  optimized versions.  If unsure, say N.
 
+config TEST_PARMAN
+	tristate "Perform selftest on priority array manager"
+	default n
+	depends on PARMAN
+	help
+	  Enable this option to test priority array manager on boot
+	  (or module load).
+
+	  If unsure, say N.
+
 endmenu # runtime tests
 
 config PROVIDE_OHCI1394_DMA_INIT
@@ -1857,15 +1892,6 @@ config PROVIDE_OHCI1394_DMA_INIT
 
 	  See Documentation/debugging-via-ohci1394.txt for more information.
 
-config BUILD_DOCSRC
-	bool "Build targets in Documentation/ tree"
-	depends on HEADERS_CHECK
-	help
-	  This option attempts to build objects from the source files in the
-	  kernel Documentation/ tree.
-
-	  Say N if you are unsure.
-
 config DMA_API_DEBUG
 	bool "Enable debugging of DMA-API usage"
 	depends on HAVE_DMA_API_DEBUG
@@ -1969,6 +1995,16 @@ config TEST_STATIC_KEYS
 
 	  If unsure, say N.
 
+config BUG_ON_DATA_CORRUPTION
+	bool "Trigger a BUG when data corruption is detected"
+	select DEBUG_LIST
+	help
+	  Select this option if the kernel should BUG when it encounters
+	  data corruption in kernel memory structures when they get checked
+	  for validity.
+
+	  If unsure, say N.
+
 source "samples/Kconfig"
 
 source "lib/Kconfig.kgdb"
@@ -1980,7 +2016,7 @@ config ARCH_HAS_DEVMEM_IS_ALLOWED
 
 config STRICT_DEVMEM
 	bool "Filter access to /dev/mem"
-	depends on MMU
+	depends on MMU && DEVMEM
 	depends on ARCH_HAS_DEVMEM_IS_ALLOWED
 	default y if TILE || PPC
 	---help---
diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan
index bc6e651df68c..a669c193b878 100644
--- a/lib/Kconfig.ubsan
+++ b/lib/Kconfig.ubsan
@@ -10,7 +10,8 @@ config UBSAN
 	  This option enables undefined behaviour sanity checker
 	  Compile-time instrumentation is used to detect various undefined
 	  behaviours in runtime. Various types of checks may be enabled
-	  via boot parameter ubsan_handle (see: Documentation/ubsan.txt).
+	  via boot parameter ubsan_handle
+	  (see: Documentation/dev-tools/ubsan.rst).
 
 config UBSAN_SANITIZE_ALL
 	bool "Enable instrumentation for the entire kernel"
diff --git a/lib/Makefile b/lib/Makefile
index 5dc77a8ec297..6b768b58a38d 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -22,7 +22,8 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
 	 flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-	 earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o
+	 earlycpio.o seq_buf.o siphash.o \
+	 nmi_backtrace.o nodemask.o win_minmax.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
@@ -31,7 +32,7 @@ lib-$(CONFIG_HAS_DMA) += dma-noop.o
 lib-y	+= kobject.o klist.o
 obj-y	+= lockref.o
 
-obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
+obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
 	 bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
@@ -44,7 +45,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o
 obj-y += kstrtox.o
 obj-$(CONFIG_TEST_BPF) += test_bpf.o
 obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o
-obj-$(CONFIG_TEST_HASH) += test_hash.o
+obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o
 obj-$(CONFIG_TEST_KASAN) += test_kasan.o
 obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
 obj-$(CONFIG_TEST_LKM) += test_module.o
@@ -55,6 +56,7 @@ obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o
 obj-$(CONFIG_TEST_PRINTF) += test_printf.o
 obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o
 obj-$(CONFIG_TEST_UUID) += test_uuid.o
+obj-$(CONFIG_TEST_PARMAN) += test_parman.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
@@ -128,7 +130,6 @@ obj-$(CONFIG_SWIOTLB) += swiotlb.o
 obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o iommu-common.o
 obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
 obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o
-obj-$(CONFIG_CPU_NOTIFIER_ERROR_INJECT) += cpu-notifier-error-inject.o
 obj-$(CONFIG_PM_NOTIFIER_ERROR_INJECT) += pm-notifier-error-inject.o
 obj-$(CONFIG_NETDEV_NOTIFIER_ERROR_INJECT) += netdev-notifier-error-inject.o
 obj-$(CONFIG_MEMORY_NOTIFIER_ERROR_INJECT) += memory-notifier-error-inject.o
@@ -180,6 +181,7 @@ obj-$(CONFIG_IRQ_POLL) += irq_poll.o
 
 obj-$(CONFIG_STACKDEPOT) += stackdepot.o
 KASAN_SANITIZE_stackdepot.o := n
+KCOV_INSTRUMENT_stackdepot.o := n
 
 libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \
 	       fdt_empty_tree.o
@@ -227,3 +229,7 @@ obj-$(CONFIG_UCS2_STRING) += ucs2_string.o
 obj-$(CONFIG_UBSAN) += ubsan.o
 
 UBSAN_SANITIZE_ubsan.o := n
+
+obj-$(CONFIG_SBITMAP) += sbitmap.o
+
+obj-$(CONFIG_PARMAN) += parman.o
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index dbb369145dda..46042901130f 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -213,7 +213,6 @@ static __init void test_atomic64(void)
 	r += one;
 	BUG_ON(v.counter != r);
 
-#ifdef CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	INIT(onestwos);
 	BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1));
 	r -= one;
@@ -226,9 +225,6 @@ static __init void test_atomic64(void)
 	INIT(-one);
 	BUG_ON(atomic64_dec_if_positive(&v) != (-one - one));
 	BUG_ON(v.counter != r);
-#else
-#warning Please implement atomic64_dec_if_positive for your architecture and select the above Kconfig symbol
-#endif
 
 	INIT(onestwos);
 	BUG_ON(!atomic64_inc_not_zero(&v));
diff --git a/lib/bitmap.c b/lib/bitmap.c
index eca88087fa8a..0b66f0e5eb6b 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -496,6 +496,11 @@ EXPORT_SYMBOL(bitmap_print_to_pagebuf);
  * ranges.  Consecutively set bits are shown as two hyphen-separated
  * decimal numbers, the smallest and largest bit numbers set in
  * the range.
+ * Optionally each range can be postfixed to denote that only parts of it
+ * should be set. The range will divided to groups of specific size.
+ * From each group will be used only defined amount of bits.
+ * Syntax: range:used_size/group_size
+ * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769
  *
  * Returns 0 on success, -errno on invalid input strings.
  * Error values:
@@ -507,16 +512,20 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
 		int is_user, unsigned long *maskp,
 		int nmaskbits)
 {
-	unsigned a, b;
+	unsigned int a, b, old_a, old_b;
+	unsigned int group_size, used_size;
 	int c, old_c, totaldigits, ndigits;
 	const char __user __force *ubuf = (const char __user __force *)buf;
-	int at_start, in_range;
+	int at_start, in_range, in_partial_range;
 
 	totaldigits = c = 0;
+	old_a = old_b = 0;
+	group_size = used_size = 0;
 	bitmap_zero(maskp, nmaskbits);
 	do {
 		at_start = 1;
 		in_range = 0;
+		in_partial_range = 0;
 		a = b = 0;
 		ndigits = totaldigits;
 
@@ -547,6 +556,24 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
 			if ((totaldigits != ndigits) && isspace(old_c))
 				return -EINVAL;
 
+			if (c == '/') {
+				used_size = a;
+				at_start = 1;
+				in_range = 0;
+				a = b = 0;
+				continue;
+			}
+
+			if (c == ':') {
+				old_a = a;
+				old_b = b;
+				at_start = 1;
+				in_range = 0;
+				in_partial_range = 1;
+				a = b = 0;
+				continue;
+			}
+
 			if (c == '-') {
 				if (at_start || in_range)
 					return -EINVAL;
@@ -567,15 +594,30 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
 		}
 		if (ndigits == totaldigits)
 			continue;
+		if (in_partial_range) {
+			group_size = a;
+			a = old_a;
+			b = old_b;
+			old_a = old_b = 0;
+		}
 		/* if no digit is after '-', it's wrong*/
 		if (at_start && in_range)
 			return -EINVAL;
-		if (!(a <= b))
+		if (!(a <= b) || !(used_size <= group_size))
 			return -EINVAL;
 		if (b >= nmaskbits)
 			return -ERANGE;
 		while (a <= b) {
-			set_bit(a, maskp);
+			if (in_partial_range) {
+				static int pos_in_group = 1;
+
+				if (pos_in_group <= used_size)
+					set_bit(a, maskp);
+
+				if (a == b || ++pos_in_group > group_size)
+					pos_in_group = 1;
+			} else
+				set_bit(a, maskp);
 			a++;
 		}
 	} while (buflen && c == ',');
diff --git a/lib/cpu-notifier-error-inject.c b/lib/cpu-notifier-error-inject.c
deleted file mode 100644
index 0e2c9a1e958a..000000000000
--- a/lib/cpu-notifier-error-inject.c
+++ /dev/null
@@ -1,84 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/cpu.h>
-
-#include "notifier-error-inject.h"
-
-static int priority;
-module_param(priority, int, 0);
-MODULE_PARM_DESC(priority, "specify cpu notifier priority");
-
-#define UP_PREPARE 0
-#define UP_PREPARE_FROZEN 0
-#define DOWN_PREPARE 0
-#define DOWN_PREPARE_FROZEN 0
-
-static struct notifier_err_inject cpu_notifier_err_inject = {
-	.actions = {
-		{ NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE) },
-		{ NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE_FROZEN) },
-		{ NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE) },
-		{ NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE_FROZEN) },
-		{}
-	}
-};
-
-static int notf_err_handle(struct notifier_err_inject_action *action)
-{
-	int ret;
-
-	ret = action->error;
-	if (ret)
-		pr_info("Injecting error (%d) to %s\n", ret, action->name);
-	return ret;
-}
-
-static int notf_err_inj_up_prepare(unsigned int cpu)
-{
-	if (!cpuhp_tasks_frozen)
-		return notf_err_handle(&cpu_notifier_err_inject.actions[0]);
-	else
-		return notf_err_handle(&cpu_notifier_err_inject.actions[1]);
-}
-
-static int notf_err_inj_dead(unsigned int cpu)
-{
-	if (!cpuhp_tasks_frozen)
-		return notf_err_handle(&cpu_notifier_err_inject.actions[2]);
-	else
-		return notf_err_handle(&cpu_notifier_err_inject.actions[3]);
-}
-
-static struct dentry *dir;
-
-static int err_inject_init(void)
-{
-	int err;
-
-	dir = notifier_err_inject_init("cpu", notifier_err_inject_dir,
-					&cpu_notifier_err_inject, priority);
-	if (IS_ERR(dir))
-		return PTR_ERR(dir);
-
-	err = cpuhp_setup_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE,
-					"cpu-err-notif:prepare",
-					notf_err_inj_up_prepare,
-					notf_err_inj_dead);
-	if (err)
-		debugfs_remove_recursive(dir);
-
-	return err;
-}
-
-static void err_inject_exit(void)
-{
-	cpuhp_remove_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE);
-	debugfs_remove_recursive(dir);
-}
-
-module_init(err_inject_init);
-module_exit(err_inject_exit);
-
-MODULE_DESCRIPTION("CPU notifier error injection module");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Akinobu Mita <akinobu.mita@gmail.com>");
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index a8e12601eb37..8c28cbd7e104 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -52,9 +52,18 @@ static int			debug_objects_fixups __read_mostly;
 static int			debug_objects_warnings __read_mostly;
 static int			debug_objects_enabled __read_mostly
 				= CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT;
-
+static int			debug_objects_pool_size __read_mostly
+				= ODEBUG_POOL_SIZE;
+static int			debug_objects_pool_min_level __read_mostly
+				= ODEBUG_POOL_MIN_LEVEL;
 static struct debug_obj_descr	*descr_test  __read_mostly;
 
+/*
+ * Track numbers of kmem_cache_alloc()/free() calls done.
+ */
+static int			debug_objects_allocated;
+static int			debug_objects_freed;
+
 static void free_obj_work(struct work_struct *work);
 static DECLARE_WORK(debug_obj_work, free_obj_work);
 
@@ -88,13 +97,13 @@ static void fill_pool(void)
 	struct debug_obj *new;
 	unsigned long flags;
 
-	if (likely(obj_pool_free >= ODEBUG_POOL_MIN_LEVEL))
+	if (likely(obj_pool_free >= debug_objects_pool_min_level))
 		return;
 
 	if (unlikely(!obj_cache))
 		return;
 
-	while (obj_pool_free < ODEBUG_POOL_MIN_LEVEL) {
+	while (obj_pool_free < debug_objects_pool_min_level) {
 
 		new = kmem_cache_zalloc(obj_cache, gfp);
 		if (!new)
@@ -102,6 +111,7 @@ static void fill_pool(void)
 
 		raw_spin_lock_irqsave(&pool_lock, flags);
 		hlist_add_head(&new->node, &obj_pool);
+		debug_objects_allocated++;
 		obj_pool_free++;
 		raw_spin_unlock_irqrestore(&pool_lock, flags);
 	}
@@ -162,24 +172,39 @@ alloc_object(void *addr, struct debug_bucket *b, struct debug_obj_descr *descr)
 
 /*
  * workqueue function to free objects.
+ *
+ * To reduce contention on the global pool_lock, the actual freeing of
+ * debug objects will be delayed if the pool_lock is busy. We also free
+ * the objects in a batch of 4 for each lock/unlock cycle.
  */
+#define ODEBUG_FREE_BATCH	4
+
 static void free_obj_work(struct work_struct *work)
 {
-	struct debug_obj *obj;
+	struct debug_obj *objs[ODEBUG_FREE_BATCH];
 	unsigned long flags;
+	int i;
 
-	raw_spin_lock_irqsave(&pool_lock, flags);
-	while (obj_pool_free > ODEBUG_POOL_SIZE) {
-		obj = hlist_entry(obj_pool.first, typeof(*obj), node);
-		hlist_del(&obj->node);
-		obj_pool_free--;
+	if (!raw_spin_trylock_irqsave(&pool_lock, flags))
+		return;
+	while (obj_pool_free >= debug_objects_pool_size + ODEBUG_FREE_BATCH) {
+		for (i = 0; i < ODEBUG_FREE_BATCH; i++) {
+			objs[i] = hlist_entry(obj_pool.first,
+					      typeof(*objs[0]), node);
+			hlist_del(&objs[i]->node);
+		}
+
+		obj_pool_free -= ODEBUG_FREE_BATCH;
+		debug_objects_freed += ODEBUG_FREE_BATCH;
 		/*
 		 * We release pool_lock across kmem_cache_free() to
 		 * avoid contention on pool_lock.
 		 */
 		raw_spin_unlock_irqrestore(&pool_lock, flags);
-		kmem_cache_free(obj_cache, obj);
-		raw_spin_lock_irqsave(&pool_lock, flags);
+		for (i = 0; i < ODEBUG_FREE_BATCH; i++)
+			kmem_cache_free(obj_cache, objs[i]);
+		if (!raw_spin_trylock_irqsave(&pool_lock, flags))
+			return;
 	}
 	raw_spin_unlock_irqrestore(&pool_lock, flags);
 }
@@ -198,8 +223,8 @@ static void free_object(struct debug_obj *obj)
 	 * schedule work when the pool is filled and the cache is
 	 * initialized:
 	 */
-	if (obj_pool_free > ODEBUG_POOL_SIZE && obj_cache)
-		sched = keventd_up();
+	if (obj_pool_free > debug_objects_pool_size && obj_cache)
+		sched = 1;
 	hlist_add_head(&obj->node, &obj_pool);
 	obj_pool_free++;
 	obj_pool_used--;
@@ -362,6 +387,7 @@ void debug_object_init(void *addr, struct debug_obj_descr *descr)
 
 	__debug_object_init(addr, descr, 0);
 }
+EXPORT_SYMBOL_GPL(debug_object_init);
 
 /**
  * debug_object_init_on_stack - debug checks when an object on stack is
@@ -376,6 +402,7 @@ void debug_object_init_on_stack(void *addr, struct debug_obj_descr *descr)
 
 	__debug_object_init(addr, descr, 1);
 }
+EXPORT_SYMBOL_GPL(debug_object_init_on_stack);
 
 /**
  * debug_object_activate - debug checks when an object is activated
@@ -449,6 +476,7 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(debug_object_activate);
 
 /**
  * debug_object_deactivate - debug checks when an object is deactivated
@@ -496,6 +524,7 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr)
 
 	raw_spin_unlock_irqrestore(&db->lock, flags);
 }
+EXPORT_SYMBOL_GPL(debug_object_deactivate);
 
 /**
  * debug_object_destroy - debug checks when an object is destroyed
@@ -542,6 +571,7 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr)
 out_unlock:
 	raw_spin_unlock_irqrestore(&db->lock, flags);
 }
+EXPORT_SYMBOL_GPL(debug_object_destroy);
 
 /**
  * debug_object_free - debug checks when an object is freed
@@ -582,6 +612,7 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr)
 out_unlock:
 	raw_spin_unlock_irqrestore(&db->lock, flags);
 }
+EXPORT_SYMBOL_GPL(debug_object_free);
 
 /**
  * debug_object_assert_init - debug checks when object should be init-ed
@@ -626,6 +657,7 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
 
 	raw_spin_unlock_irqrestore(&db->lock, flags);
 }
+EXPORT_SYMBOL_GPL(debug_object_assert_init);
 
 /**
  * debug_object_active_state - debug checks object usage state machine
@@ -673,6 +705,7 @@ debug_object_active_state(void *addr, struct debug_obj_descr *descr,
 
 	raw_spin_unlock_irqrestore(&db->lock, flags);
 }
+EXPORT_SYMBOL_GPL(debug_object_active_state);
 
 #ifdef CONFIG_DEBUG_OBJECTS_FREE
 static void __debug_check_no_obj_freed(const void *address, unsigned long size)
@@ -750,6 +783,8 @@ static int debug_stats_show(struct seq_file *m, void *v)
 	seq_printf(m, "pool_min_free :%d\n", obj_pool_min_free);
 	seq_printf(m, "pool_used     :%d\n", obj_pool_used);
 	seq_printf(m, "pool_max_used :%d\n", obj_pool_max_used);
+	seq_printf(m, "objs_allocated:%d\n", debug_objects_allocated);
+	seq_printf(m, "objs_freed    :%d\n", debug_objects_freed);
 	return 0;
 }
 
@@ -1108,4 +1143,11 @@ void __init debug_objects_mem_init(void)
 		pr_warn("out of memory.\n");
 	} else
 		debug_objects_selftest();
+
+	/*
+	 * Increase the thresholds for allocating and freeing objects
+	 * according to the number of possible CPUs available in the system.
+	 */
+	debug_objects_pool_size += num_possible_cpus() * 32;
+	debug_objects_pool_min_level += num_possible_cpus() * 4;
 }
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 06f02f6aecd2..60c57ec936db 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -44,6 +44,7 @@ enum {
 	dma_debug_page,
 	dma_debug_sg,
 	dma_debug_coherent,
+	dma_debug_resource,
 };
 
 enum map_err_types {
@@ -151,8 +152,9 @@ static const char *const maperr2str[] = {
 	[MAP_ERR_CHECKED] = "dma map error checked",
 };
 
-static const char *type2name[4] = { "single", "page",
-				    "scather-gather", "coherent" };
+static const char *type2name[5] = { "single", "page",
+				    "scather-gather", "coherent",
+				    "resource" };
 
 static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
 				   "DMA_FROM_DEVICE", "DMA_NONE" };
@@ -400,6 +402,9 @@ static void hash_bucket_del(struct dma_debug_entry *entry)
 
 static unsigned long long phys_addr(struct dma_debug_entry *entry)
 {
+	if (entry->type == dma_debug_resource)
+		return __pfn_to_phys(entry->pfn) + entry->offset;
+
 	return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset;
 }
 
@@ -1150,6 +1155,11 @@ static void check_unmap(struct dma_debug_entry *ref)
 			   dir2name[ref->direction]);
 	}
 
+	/*
+	 * Drivers should use dma_mapping_error() to check the returned
+	 * addresses of dma_map_single() and dma_map_page().
+	 * If not, print this warning message. See Documentation/DMA-API.txt.
+	 */
 	if (entry->map_err_type == MAP_ERR_NOT_CHECKED) {
 		err_printk(ref->dev, entry,
 			   "DMA-API: device driver failed to check map error"
@@ -1519,6 +1529,49 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
 }
 EXPORT_SYMBOL(debug_dma_free_coherent);
 
+void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
+			    int direction, dma_addr_t dma_addr)
+{
+	struct dma_debug_entry *entry;
+
+	if (unlikely(dma_debug_disabled()))
+		return;
+
+	entry = dma_entry_alloc();
+	if (!entry)
+		return;
+
+	entry->type		= dma_debug_resource;
+	entry->dev		= dev;
+	entry->pfn		= PHYS_PFN(addr);
+	entry->offset		= offset_in_page(addr);
+	entry->size		= size;
+	entry->dev_addr		= dma_addr;
+	entry->direction	= direction;
+	entry->map_err_type	= MAP_ERR_NOT_CHECKED;
+
+	add_dma_entry(entry);
+}
+EXPORT_SYMBOL(debug_dma_map_resource);
+
+void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
+			      size_t size, int direction)
+{
+	struct dma_debug_entry ref = {
+		.type           = dma_debug_resource,
+		.dev            = dev,
+		.dev_addr       = dma_addr,
+		.size           = size,
+		.direction      = direction,
+	};
+
+	if (unlikely(dma_debug_disabled()))
+		return;
+
+	check_unmap(&ref);
+}
+EXPORT_SYMBOL(debug_dma_unmap_resource);
+
 void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
 				   size_t size, int direction)
 {
diff --git a/lib/extable.c b/lib/extable.c
index 0be02ad561e9..62968daa66a9 100644
--- a/lib/extable.c
+++ b/lib/extable.c
@@ -12,7 +12,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sort.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #ifndef ARCH_HAS_RELATIVE_EXTABLE
 #define ex_to_insn(x)	((x)->insn)
diff --git a/lib/genalloc.c b/lib/genalloc.c
index 0a1139644d32..144fe6b1a03e 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -292,7 +292,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size,
 	struct gen_pool_chunk *chunk;
 	unsigned long addr = 0;
 	int order = pool->min_alloc_order;
-	int nbits, start_bit = 0, end_bit, remain;
+	int nbits, start_bit, end_bit, remain;
 
 #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
 	BUG_ON(in_nmi());
@@ -307,6 +307,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size,
 		if (size > atomic_read(&chunk->avail))
 			continue;
 
+		start_bit = 0;
 		end_bit = chunk_size(chunk) >> order;
 retry:
 		start_bit = algo(chunk->bits, end_bit, start_bit,
diff --git a/lib/halfmd4.c b/lib/halfmd4.c
deleted file mode 100644
index 137e861d9690..000000000000
--- a/lib/halfmd4.c
+++ /dev/null
@@ -1,67 +0,0 @@
-#include <linux/compiler.h>
-#include <linux/export.h>
-#include <linux/cryptohash.h>
-#include <linux/bitops.h>
-
-/* F, G and H are basic MD4 functions: selection, majority, parity */
-#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
-#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
-#define H(x, y, z) ((x) ^ (y) ^ (z))
-
-/*
- * The generic round function.  The application is so specific that
- * we don't bother protecting all the arguments with parens, as is generally
- * good macro practice, in favor of extra legibility.
- * Rotation is separate from addition to prevent recomputation
- */
-#define ROUND(f, a, b, c, d, x, s)	\
-	(a += f(b, c, d) + x, a = rol32(a, s))
-#define K1 0
-#define K2 013240474631UL
-#define K3 015666365641UL
-
-/*
- * Basic cut-down MD4 transform.  Returns only 32 bits of result.
- */
-__u32 half_md4_transform(__u32 buf[4], __u32 const in[8])
-{
-	__u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
-
-	/* Round 1 */
-	ROUND(F, a, b, c, d, in[0] + K1,  3);
-	ROUND(F, d, a, b, c, in[1] + K1,  7);
-	ROUND(F, c, d, a, b, in[2] + K1, 11);
-	ROUND(F, b, c, d, a, in[3] + K1, 19);
-	ROUND(F, a, b, c, d, in[4] + K1,  3);
-	ROUND(F, d, a, b, c, in[5] + K1,  7);
-	ROUND(F, c, d, a, b, in[6] + K1, 11);
-	ROUND(F, b, c, d, a, in[7] + K1, 19);
-
-	/* Round 2 */
-	ROUND(G, a, b, c, d, in[1] + K2,  3);
-	ROUND(G, d, a, b, c, in[3] + K2,  5);
-	ROUND(G, c, d, a, b, in[5] + K2,  9);
-	ROUND(G, b, c, d, a, in[7] + K2, 13);
-	ROUND(G, a, b, c, d, in[0] + K2,  3);
-	ROUND(G, d, a, b, c, in[2] + K2,  5);
-	ROUND(G, c, d, a, b, in[4] + K2,  9);
-	ROUND(G, b, c, d, a, in[6] + K2, 13);
-
-	/* Round 3 */
-	ROUND(H, a, b, c, d, in[3] + K3,  3);
-	ROUND(H, d, a, b, c, in[7] + K3,  9);
-	ROUND(H, c, d, a, b, in[2] + K3, 11);
-	ROUND(H, b, c, d, a, in[6] + K3, 15);
-	ROUND(H, a, b, c, d, in[1] + K3,  3);
-	ROUND(H, d, a, b, c, in[5] + K3,  9);
-	ROUND(H, c, d, a, b, in[0] + K3, 11);
-	ROUND(H, b, c, d, a, in[4] + K3, 15);
-
-	buf[0] += a;
-	buf[1] += b;
-	buf[2] += c;
-	buf[3] += d;
-
-	return buf[1]; /* "most hashed" word */
-}
-EXPORT_SYMBOL(half_md4_transform);
diff --git a/lib/idr.c b/lib/idr.c
index 6098336df267..52d2979a05e8 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -927,6 +927,9 @@ EXPORT_SYMBOL(ida_pre_get);
  * and go back to the ida_pre_get() call.  If the ida is full, it will
  * return %-ENOSPC.
  *
+ * Note that callers must ensure that concurrent access to @ida is not possible.
+ * See ida_simple_get() for a varaint which takes care of locking.
+ *
  * @p_id returns a value in the range @starting_id ... %0x7fffffff.
  */
 int ida_get_new_above(struct ida *ida, int starting_id, int *p_id)
@@ -1073,6 +1076,9 @@ EXPORT_SYMBOL(ida_destroy);
  * Allocates an id in the range start <= id < end, or returns -ENOSPC.
  * On memory allocation failure, returns -ENOMEM.
  *
+ * Compared to ida_get_new_above() this function does its own locking, and
+ * should be used unless there are special requirements.
+ *
  * Use ida_simple_remove() to get rid of an id.
  */
 int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
@@ -1119,6 +1125,11 @@ EXPORT_SYMBOL(ida_simple_get);
  * ida_simple_remove - remove an allocated id.
  * @ida: the (initialized) ida.
  * @id: the id returned by ida_simple_get.
+ *
+ * Use to release an id allocated with ida_simple_get().
+ *
+ * Compared to ida_remove() this function does its own locking, and should be
+ * used unless there are special requirements.
  */
 void ida_simple_remove(struct ida *ida, unsigned int id)
 {
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 86c8911b0e3a..a3e14ce92a56 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -144,4 +144,3 @@ int ioremap_page_range(unsigned long addr,
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(ioremap_page_range);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 7e3138cfc8c9..e68604ae3ced 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1,10 +1,14 @@
 #include <linux/export.h>
+#include <linux/bvec.h>
 #include <linux/uio.h>
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <linux/splice.h>
 #include <net/checksum.h>
 
+#define PIPE_PARANOIA /* for now */
+
 #define iterate_iovec(i, n, __v, __p, skip, STEP) {	\
 	size_t left;					\
 	size_t wanted = n;				\
@@ -69,19 +73,21 @@
 }
 
 #define iterate_all_kinds(i, n, v, I, B, K) {			\
-	size_t skip = i->iov_offset;				\
-	if (unlikely(i->type & ITER_BVEC)) {			\
-		struct bio_vec v;				\
-		struct bvec_iter __bi;				\
-		iterate_bvec(i, n, v, __bi, skip, (B))		\
-	} else if (unlikely(i->type & ITER_KVEC)) {		\
-		const struct kvec *kvec;			\
-		struct kvec v;					\
-		iterate_kvec(i, n, v, kvec, skip, (K))		\
-	} else {						\
-		const struct iovec *iov;			\
-		struct iovec v;					\
-		iterate_iovec(i, n, v, iov, skip, (I))		\
+	if (likely(n)) {					\
+		size_t skip = i->iov_offset;			\
+		if (unlikely(i->type & ITER_BVEC)) {		\
+			struct bio_vec v;			\
+			struct bvec_iter __bi;			\
+			iterate_bvec(i, n, v, __bi, skip, (B))	\
+		} else if (unlikely(i->type & ITER_KVEC)) {	\
+			const struct kvec *kvec;		\
+			struct kvec v;				\
+			iterate_kvec(i, n, v, kvec, skip, (K))	\
+		} else {					\
+			const struct iovec *iov;		\
+			struct iovec v;				\
+			iterate_iovec(i, n, v, iov, skip, (I))	\
+		}						\
 	}							\
 }
 
@@ -290,6 +296,93 @@ done:
 	return wanted - bytes;
 }
 
+#ifdef PIPE_PARANOIA
+static bool sanity(const struct iov_iter *i)
+{
+	struct pipe_inode_info *pipe = i->pipe;
+	int idx = i->idx;
+	int next = pipe->curbuf + pipe->nrbufs;
+	if (i->iov_offset) {
+		struct pipe_buffer *p;
+		if (unlikely(!pipe->nrbufs))
+			goto Bad;	// pipe must be non-empty
+		if (unlikely(idx != ((next - 1) & (pipe->buffers - 1))))
+			goto Bad;	// must be at the last buffer...
+
+		p = &pipe->bufs[idx];
+		if (unlikely(p->offset + p->len != i->iov_offset))
+			goto Bad;	// ... at the end of segment
+	} else {
+		if (idx != (next & (pipe->buffers - 1)))
+			goto Bad;	// must be right after the last buffer
+	}
+	return true;
+Bad:
+	printk(KERN_ERR "idx = %d, offset = %zd\n", i->idx, i->iov_offset);
+	printk(KERN_ERR "curbuf = %d, nrbufs = %d, buffers = %d\n",
+			pipe->curbuf, pipe->nrbufs, pipe->buffers);
+	for (idx = 0; idx < pipe->buffers; idx++)
+		printk(KERN_ERR "[%p %p %d %d]\n",
+			pipe->bufs[idx].ops,
+			pipe->bufs[idx].page,
+			pipe->bufs[idx].offset,
+			pipe->bufs[idx].len);
+	WARN_ON(1);
+	return false;
+}
+#else
+#define sanity(i) true
+#endif
+
+static inline int next_idx(int idx, struct pipe_inode_info *pipe)
+{
+	return (idx + 1) & (pipe->buffers - 1);
+}
+
+static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
+			 struct iov_iter *i)
+{
+	struct pipe_inode_info *pipe = i->pipe;
+	struct pipe_buffer *buf;
+	size_t off;
+	int idx;
+
+	if (unlikely(bytes > i->count))
+		bytes = i->count;
+
+	if (unlikely(!bytes))
+		return 0;
+
+	if (!sanity(i))
+		return 0;
+
+	off = i->iov_offset;
+	idx = i->idx;
+	buf = &pipe->bufs[idx];
+	if (off) {
+		if (offset == off && buf->page == page) {
+			/* merge with the last one */
+			buf->len += bytes;
+			i->iov_offset += bytes;
+			goto out;
+		}
+		idx = next_idx(idx, pipe);
+		buf = &pipe->bufs[idx];
+	}
+	if (idx == pipe->curbuf && pipe->nrbufs)
+		return 0;
+	pipe->nrbufs++;
+	buf->ops = &page_cache_pipe_buf_ops;
+	get_page(buf->page = page);
+	buf->offset = offset;
+	buf->len = bytes;
+	i->iov_offset = offset + bytes;
+	i->idx = idx;
+out:
+	i->count -= bytes;
+	return bytes;
+}
+
 /*
  * Fault in one or more iovecs of the given iov_iter, to a maximum length of
  * bytes.  For each iovec, fault in each page that constitutes the iovec.
@@ -306,8 +399,7 @@ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
 
 	if (!(i->type & (ITER_BVEC|ITER_KVEC))) {
 		iterate_iovec(i, bytes, v, iov, skip, ({
-			err = fault_in_multipages_readable(v.iov_base,
-					v.iov_len);
+			err = fault_in_pages_readable(v.iov_base, v.iov_len);
 			if (unlikely(err))
 			return err;
 		0;}))
@@ -356,9 +448,98 @@ static void memzero_page(struct page *page, size_t offset, size_t len)
 	kunmap_atomic(addr);
 }
 
+static inline bool allocated(struct pipe_buffer *buf)
+{
+	return buf->ops == &default_pipe_buf_ops;
+}
+
+static inline void data_start(const struct iov_iter *i, int *idxp, size_t *offp)
+{
+	size_t off = i->iov_offset;
+	int idx = i->idx;
+	if (off && (!allocated(&i->pipe->bufs[idx]) || off == PAGE_SIZE)) {
+		idx = next_idx(idx, i->pipe);
+		off = 0;
+	}
+	*idxp = idx;
+	*offp = off;
+}
+
+static size_t push_pipe(struct iov_iter *i, size_t size,
+			int *idxp, size_t *offp)
+{
+	struct pipe_inode_info *pipe = i->pipe;
+	size_t off;
+	int idx;
+	ssize_t left;
+
+	if (unlikely(size > i->count))
+		size = i->count;
+	if (unlikely(!size))
+		return 0;
+
+	left = size;
+	data_start(i, &idx, &off);
+	*idxp = idx;
+	*offp = off;
+	if (off) {
+		left -= PAGE_SIZE - off;
+		if (left <= 0) {
+			pipe->bufs[idx].len += size;
+			return size;
+		}
+		pipe->bufs[idx].len = PAGE_SIZE;
+		idx = next_idx(idx, pipe);
+	}
+	while (idx != pipe->curbuf || !pipe->nrbufs) {
+		struct page *page = alloc_page(GFP_USER);
+		if (!page)
+			break;
+		pipe->nrbufs++;
+		pipe->bufs[idx].ops = &default_pipe_buf_ops;
+		pipe->bufs[idx].page = page;
+		pipe->bufs[idx].offset = 0;
+		if (left <= PAGE_SIZE) {
+			pipe->bufs[idx].len = left;
+			return size;
+		}
+		pipe->bufs[idx].len = PAGE_SIZE;
+		left -= PAGE_SIZE;
+		idx = next_idx(idx, pipe);
+	}
+	return size - left;
+}
+
+static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
+				struct iov_iter *i)
+{
+	struct pipe_inode_info *pipe = i->pipe;
+	size_t n, off;
+	int idx;
+
+	if (!sanity(i))
+		return 0;
+
+	bytes = n = push_pipe(i, bytes, &idx, &off);
+	if (unlikely(!n))
+		return 0;
+	for ( ; n; idx = next_idx(idx, pipe), off = 0) {
+		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
+		memcpy_to_page(pipe->bufs[idx].page, off, addr, chunk);
+		i->idx = idx;
+		i->iov_offset = off + chunk;
+		n -= chunk;
+		addr += chunk;
+	}
+	i->count -= bytes;
+	return bytes;
+}
+
 size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
 {
 	const char *from = addr;
+	if (unlikely(i->type & ITER_PIPE))
+		return copy_pipe_to_iter(addr, bytes, i);
 	iterate_and_advance(i, bytes, v,
 		__copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len,
 			       v.iov_len),
@@ -374,6 +555,10 @@ EXPORT_SYMBOL(copy_to_iter);
 size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 {
 	char *to = addr;
+	if (unlikely(i->type & ITER_PIPE)) {
+		WARN_ON(1);
+		return 0;
+	}
 	iterate_and_advance(i, bytes, v,
 		__copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base,
 				 v.iov_len),
@@ -386,9 +571,38 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 }
 EXPORT_SYMBOL(copy_from_iter);
 
+bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
+{
+	char *to = addr;
+	if (unlikely(i->type & ITER_PIPE)) {
+		WARN_ON(1);
+		return false;
+	}
+	if (unlikely(i->count < bytes))
+		return false;
+
+	iterate_all_kinds(i, bytes, v, ({
+		if (__copy_from_user((to += v.iov_len) - v.iov_len,
+				      v.iov_base, v.iov_len))
+			return false;
+		0;}),
+		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+				 v.bv_offset, v.bv_len),
+		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+	)
+
+	iov_iter_advance(i, bytes);
+	return true;
+}
+EXPORT_SYMBOL(copy_from_iter_full);
+
 size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 {
 	char *to = addr;
+	if (unlikely(i->type & ITER_PIPE)) {
+		WARN_ON(1);
+		return 0;
+	}
 	iterate_and_advance(i, bytes, v,
 		__copy_from_user_nocache((to += v.iov_len) - v.iov_len,
 					 v.iov_base, v.iov_len),
@@ -401,6 +615,30 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 }
 EXPORT_SYMBOL(copy_from_iter_nocache);
 
+bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
+{
+	char *to = addr;
+	if (unlikely(i->type & ITER_PIPE)) {
+		WARN_ON(1);
+		return false;
+	}
+	if (unlikely(i->count < bytes))
+		return false;
+	iterate_all_kinds(i, bytes, v, ({
+		if (__copy_from_user_nocache((to += v.iov_len) - v.iov_len,
+					     v.iov_base, v.iov_len))
+			return false;
+		0;}),
+		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
+				 v.bv_offset, v.bv_len),
+		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
+	)
+
+	iov_iter_advance(i, bytes);
+	return true;
+}
+EXPORT_SYMBOL(copy_from_iter_full_nocache);
+
 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
@@ -409,14 +647,20 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
 		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
 		kunmap_atomic(kaddr);
 		return wanted;
-	} else
+	} else if (likely(!(i->type & ITER_PIPE)))
 		return copy_page_to_iter_iovec(page, offset, bytes, i);
+	else
+		return copy_page_to_iter_pipe(page, offset, bytes, i);
 }
 EXPORT_SYMBOL(copy_page_to_iter);
 
 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 			 struct iov_iter *i)
 {
+	if (unlikely(i->type & ITER_PIPE)) {
+		WARN_ON(1);
+		return 0;
+	}
 	if (i->type & (ITER_BVEC|ITER_KVEC)) {
 		void *kaddr = kmap_atomic(page);
 		size_t wanted = copy_from_iter(kaddr + offset, bytes, i);
@@ -427,8 +671,34 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
 }
 EXPORT_SYMBOL(copy_page_from_iter);
 
+static size_t pipe_zero(size_t bytes, struct iov_iter *i)
+{
+	struct pipe_inode_info *pipe = i->pipe;
+	size_t n, off;
+	int idx;
+
+	if (!sanity(i))
+		return 0;
+
+	bytes = n = push_pipe(i, bytes, &idx, &off);
+	if (unlikely(!n))
+		return 0;
+
+	for ( ; n; idx = next_idx(idx, pipe), off = 0) {
+		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
+		memzero_page(pipe->bufs[idx].page, off, chunk);
+		i->idx = idx;
+		i->iov_offset = off + chunk;
+		n -= chunk;
+	}
+	i->count -= bytes;
+	return bytes;
+}
+
 size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
 {
+	if (unlikely(i->type & ITER_PIPE))
+		return pipe_zero(bytes, i);
 	iterate_and_advance(i, bytes, v,
 		__clear_user(v.iov_base, v.iov_len),
 		memzero_page(v.bv_page, v.bv_offset, v.bv_len),
@@ -443,6 +713,11 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
 		struct iov_iter *i, unsigned long offset, size_t bytes)
 {
 	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
+	if (unlikely(i->type & ITER_PIPE)) {
+		kunmap_atomic(kaddr);
+		WARN_ON(1);
+		return 0;
+	}
 	iterate_all_kinds(i, bytes, v,
 		__copy_from_user_inatomic((p += v.iov_len) - v.iov_len,
 					  v.iov_base, v.iov_len),
@@ -455,8 +730,58 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
 }
 EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
 
+static inline void pipe_truncate(struct iov_iter *i)
+{
+	struct pipe_inode_info *pipe = i->pipe;
+	if (pipe->nrbufs) {
+		size_t off = i->iov_offset;
+		int idx = i->idx;
+		int nrbufs = (idx - pipe->curbuf) & (pipe->buffers - 1);
+		if (off) {
+			pipe->bufs[idx].len = off - pipe->bufs[idx].offset;
+			idx = next_idx(idx, pipe);
+			nrbufs++;
+		}
+		while (pipe->nrbufs > nrbufs) {
+			pipe_buf_release(pipe, &pipe->bufs[idx]);
+			idx = next_idx(idx, pipe);
+			pipe->nrbufs--;
+		}
+	}
+}
+
+static void pipe_advance(struct iov_iter *i, size_t size)
+{
+	struct pipe_inode_info *pipe = i->pipe;
+	if (unlikely(i->count < size))
+		size = i->count;
+	if (size) {
+		struct pipe_buffer *buf;
+		size_t off = i->iov_offset, left = size;
+		int idx = i->idx;
+		if (off) /* make it relative to the beginning of buffer */
+			left += off - pipe->bufs[idx].offset;
+		while (1) {
+			buf = &pipe->bufs[idx];
+			if (left <= buf->len)
+				break;
+			left -= buf->len;
+			idx = next_idx(idx, pipe);
+		}
+		i->idx = idx;
+		i->iov_offset = buf->offset + left;
+	}
+	i->count -= size;
+	/* ... and discard everything past that point */
+	pipe_truncate(i);
+}
+
 void iov_iter_advance(struct iov_iter *i, size_t size)
 {
+	if (unlikely(i->type & ITER_PIPE)) {
+		pipe_advance(i, size);
+		return;
+	}
 	iterate_and_advance(i, size, v, 0, 0, 0)
 }
 EXPORT_SYMBOL(iov_iter_advance);
@@ -466,6 +791,8 @@ EXPORT_SYMBOL(iov_iter_advance);
  */
 size_t iov_iter_single_seg_count(const struct iov_iter *i)
 {
+	if (unlikely(i->type & ITER_PIPE))
+		return i->count;	// it is a silly place, anyway
 	if (i->nr_segs == 1)
 		return i->count;
 	else if (i->type & ITER_BVEC)
@@ -501,14 +828,30 @@ void iov_iter_bvec(struct iov_iter *i, int direction,
 }
 EXPORT_SYMBOL(iov_iter_bvec);
 
+void iov_iter_pipe(struct iov_iter *i, int direction,
+			struct pipe_inode_info *pipe,
+			size_t count)
+{
+	BUG_ON(direction != ITER_PIPE);
+	WARN_ON(pipe->nrbufs == pipe->buffers);
+	i->type = direction;
+	i->pipe = pipe;
+	i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+	i->iov_offset = 0;
+	i->count = count;
+}
+EXPORT_SYMBOL(iov_iter_pipe);
+
 unsigned long iov_iter_alignment(const struct iov_iter *i)
 {
 	unsigned long res = 0;
 	size_t size = i->count;
 
-	if (!size)
-		return 0;
-
+	if (unlikely(i->type & ITER_PIPE)) {
+		if (size && i->iov_offset && allocated(&i->pipe->bufs[i->idx]))
+			return size | i->iov_offset;
+		return size;
+	}
 	iterate_all_kinds(i, size, v,
 		(res |= (unsigned long)v.iov_base | v.iov_len, 0),
 		res |= v.bv_offset | v.bv_len,
@@ -520,10 +863,13 @@ EXPORT_SYMBOL(iov_iter_alignment);
 
 unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
 {
-        unsigned long res = 0;
+	unsigned long res = 0;
 	size_t size = i->count;
-	if (!size)
-		return 0;
+
+	if (unlikely(i->type & ITER_PIPE)) {
+		WARN_ON(1);
+		return ~0U;
+	}
 
 	iterate_all_kinds(i, size, v,
 		(res |= (!res ? 0 : (unsigned long)v.iov_base) |
@@ -533,20 +879,63 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
 		(res |= (!res ? 0 : (unsigned long)v.iov_base) |
 			(size != v.iov_len ? size : 0))
 		);
-		return res;
+	return res;
 }
 EXPORT_SYMBOL(iov_iter_gap_alignment);
 
-ssize_t iov_iter_get_pages(struct iov_iter *i,
+static inline size_t __pipe_get_pages(struct iov_iter *i,
+				size_t maxsize,
+				struct page **pages,
+				int idx,
+				size_t *start)
+{
+	struct pipe_inode_info *pipe = i->pipe;
+	ssize_t n = push_pipe(i, maxsize, &idx, start);
+	if (!n)
+		return -EFAULT;
+
+	maxsize = n;
+	n += *start;
+	while (n > 0) {
+		get_page(*pages++ = pipe->bufs[idx].page);
+		idx = next_idx(idx, pipe);
+		n -= PAGE_SIZE;
+	}
+
+	return maxsize;
+}
+
+static ssize_t pipe_get_pages(struct iov_iter *i,
 		   struct page **pages, size_t maxsize, unsigned maxpages,
 		   size_t *start)
 {
-	if (maxsize > i->count)
-		maxsize = i->count;
+	unsigned npages;
+	size_t capacity;
+	int idx;
 
 	if (!maxsize)
 		return 0;
 
+	if (!sanity(i))
+		return -EFAULT;
+
+	data_start(i, &idx, start);
+	/* some of this one + all after this one */
+	npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1;
+	capacity = min(npages,maxpages) * PAGE_SIZE - *start;
+
+	return __pipe_get_pages(i, min(maxsize, capacity), pages, idx, start);
+}
+
+ssize_t iov_iter_get_pages(struct iov_iter *i,
+		   struct page **pages, size_t maxsize, unsigned maxpages,
+		   size_t *start)
+{
+	if (maxsize > i->count)
+		maxsize = i->count;
+
+	if (unlikely(i->type & ITER_PIPE))
+		return pipe_get_pages(i, pages, maxsize, maxpages, start);
 	iterate_all_kinds(i, maxsize, v, ({
 		unsigned long addr = (unsigned long)v.iov_base;
 		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
@@ -582,6 +971,40 @@ static struct page **get_pages_array(size_t n)
 	return p;
 }
 
+static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
+		   struct page ***pages, size_t maxsize,
+		   size_t *start)
+{
+	struct page **p;
+	size_t n;
+	int idx;
+	int npages;
+
+	if (!maxsize)
+		return 0;
+
+	if (!sanity(i))
+		return -EFAULT;
+
+	data_start(i, &idx, start);
+	/* some of this one + all after this one */
+	npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1;
+	n = npages * PAGE_SIZE - *start;
+	if (maxsize > n)
+		maxsize = n;
+	else
+		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
+	p = get_pages_array(npages);
+	if (!p)
+		return -ENOMEM;
+	n = __pipe_get_pages(i, maxsize, p, idx, start);
+	if (n > 0)
+		*pages = p;
+	else
+		kvfree(p);
+	return n;
+}
+
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 		   struct page ***pages, size_t maxsize,
 		   size_t *start)
@@ -591,9 +1014,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
 	if (maxsize > i->count)
 		maxsize = i->count;
 
-	if (!maxsize)
-		return 0;
-
+	if (unlikely(i->type & ITER_PIPE))
+		return pipe_get_pages_alloc(i, pages, maxsize, start);
 	iterate_all_kinds(i, maxsize, v, ({
 		unsigned long addr = (unsigned long)v.iov_base;
 		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
@@ -635,9 +1057,13 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
 	__wsum sum, next;
 	size_t off = 0;
 	sum = *csum;
+	if (unlikely(i->type & ITER_PIPE)) {
+		WARN_ON(1);
+		return 0;
+	}
 	iterate_and_advance(i, bytes, v, ({
 		int err = 0;
-		next = csum_and_copy_from_user(v.iov_base, 
+		next = csum_and_copy_from_user(v.iov_base,
 					       (to += v.iov_len) - v.iov_len,
 					       v.iov_len, 0, &err);
 		if (!err) {
@@ -666,6 +1092,51 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
 }
 EXPORT_SYMBOL(csum_and_copy_from_iter);
 
+bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum,
+			       struct iov_iter *i)
+{
+	char *to = addr;
+	__wsum sum, next;
+	size_t off = 0;
+	sum = *csum;
+	if (unlikely(i->type & ITER_PIPE)) {
+		WARN_ON(1);
+		return false;
+	}
+	if (unlikely(i->count < bytes))
+		return false;
+	iterate_all_kinds(i, bytes, v, ({
+		int err = 0;
+		next = csum_and_copy_from_user(v.iov_base,
+					       (to += v.iov_len) - v.iov_len,
+					       v.iov_len, 0, &err);
+		if (err)
+			return false;
+		sum = csum_block_add(sum, next, off);
+		off += v.iov_len;
+		0;
+	}), ({
+		char *p = kmap_atomic(v.bv_page);
+		next = csum_partial_copy_nocheck(p + v.bv_offset,
+						 (to += v.bv_len) - v.bv_len,
+						 v.bv_len, 0);
+		kunmap_atomic(p);
+		sum = csum_block_add(sum, next, off);
+		off += v.bv_len;
+	}),({
+		next = csum_partial_copy_nocheck(v.iov_base,
+						 (to += v.iov_len) - v.iov_len,
+						 v.iov_len, 0);
+		sum = csum_block_add(sum, next, off);
+		off += v.iov_len;
+	})
+	)
+	*csum = sum;
+	iov_iter_advance(i, bytes);
+	return true;
+}
+EXPORT_SYMBOL(csum_and_copy_from_iter_full);
+
 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum,
 			     struct iov_iter *i)
 {
@@ -673,10 +1144,14 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum,
 	__wsum sum, next;
 	size_t off = 0;
 	sum = *csum;
+	if (unlikely(i->type & ITER_PIPE)) {
+		WARN_ON(1);	/* for now */
+		return 0;
+	}
 	iterate_and_advance(i, bytes, v, ({
 		int err = 0;
 		next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
-					     v.iov_base, 
+					     v.iov_base,
 					     v.iov_len, 0, &err);
 		if (!err) {
 			sum = csum_block_add(sum, next, off);
@@ -712,7 +1187,20 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
 	if (!size)
 		return 0;
 
-	iterate_all_kinds(i, size, v, ({
+	if (unlikely(i->type & ITER_PIPE)) {
+		struct pipe_inode_info *pipe = i->pipe;
+		size_t off;
+		int idx;
+
+		if (!sanity(i))
+			return 0;
+
+		data_start(i, &idx, &off);
+		/* some of this one + all after this one */
+		npages = ((pipe->curbuf - idx - 1) & (pipe->buffers - 1)) + 1;
+		if (npages >= maxpages)
+			return maxpages;
+	} else iterate_all_kinds(i, size, v, ({
 		unsigned long p = (unsigned long)v.iov_base;
 		npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
 			- p / PAGE_SIZE;
@@ -737,6 +1225,10 @@ EXPORT_SYMBOL(iov_iter_npages);
 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
 {
 	*new = *old;
+	if (unlikely(new->type & ITER_PIPE)) {
+		WARN_ON(1);
+		return NULL;
+	}
 	if (new->type & ITER_BVEC)
 		return new->bvec = kmemdup(new->bvec,
 				    new->nr_segs * sizeof(struct bio_vec),
@@ -749,6 +1241,28 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
 }
 EXPORT_SYMBOL(dup_iter);
 
+/**
+ * import_iovec() - Copy an array of &struct iovec from userspace
+ *     into the kernel, check that it is valid, and initialize a new
+ *     &struct iov_iter iterator to access it.
+ *
+ * @type: One of %READ or %WRITE.
+ * @uvector: Pointer to the userspace array.
+ * @nr_segs: Number of elements in userspace array.
+ * @fast_segs: Number of elements in @iov.
+ * @iov: (input and output parameter) Pointer to pointer to (usually small
+ *     on-stack) kernel array.
+ * @i: Pointer to iterator that will be initialized on success.
+ *
+ * If the array pointed to by *@iov is large enough to hold all @nr_segs,
+ * then this function places %NULL in *@iov on return. Otherwise, a new
+ * array will be allocated and the result placed in *@iov. This means that
+ * the caller may call kfree() on *@iov regardless of whether the small
+ * on-stack array was used or not (and regardless of whether this function
+ * returns an error or not).
+ *
+ * Return: 0 on success or negative error code on error.
+ */
 int import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i)
diff --git a/lib/irq_poll.c b/lib/irq_poll.c
index 2be55692aa43..1d6565e81030 100644
--- a/lib/irq_poll.c
+++ b/lib/irq_poll.c
@@ -74,7 +74,7 @@ void irq_poll_complete(struct irq_poll *iop)
 }
 EXPORT_SYMBOL(irq_poll_complete);
 
-static void irq_poll_softirq(struct softirq_action *h)
+static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
 {
 	struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
 	int rearm = 0, budget = irq_poll_budget;
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index f6c2c1e7779c..9a2b811966eb 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -56,7 +56,7 @@ static const char *kobject_actions[] = {
  * kobject_action_type - translate action string to numeric type
  *
  * @buf: buffer containing the action string, newline is ignored
- * @len: length of buffer
+ * @count: length of buffer
  * @type: pointer to the location to store the action type
  *
  * Returns 0 if the action string was recognized.
@@ -154,8 +154,8 @@ static void cleanup_uevent_env(struct subprocess_info *info)
 /**
  * kobject_uevent_env - send an uevent with environmental data
  *
- * @action: action that is happening
  * @kobj: struct kobject that the action is happening to
+ * @action: action that is happening
  * @envp_ext: pointer to environmental data
  *
  * Returns 0 if kobject_uevent_env() is completed with success or the
@@ -363,8 +363,8 @@ EXPORT_SYMBOL_GPL(kobject_uevent_env);
 /**
  * kobject_uevent - notify userspace by sending an uevent
  *
- * @action: action that is happening
  * @kobj: struct kobject that the action is happening to
+ * @action: action that is happening
  *
  * Returns 0 if kobject_uevent() is completed with success or the
  * corresponding error when it fails.
diff --git a/lib/kstrtox.c b/lib/kstrtox.c
index d8a5cf66c316..bf85e05ce858 100644
--- a/lib/kstrtox.c
+++ b/lib/kstrtox.c
@@ -17,7 +17,7 @@
 #include <linux/math64.h>
 #include <linux/export.h>
 #include <linux/types.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include "kstrtox.h"
 
 const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
@@ -48,11 +48,9 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long
 {
 	unsigned long long res;
 	unsigned int rv;
-	int overflow;
 
 	res = 0;
 	rv = 0;
-	overflow = 0;
 	while (*s) {
 		unsigned int val;
 
@@ -71,15 +69,13 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long
 		 */
 		if (unlikely(res & (~0ull << 60))) {
 			if (res > div_u64(ULLONG_MAX - val, base))
-				overflow = 1;
+				rv |= KSTRTOX_OVERFLOW;
 		}
 		res = res * base + val;
 		rv++;
 		s++;
 	}
 	*p = res;
-	if (overflow)
-		rv |= KSTRTOX_OVERFLOW;
 	return rv;
 }
 
diff --git a/lib/list_debug.c b/lib/list_debug.c
index 3859bf63561c..7f7bfa55eb6d 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -2,8 +2,7 @@
  * Copyright 2006, Red Hat, Inc., Dave Jones
  * Released under the General Public License (GPL).
  *
- * This file contains the linked list implementations for
- * DEBUG_LIST.
+ * This file contains the linked list validation for DEBUG_LIST.
  */
 
 #include <linux/export.h>
@@ -13,88 +12,48 @@
 #include <linux/rculist.h>
 
 /*
- * Insert a new entry between two known consecutive entries.
- *
- * This is only for internal list manipulation where we know
- * the prev/next entries already!
+ * Check that the data structures for the list manipulations are reasonably
+ * valid. Failures here indicate memory corruption (and possibly an exploit
+ * attempt).
  */
 
-void __list_add(struct list_head *new,
-			      struct list_head *prev,
-			      struct list_head *next)
+bool __list_add_valid(struct list_head *new, struct list_head *prev,
+		      struct list_head *next)
 {
-	WARN(next->prev != prev,
-		"list_add corruption. next->prev should be "
-		"prev (%p), but was %p. (next=%p).\n",
+	CHECK_DATA_CORRUPTION(next->prev != prev,
+		"list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n",
 		prev, next->prev, next);
-	WARN(prev->next != next,
-		"list_add corruption. prev->next should be "
-		"next (%p), but was %p. (prev=%p).\n",
+	CHECK_DATA_CORRUPTION(prev->next != next,
+		"list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n",
 		next, prev->next, prev);
-	WARN(new == prev || new == next,
-	     "list_add double add: new=%p, prev=%p, next=%p.\n",
-	     new, prev, next);
-	next->prev = new;
-	new->next = next;
-	new->prev = prev;
-	WRITE_ONCE(prev->next, new);
+	CHECK_DATA_CORRUPTION(new == prev || new == next,
+		"list_add double add: new=%p, prev=%p, next=%p.\n",
+		new, prev, next);
+
+	return true;
 }
-EXPORT_SYMBOL(__list_add);
+EXPORT_SYMBOL(__list_add_valid);
 
-void __list_del_entry(struct list_head *entry)
+bool __list_del_entry_valid(struct list_head *entry)
 {
 	struct list_head *prev, *next;
 
 	prev = entry->prev;
 	next = entry->next;
 
-	if (WARN(next == LIST_POISON1,
+	CHECK_DATA_CORRUPTION(next == LIST_POISON1,
 		"list_del corruption, %p->next is LIST_POISON1 (%p)\n",
-		entry, LIST_POISON1) ||
-	    WARN(prev == LIST_POISON2,
+		entry, LIST_POISON1);
+	CHECK_DATA_CORRUPTION(prev == LIST_POISON2,
 		"list_del corruption, %p->prev is LIST_POISON2 (%p)\n",
-		entry, LIST_POISON2) ||
-	    WARN(prev->next != entry,
-		"list_del corruption. prev->next should be %p, "
-		"but was %p\n", entry, prev->next) ||
-	    WARN(next->prev != entry,
-		"list_del corruption. next->prev should be %p, "
-		"but was %p\n", entry, next->prev))
-		return;
-
-	__list_del(prev, next);
-}
-EXPORT_SYMBOL(__list_del_entry);
+		entry, LIST_POISON2);
+	CHECK_DATA_CORRUPTION(prev->next != entry,
+		"list_del corruption. prev->next should be %p, but was %p\n",
+		entry, prev->next);
+	CHECK_DATA_CORRUPTION(next->prev != entry,
+		"list_del corruption. next->prev should be %p, but was %p\n",
+		entry, next->prev);
+	return true;
 
-/**
- * list_del - deletes entry from list.
- * @entry: the element to delete from the list.
- * Note: list_empty on entry does not return true after this, the entry is
- * in an undefined state.
- */
-void list_del(struct list_head *entry)
-{
-	__list_del_entry(entry);
-	entry->next = LIST_POISON1;
-	entry->prev = LIST_POISON2;
-}
-EXPORT_SYMBOL(list_del);
-
-/*
- * RCU variants.
- */
-void __list_add_rcu(struct list_head *new,
-		    struct list_head *prev, struct list_head *next)
-{
-	WARN(next->prev != prev,
-		"list_add_rcu corruption. next->prev should be prev (%p), but was %p. (next=%p).\n",
-		prev, next->prev, next);
-	WARN(prev->next != next,
-		"list_add_rcu corruption. prev->next should be next (%p), but was %p. (prev=%p).\n",
-		next, prev->next, prev);
-	new->next = next;
-	new->prev = prev;
-	rcu_assign_pointer(list_next_rcu(prev), new);
-	next->prev = new;
 }
-EXPORT_SYMBOL(__list_add_rcu);
+EXPORT_SYMBOL(__list_del_entry_valid);
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 872a15a2a637..f3a217ea0388 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -980,23 +980,23 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
 #ifndef CONFIG_PROVE_LOCKING
 	if (expected == FAILURE && debug_locks) {
 		expected_testcase_failures++;
-		printk("failed|");
+		pr_cont("failed|");
 	}
 	else
 #endif
 	if (debug_locks != expected) {
 		unexpected_testcase_failures++;
-		printk("FAILED|");
+		pr_cont("FAILED|");
 
 		dump_stack();
 	} else {
 		testcase_successes++;
-		printk("  ok  |");
+		pr_cont("  ok  |");
 	}
 	testcase_total++;
 
 	if (debug_locks_verbose)
-		printk(" lockclass mask: %x, debug_locks: %d, expected: %d\n",
+		pr_cont(" lockclass mask: %x, debug_locks: %d, expected: %d\n",
 			lockclass_mask, debug_locks, expected);
 	/*
 	 * Some tests (e.g. double-unlock) might corrupt the preemption
@@ -1021,26 +1021,26 @@ static inline void print_testname(const char *testname)
 #define DO_TESTCASE_1(desc, name, nr)				\
 	print_testname(desc"/"#nr);				\
 	dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK);		\
-	printk("\n");
+	pr_cont("\n");
 
 #define DO_TESTCASE_1B(desc, name, nr)				\
 	print_testname(desc"/"#nr);				\
 	dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK);		\
-	printk("\n");
+	pr_cont("\n");
 
 #define DO_TESTCASE_3(desc, name, nr)				\
 	print_testname(desc"/"#nr);				\
 	dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN);	\
 	dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK);	\
 	dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK);	\
-	printk("\n");
+	pr_cont("\n");
 
 #define DO_TESTCASE_3RW(desc, name, nr)				\
 	print_testname(desc"/"#nr);				\
 	dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\
 	dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK);	\
 	dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK);	\
-	printk("\n");
+	pr_cont("\n");
 
 #define DO_TESTCASE_6(desc, name)				\
 	print_testname(desc);					\
@@ -1050,7 +1050,7 @@ static inline void print_testname(const char *testname)
 	dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX);		\
 	dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM);		\
 	dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM);		\
-	printk("\n");
+	pr_cont("\n");
 
 #define DO_TESTCASE_6_SUCCESS(desc, name)			\
 	print_testname(desc);					\
@@ -1060,7 +1060,7 @@ static inline void print_testname(const char *testname)
 	dotest(name##_mutex, SUCCESS, LOCKTYPE_MUTEX);		\
 	dotest(name##_wsem, SUCCESS, LOCKTYPE_RWSEM);		\
 	dotest(name##_rsem, SUCCESS, LOCKTYPE_RWSEM);		\
-	printk("\n");
+	pr_cont("\n");
 
 /*
  * 'read' variant: rlocks must not trigger.
@@ -1073,7 +1073,7 @@ static inline void print_testname(const char *testname)
 	dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX);		\
 	dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM);		\
 	dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM);		\
-	printk("\n");
+	pr_cont("\n");
 
 #define DO_TESTCASE_2I(desc, name, nr)				\
 	DO_TESTCASE_1("hard-"desc, name##_hard, nr);		\
@@ -1726,25 +1726,25 @@ static void ww_tests(void)
 	dotest(ww_test_fail_acquire, SUCCESS, LOCKTYPE_WW);
 	dotest(ww_test_normal, SUCCESS, LOCKTYPE_WW);
 	dotest(ww_test_unneeded_slow, FAILURE, LOCKTYPE_WW);
-	printk("\n");
+	pr_cont("\n");
 
 	print_testname("ww contexts mixing");
 	dotest(ww_test_two_contexts, FAILURE, LOCKTYPE_WW);
 	dotest(ww_test_diff_class, FAILURE, LOCKTYPE_WW);
-	printk("\n");
+	pr_cont("\n");
 
 	print_testname("finishing ww context");
 	dotest(ww_test_context_done_twice, FAILURE, LOCKTYPE_WW);
 	dotest(ww_test_context_unlock_twice, FAILURE, LOCKTYPE_WW);
 	dotest(ww_test_context_fini_early, FAILURE, LOCKTYPE_WW);
 	dotest(ww_test_context_lock_after_done, FAILURE, LOCKTYPE_WW);
-	printk("\n");
+	pr_cont("\n");
 
 	print_testname("locking mismatches");
 	dotest(ww_test_object_unlock_twice, FAILURE, LOCKTYPE_WW);
 	dotest(ww_test_object_lock_unbalanced, FAILURE, LOCKTYPE_WW);
 	dotest(ww_test_object_lock_stale_context, FAILURE, LOCKTYPE_WW);
-	printk("\n");
+	pr_cont("\n");
 
 	print_testname("EDEADLK handling");
 	dotest(ww_test_edeadlk_normal, SUCCESS, LOCKTYPE_WW);
@@ -1757,11 +1757,11 @@ static void ww_tests(void)
 	dotest(ww_test_edeadlk_acquire_more_edeadlk_slow, FAILURE, LOCKTYPE_WW);
 	dotest(ww_test_edeadlk_acquire_wrong, FAILURE, LOCKTYPE_WW);
 	dotest(ww_test_edeadlk_acquire_wrong_slow, FAILURE, LOCKTYPE_WW);
-	printk("\n");
+	pr_cont("\n");
 
 	print_testname("spinlock nest unlocked");
 	dotest(ww_test_spin_nest_unlocked, FAILURE, LOCKTYPE_WW);
-	printk("\n");
+	pr_cont("\n");
 
 	printk("  -----------------------------------------------------\n");
 	printk("                                 |block | try  |context|\n");
@@ -1771,25 +1771,25 @@ static void ww_tests(void)
 	dotest(ww_test_context_block, FAILURE, LOCKTYPE_WW);
 	dotest(ww_test_context_try, SUCCESS, LOCKTYPE_WW);
 	dotest(ww_test_context_context, SUCCESS, LOCKTYPE_WW);
-	printk("\n");
+	pr_cont("\n");
 
 	print_testname("try");
 	dotest(ww_test_try_block, FAILURE, LOCKTYPE_WW);
 	dotest(ww_test_try_try, SUCCESS, LOCKTYPE_WW);
 	dotest(ww_test_try_context, FAILURE, LOCKTYPE_WW);
-	printk("\n");
+	pr_cont("\n");
 
 	print_testname("block");
 	dotest(ww_test_block_block, FAILURE, LOCKTYPE_WW);
 	dotest(ww_test_block_try, SUCCESS, LOCKTYPE_WW);
 	dotest(ww_test_block_context, FAILURE, LOCKTYPE_WW);
-	printk("\n");
+	pr_cont("\n");
 
 	print_testname("spinlock");
 	dotest(ww_test_spin_block, FAILURE, LOCKTYPE_WW);
 	dotest(ww_test_spin_try, SUCCESS, LOCKTYPE_WW);
 	dotest(ww_test_spin_context, FAILURE, LOCKTYPE_WW);
-	printk("\n");
+	pr_cont("\n");
 }
 
 void locking_selftest(void)
@@ -1829,32 +1829,32 @@ void locking_selftest(void)
 
 	printk("  --------------------------------------------------------------------------\n");
 	print_testname("recursive read-lock");
-	printk("             |");
+	pr_cont("             |");
 	dotest(rlock_AA1, SUCCESS, LOCKTYPE_RWLOCK);
-	printk("             |");
+	pr_cont("             |");
 	dotest(rsem_AA1, FAILURE, LOCKTYPE_RWSEM);
-	printk("\n");
+	pr_cont("\n");
 
 	print_testname("recursive read-lock #2");
-	printk("             |");
+	pr_cont("             |");
 	dotest(rlock_AA1B, SUCCESS, LOCKTYPE_RWLOCK);
-	printk("             |");
+	pr_cont("             |");
 	dotest(rsem_AA1B, FAILURE, LOCKTYPE_RWSEM);
-	printk("\n");
+	pr_cont("\n");
 
 	print_testname("mixed read-write-lock");
-	printk("             |");
+	pr_cont("             |");
 	dotest(rlock_AA2, FAILURE, LOCKTYPE_RWLOCK);
-	printk("             |");
+	pr_cont("             |");
 	dotest(rsem_AA2, FAILURE, LOCKTYPE_RWSEM);
-	printk("\n");
+	pr_cont("\n");
 
 	print_testname("mixed write-read-lock");
-	printk("             |");
+	pr_cont("             |");
 	dotest(rlock_AA3, FAILURE, LOCKTYPE_RWLOCK);
-	printk("             |");
+	pr_cont("             |");
 	dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM);
-	printk("\n");
+	pr_cont("\n");
 
 	printk("  --------------------------------------------------------------------------\n");
 
diff --git a/lib/lockref.c b/lib/lockref.c
index 5a92189ad711..c4bfcb8836cd 100644
--- a/lib/lockref.c
+++ b/lib/lockref.c
@@ -20,7 +20,7 @@
 		if (likely(old.lock_count == prev.lock_count)) {		\
 			SUCCESS;						\
 		}								\
-		cpu_relax_lowlatency();						\
+		cpu_relax();							\
 	}									\
 } while (0)
 
diff --git a/lib/mpi/mpi-pow.c b/lib/mpi/mpi-pow.c
index 5464c8744ea9..e24388a863a7 100644
--- a/lib/mpi/mpi-pow.c
+++ b/lib/mpi/mpi-pow.c
@@ -64,8 +64,13 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod)
 	if (!esize) {
 		/* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0
 		 * depending on if MOD equals 1.  */
-		rp[0] = 1;
 		res->nlimbs = (msize == 1 && mod->d[0] == 1) ? 0 : 1;
+		if (res->nlimbs) {
+			if (mpi_resize(res, 1) < 0)
+				goto enomem;
+			rp = res->d;
+			rp[0] = 1;
+		}
 		res->sign = 0;
 		goto leave;
 	}
diff --git a/lib/nlattr.c b/lib/nlattr.c
index fce1e9afc6d9..b42b8577fc23 100644
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -14,7 +14,7 @@
 #include <linux/types.h>
 #include <net/netlink.h>
 
-static const u16 nla_attr_minlen[NLA_TYPE_MAX+1] = {
+static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = {
 	[NLA_U8]	= sizeof(u8),
 	[NLA_U16]	= sizeof(u16),
 	[NLA_U32]	= sizeof(u32),
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 26caf51cc238..5f7999eacad5 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -16,21 +16,23 @@
 #include <linux/delay.h>
 #include <linux/kprobes.h>
 #include <linux/nmi.h>
+#include <linux/cpu.h>
 
-#ifdef arch_trigger_all_cpu_backtrace
+#ifdef arch_trigger_cpumask_backtrace
 /* For reliability, we're prepared to waste bits here. */
 static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
 
-/* "in progress" flag of arch_trigger_all_cpu_backtrace */
+/* "in progress" flag of arch_trigger_cpumask_backtrace */
 static unsigned long backtrace_flag;
 
 /*
- * When raise() is called it will be is passed a pointer to the
+ * When raise() is called it will be passed a pointer to the
  * backtrace_mask. Architectures that call nmi_cpu_backtrace()
  * directly from their raise() functions may rely on the mask
  * they are passed being updated as a side effect of this call.
  */
-void nmi_trigger_all_cpu_backtrace(bool include_self,
+void nmi_trigger_cpumask_backtrace(const cpumask_t *mask,
+				   bool exclude_self,
 				   void (*raise)(cpumask_t *mask))
 {
 	int i, this_cpu = get_cpu();
@@ -44,13 +46,22 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
 		return;
 	}
 
-	cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
-	if (!include_self)
+	cpumask_copy(to_cpumask(backtrace_mask), mask);
+	if (exclude_self)
 		cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
 
+	/*
+	 * Don't try to send an NMI to this cpu; it may work on some
+	 * architectures, but on others it may not, and we'll get
+	 * information at least as useful just by doing a dump_stack() here.
+	 * Note that nmi_cpu_backtrace(NULL) will clear the cpu bit.
+	 */
+	if (cpumask_test_cpu(this_cpu, to_cpumask(backtrace_mask)))
+		nmi_cpu_backtrace(NULL);
+
 	if (!cpumask_empty(to_cpumask(backtrace_mask))) {
-		pr_info("Sending NMI to %s CPUs:\n",
-			(include_self ? "all" : "other"));
+		pr_info("Sending NMI from CPU %d to CPUs %*pbl:\n",
+			this_cpu, nr_cpumask_bits, to_cpumask(backtrace_mask));
 		raise(to_cpumask(backtrace_mask));
 	}
 
@@ -66,7 +77,7 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
 	 * Force flush any remote buffers that might be stuck in IRQ context
 	 * and therefore could not run their irq_work.
 	 */
-	printk_nmi_flush();
+	printk_safe_flush();
 
 	clear_bit_unlock(0, &backtrace_flag);
 	put_cpu();
@@ -77,11 +88,16 @@ bool nmi_cpu_backtrace(struct pt_regs *regs)
 	int cpu = smp_processor_id();
 
 	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-		pr_warn("NMI backtrace for cpu %d\n", cpu);
-		if (regs)
-			show_regs(regs);
-		else
-			dump_stack();
+		if (regs && cpu_in_idle(instruction_pointer(regs))) {
+			pr_warn("NMI backtrace for cpu %d skipped: idling at pc %#lx\n",
+				cpu, instruction_pointer(regs));
+		} else {
+			pr_warn("NMI backtrace for cpu %d\n", cpu);
+			if (regs)
+				show_regs(regs);
+			else
+				dump_stack();
+		}
 		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
 		return true;
 	}
diff --git a/lib/parman.c b/lib/parman.c
new file mode 100644
index 000000000000..c6e42a8db824
--- /dev/null
+++ b/lib/parman.c
@@ -0,0 +1,376 @@
+/*
+ * lib/parman.c - Manager for linear priority array areas
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Jiri Pirko <jiri@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/list.h>
+#include <linux/err.h>
+#include <linux/parman.h>
+
+struct parman_algo {
+	int (*item_add)(struct parman *parman, struct parman_prio *prio,
+			struct parman_item *item);
+	void (*item_remove)(struct parman *parman, struct parman_prio *prio,
+			    struct parman_item *item);
+};
+
+struct parman {
+	const struct parman_ops *ops;
+	void *priv;
+	const struct parman_algo *algo;
+	unsigned long count;
+	unsigned long limit_count;
+	struct list_head prio_list;
+};
+
+static int parman_enlarge(struct parman *parman)
+{
+	unsigned long new_count = parman->limit_count +
+				  parman->ops->resize_step;
+	int err;
+
+	err = parman->ops->resize(parman->priv, new_count);
+	if (err)
+		return err;
+	parman->limit_count = new_count;
+	return 0;
+}
+
+static int parman_shrink(struct parman *parman)
+{
+	unsigned long new_count = parman->limit_count -
+				  parman->ops->resize_step;
+	int err;
+
+	if (new_count < parman->ops->base_count)
+		return 0;
+	err = parman->ops->resize(parman->priv, new_count);
+	if (err)
+		return err;
+	parman->limit_count = new_count;
+	return 0;
+}
+
+static bool parman_prio_used(struct parman_prio *prio)
+
+{
+	return !list_empty(&prio->item_list);
+}
+
+static struct parman_item *parman_prio_first_item(struct parman_prio *prio)
+{
+	return list_first_entry(&prio->item_list,
+				typeof(struct parman_item), list);
+}
+
+static unsigned long parman_prio_first_index(struct parman_prio *prio)
+{
+	return parman_prio_first_item(prio)->index;
+}
+
+static struct parman_item *parman_prio_last_item(struct parman_prio *prio)
+{
+	return list_last_entry(&prio->item_list,
+			       typeof(struct parman_item), list);
+}
+
+static unsigned long parman_prio_last_index(struct parman_prio *prio)
+{
+	return parman_prio_last_item(prio)->index;
+}
+
+static unsigned long parman_lsort_new_index_find(struct parman *parman,
+						 struct parman_prio *prio)
+{
+	list_for_each_entry_from_reverse(prio, &parman->prio_list, list) {
+		if (!parman_prio_used(prio))
+			continue;
+		return parman_prio_last_index(prio) + 1;
+	}
+	return 0;
+}
+
+static void __parman_prio_move(struct parman *parman, struct parman_prio *prio,
+			       struct parman_item *item, unsigned long to_index,
+			       unsigned long count)
+{
+	parman->ops->move(parman->priv, item->index, to_index, count);
+}
+
+static void parman_prio_shift_down(struct parman *parman,
+				   struct parman_prio *prio)
+{
+	struct parman_item *item;
+	unsigned long to_index;
+
+	if (!parman_prio_used(prio))
+		return;
+	item = parman_prio_first_item(prio);
+	to_index = parman_prio_last_index(prio) + 1;
+	__parman_prio_move(parman, prio, item, to_index, 1);
+	list_move_tail(&item->list, &prio->item_list);
+	item->index = to_index;
+}
+
+static void parman_prio_shift_up(struct parman *parman,
+				 struct parman_prio *prio)
+{
+	struct parman_item *item;
+	unsigned long to_index;
+
+	if (!parman_prio_used(prio))
+		return;
+	item = parman_prio_last_item(prio);
+	to_index = parman_prio_first_index(prio) - 1;
+	__parman_prio_move(parman, prio, item, to_index, 1);
+	list_move(&item->list, &prio->item_list);
+	item->index = to_index;
+}
+
+static void parman_prio_item_remove(struct parman *parman,
+				    struct parman_prio *prio,
+				    struct parman_item *item)
+{
+	struct parman_item *last_item;
+	unsigned long to_index;
+
+	last_item = parman_prio_last_item(prio);
+	if (last_item == item) {
+		list_del(&item->list);
+		return;
+	}
+	to_index = item->index;
+	__parman_prio_move(parman, prio, last_item, to_index, 1);
+	list_del(&last_item->list);
+	list_replace(&item->list, &last_item->list);
+	last_item->index = to_index;
+}
+
+static int parman_lsort_item_add(struct parman *parman,
+				 struct parman_prio *prio,
+				 struct parman_item *item)
+{
+	struct parman_prio *prio2;
+	unsigned long new_index;
+	int err;
+
+	if (parman->count + 1 > parman->limit_count) {
+		err = parman_enlarge(parman);
+		if (err)
+			return err;
+	}
+
+	new_index = parman_lsort_new_index_find(parman, prio);
+	list_for_each_entry_reverse(prio2, &parman->prio_list, list) {
+		if (prio2 == prio)
+			break;
+		parman_prio_shift_down(parman, prio2);
+	}
+	item->index = new_index;
+	list_add_tail(&item->list, &prio->item_list);
+	parman->count++;
+	return 0;
+}
+
+static void parman_lsort_item_remove(struct parman *parman,
+				     struct parman_prio *prio,
+				     struct parman_item *item)
+{
+	parman_prio_item_remove(parman, prio, item);
+	list_for_each_entry_continue(prio, &parman->prio_list, list)
+		parman_prio_shift_up(parman, prio);
+	parman->count--;
+	if (parman->limit_count - parman->count >= parman->ops->resize_step)
+		parman_shrink(parman);
+}
+
+static const struct parman_algo parman_lsort = {
+	.item_add	= parman_lsort_item_add,
+	.item_remove	= parman_lsort_item_remove,
+};
+
+static const struct parman_algo *parman_algos[] = {
+	&parman_lsort,
+};
+
+/**
+ * parman_create - creates a new parman instance
+ * @ops:	caller-specific callbacks
+ * @priv:	pointer to a private data passed to the ops
+ *
+ * Note: all locking must be provided by the caller.
+ *
+ * Each parman instance manages an array area with chunks of entries
+ * with the same priority. Consider following example:
+ *
+ * item 1 with prio 10
+ * item 2 with prio 10
+ * item 3 with prio 10
+ * item 4 with prio 20
+ * item 5 with prio 20
+ * item 6 with prio 30
+ * item 7 with prio 30
+ * item 8 with prio 30
+ *
+ * In this example, there are 3 priority chunks. The order of the priorities
+ * matters, however the order of items within a single priority chunk does not
+ * matter. So the same array could be ordered as follows:
+ *
+ * item 2 with prio 10
+ * item 3 with prio 10
+ * item 1 with prio 10
+ * item 5 with prio 20
+ * item 4 with prio 20
+ * item 7 with prio 30
+ * item 8 with prio 30
+ * item 6 with prio 30
+ *
+ * The goal of parman is to maintain the priority ordering. The caller
+ * provides @ops with callbacks parman uses to move the items
+ * and resize the array area.
+ *
+ * Returns a pointer to newly created parman instance in case of success,
+ * otherwise it returns NULL.
+ */
+struct parman *parman_create(const struct parman_ops *ops, void *priv)
+{
+	struct parman *parman;
+
+	parman = kzalloc(sizeof(*parman), GFP_KERNEL);
+	if (!parman)
+		return NULL;
+	INIT_LIST_HEAD(&parman->prio_list);
+	parman->ops = ops;
+	parman->priv = priv;
+	parman->limit_count = ops->base_count;
+	parman->algo = parman_algos[ops->algo];
+	return parman;
+}
+EXPORT_SYMBOL(parman_create);
+
+/**
+ * parman_destroy - destroys existing parman instance
+ * @parman:	parman instance
+ *
+ * Note: all locking must be provided by the caller.
+ */
+void parman_destroy(struct parman *parman)
+{
+	WARN_ON(!list_empty(&parman->prio_list));
+	kfree(parman);
+}
+EXPORT_SYMBOL(parman_destroy);
+
+/**
+ * parman_prio_init - initializes a parman priority chunk
+ * @parman:	parman instance
+ * @prio:	parman prio structure to be initialized
+ * @prority:	desired priority of the chunk
+ *
+ * Note: all locking must be provided by the caller.
+ *
+ * Before caller could add an item with certain priority, he has to
+ * initialize a priority chunk for it using this function.
+ */
+void parman_prio_init(struct parman *parman, struct parman_prio *prio,
+		      unsigned long priority)
+{
+	struct parman_prio *prio2;
+	struct list_head *pos;
+
+	INIT_LIST_HEAD(&prio->item_list);
+	prio->priority = priority;
+
+	/* Position inside the list according to priority */
+	list_for_each(pos, &parman->prio_list) {
+		prio2 = list_entry(pos, typeof(*prio2), list);
+		if (prio2->priority > prio->priority)
+			break;
+	}
+	list_add_tail(&prio->list, pos);
+}
+EXPORT_SYMBOL(parman_prio_init);
+
+/**
+ * parman_prio_fini - finalizes use of parman priority chunk
+ * @prio:	parman prio structure
+ *
+ * Note: all locking must be provided by the caller.
+ */
+void parman_prio_fini(struct parman_prio *prio)
+{
+	WARN_ON(parman_prio_used(prio));
+	list_del(&prio->list);
+}
+EXPORT_SYMBOL(parman_prio_fini);
+
+/**
+ * parman_item_add - adds a parman item under defined priority
+ * @parman:	parman instance
+ * @prio:	parman prio instance to add the item to
+ * @item:	parman item instance
+ *
+ * Note: all locking must be provided by the caller.
+ *
+ * Adds item to a array managed by parman instance under the specified priority.
+ *
+ * Returns 0 in case of success, negative number to indicate an error.
+ */
+int parman_item_add(struct parman *parman, struct parman_prio *prio,
+		    struct parman_item *item)
+{
+	return parman->algo->item_add(parman, prio, item);
+}
+EXPORT_SYMBOL(parman_item_add);
+
+/**
+ * parman_item_del - deletes parman item
+ * @parman:	parman instance
+ * @prio:	parman prio instance to delete the item from
+ * @item:	parman item instance
+ *
+ * Note: all locking must be provided by the caller.
+ */
+void parman_item_remove(struct parman *parman, struct parman_prio *prio,
+			struct parman_item *item)
+{
+	parman->algo->item_remove(parman, prio, item);
+}
+EXPORT_SYMBOL(parman_item_remove);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Priority-based array manager");
diff --git a/lib/parser.c b/lib/parser.c
index b6d11631231b..3278958b472a 100644
--- a/lib/parser.c
+++ b/lib/parser.c
@@ -152,6 +152,36 @@ static int match_number(substring_t *s, int *result, int base)
 }
 
 /**
+ * match_u64int: scan a number in the given base from a substring_t
+ * @s: substring to be scanned
+ * @result: resulting u64 on success
+ * @base: base to use when converting string
+ *
+ * Description: Given a &substring_t and a base, attempts to parse the substring
+ * as a number in that base. On success, sets @result to the integer represented
+ * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ */
+static int match_u64int(substring_t *s, u64 *result, int base)
+{
+	char *buf;
+	int ret;
+	u64 val;
+	size_t len = s->to - s->from;
+
+	buf = kmalloc(len + 1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	memcpy(buf, s->from, len);
+	buf[len] = '\0';
+
+	ret = kstrtoull(buf, base, &val);
+	if (!ret)
+		*result = val;
+	kfree(buf);
+	return ret;
+}
+
+/**
  * match_int: - scan a decimal representation of an integer from a substring_t
  * @s: substring_t to be scanned
  * @result: resulting integer on success
@@ -167,6 +197,23 @@ int match_int(substring_t *s, int *result)
 EXPORT_SYMBOL(match_int);
 
 /**
+ * match_u64: - scan a decimal representation of a u64 from
+ *                  a substring_t
+ * @s: substring_t to be scanned
+ * @result: resulting unsigned long long on success
+ *
+ * Description: Attempts to parse the &substring_t @s as a long decimal
+ * integer. On success, sets @result to the integer represented by the
+ * string and returns 0.
+ * Returns -ENOMEM, -EINVAL, or -ERANGE on failure.
+ */
+int match_u64(substring_t *s, u64 *result)
+{
+	return match_u64int(s, result, 0);
+}
+EXPORT_SYMBOL(match_u64);
+
+/**
  * match_octal: - scan an octal representation of an integer from a substring_t
  * @s: substring_t to be scanned
  * @result: resulting integer on success
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 27fe74948882..9ac959ef4cae 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -33,6 +33,7 @@
 
 #define PERCPU_COUNT_BIAS	(1LU << (BITS_PER_LONG - 1))
 
+static DEFINE_SPINLOCK(percpu_ref_switch_lock);
 static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);
 
 static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
@@ -82,6 +83,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
 	atomic_long_set(&ref->count, start_count);
 
 	ref->release = release;
+	ref->confirm_switch = NULL;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(percpu_ref_init);
@@ -101,6 +103,8 @@ void percpu_ref_exit(struct percpu_ref *ref)
 	unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
 
 	if (percpu_count) {
+		/* non-NULL confirm_switch indicates switching in progress */
+		WARN_ON_ONCE(ref->confirm_switch);
 		free_percpu(percpu_count);
 		ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD;
 	}
@@ -161,66 +165,23 @@ static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref)
 static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
 					  percpu_ref_func_t *confirm_switch)
 {
-	if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) {
-		/* switching from percpu to atomic */
-		ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
-
-		/*
-		 * Non-NULL ->confirm_switch is used to indicate that
-		 * switching is in progress.  Use noop one if unspecified.
-		 */
-		WARN_ON_ONCE(ref->confirm_switch);
-		ref->confirm_switch =
-			confirm_switch ?: percpu_ref_noop_confirm_switch;
-
-		percpu_ref_get(ref);	/* put after confirmation */
-		call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu);
-	} else if (confirm_switch) {
-		/*
-		 * Somebody already set ATOMIC.  Switching may still be in
-		 * progress.  @confirm_switch must be invoked after the
-		 * switching is complete and a full sched RCU grace period
-		 * has passed.  Wait synchronously for the previous
-		 * switching and schedule @confirm_switch invocation.
-		 */
-		wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
-		ref->confirm_switch = confirm_switch;
-
-		percpu_ref_get(ref);	/* put after confirmation */
-		call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu);
+	if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) {
+		if (confirm_switch)
+			confirm_switch(ref);
+		return;
 	}
-}
 
-/**
- * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
- * @ref: percpu_ref to switch to atomic mode
- * @confirm_switch: optional confirmation callback
- *
- * There's no reason to use this function for the usual reference counting.
- * Use percpu_ref_kill[_and_confirm]().
- *
- * Schedule switching of @ref to atomic mode.  All its percpu counts will
- * be collected to the main atomic counter.  On completion, when all CPUs
- * are guaraneed to be in atomic mode, @confirm_switch, which may not
- * block, is invoked.  This function may be invoked concurrently with all
- * the get/put operations and can safely be mixed with kill and reinit
- * operations.  Note that @ref will stay in atomic mode across kill/reinit
- * cycles until percpu_ref_switch_to_percpu() is called.
- *
- * This function normally doesn't block and can be called from any context
- * but it may block if @confirm_kill is specified and @ref is already in
- * the process of switching to atomic mode.  In such cases, @confirm_switch
- * will be invoked after the switching is complete.
- *
- * Due to the way percpu_ref is implemented, @confirm_switch will be called
- * after at least one full sched RCU grace period has passed but this is an
- * implementation detail and must not be depended upon.
- */
-void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
-				 percpu_ref_func_t *confirm_switch)
-{
-	ref->force_atomic = true;
-	__percpu_ref_switch_to_atomic(ref, confirm_switch);
+	/* switching from percpu to atomic */
+	ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
+
+	/*
+	 * Non-NULL ->confirm_switch is used to indicate that switching is
+	 * in progress.  Use noop one if unspecified.
+	 */
+	ref->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch;
+
+	percpu_ref_get(ref);	/* put after confirmation */
+	call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu);
 }
 
 static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
@@ -233,8 +194,6 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
 	if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
 		return;
 
-	wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
-
 	atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);
 
 	/*
@@ -250,6 +209,58 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
 			  ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
 }
 
+static void __percpu_ref_switch_mode(struct percpu_ref *ref,
+				     percpu_ref_func_t *confirm_switch)
+{
+	lockdep_assert_held(&percpu_ref_switch_lock);
+
+	/*
+	 * If the previous ATOMIC switching hasn't finished yet, wait for
+	 * its completion.  If the caller ensures that ATOMIC switching
+	 * isn't in progress, this function can be called from any context.
+	 */
+	wait_event_lock_irq(percpu_ref_switch_waitq, !ref->confirm_switch,
+			    percpu_ref_switch_lock);
+
+	if (ref->force_atomic || (ref->percpu_count_ptr & __PERCPU_REF_DEAD))
+		__percpu_ref_switch_to_atomic(ref, confirm_switch);
+	else
+		__percpu_ref_switch_to_percpu(ref);
+}
+
+/**
+ * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
+ * @ref: percpu_ref to switch to atomic mode
+ * @confirm_switch: optional confirmation callback
+ *
+ * There's no reason to use this function for the usual reference counting.
+ * Use percpu_ref_kill[_and_confirm]().
+ *
+ * Schedule switching of @ref to atomic mode.  All its percpu counts will
+ * be collected to the main atomic counter.  On completion, when all CPUs
+ * are guaraneed to be in atomic mode, @confirm_switch, which may not
+ * block, is invoked.  This function may be invoked concurrently with all
+ * the get/put operations and can safely be mixed with kill and reinit
+ * operations.  Note that @ref will stay in atomic mode across kill/reinit
+ * cycles until percpu_ref_switch_to_percpu() is called.
+ *
+ * This function may block if @ref is in the process of switching to atomic
+ * mode.  If the caller ensures that @ref is not in the process of
+ * switching to atomic mode, this function can be called from any context.
+ */
+void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
+				 percpu_ref_func_t *confirm_switch)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
+	ref->force_atomic = true;
+	__percpu_ref_switch_mode(ref, confirm_switch);
+
+	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
+}
+
 /**
  * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
  * @ref: percpu_ref to switch to percpu mode
@@ -264,17 +275,20 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
  * dying or dead, the actual switching takes place on the following
  * percpu_ref_reinit().
  *
- * This function normally doesn't block and can be called from any context
- * but it may block if @ref is in the process of switching to atomic mode
- * by percpu_ref_switch_atomic().
+ * This function may block if @ref is in the process of switching to atomic
+ * mode.  If the caller ensures that @ref is not in the process of
+ * switching to atomic mode, this function can be called from any context.
  */
 void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
 	ref->force_atomic = false;
+	__percpu_ref_switch_mode(ref, NULL);
 
-	/* a dying or dead ref can't be switched to percpu mode w/o reinit */
-	if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD))
-		__percpu_ref_switch_to_percpu(ref);
+	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
 }
 
 /**
@@ -290,21 +304,23 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
  *
  * This function normally doesn't block and can be called from any context
  * but it may block if @confirm_kill is specified and @ref is in the
- * process of switching to atomic mode by percpu_ref_switch_atomic().
- *
- * Due to the way percpu_ref is implemented, @confirm_switch will be called
- * after at least one full sched RCU grace period has passed but this is an
- * implementation detail and must not be depended upon.
+ * process of switching to atomic mode by percpu_ref_switch_to_atomic().
  */
 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 				 percpu_ref_func_t *confirm_kill)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
 	WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD,
 		  "%s called more than once on %pf!", __func__, ref->release);
 
 	ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
-	__percpu_ref_switch_to_atomic(ref, confirm_kill);
+	__percpu_ref_switch_mode(ref, confirm_kill);
 	percpu_ref_put(ref);
+
+	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
 }
 EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
 
@@ -321,11 +337,16 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
  */
 void percpu_ref_reinit(struct percpu_ref *ref)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&percpu_ref_switch_lock, flags);
+
 	WARN_ON_ONCE(!percpu_ref_is_zero(ref));
 
 	ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD;
 	percpu_ref_get(ref);
-	if (!ref->force_atomic)
-		__percpu_ref_switch_to_percpu(ref);
+	__percpu_ref_switch_mode(ref, NULL);
+
+	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
 }
 EXPORT_SYMBOL_GPL(percpu_ref_reinit);
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 72d36113ccaa..c8cebb137076 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -158,25 +158,21 @@ EXPORT_SYMBOL(percpu_counter_destroy);
 int percpu_counter_batch __read_mostly = 32;
 EXPORT_SYMBOL(percpu_counter_batch);
 
-static void compute_batch_value(void)
+static int compute_batch_value(unsigned int cpu)
 {
 	int nr = num_online_cpus();
 
 	percpu_counter_batch = max(32, nr*2);
+	return 0;
 }
 
-static int percpu_counter_hotcpu_callback(struct notifier_block *nb,
-					unsigned long action, void *hcpu)
+static int percpu_counter_cpu_dead(unsigned int cpu)
 {
 #ifdef CONFIG_HOTPLUG_CPU
-	unsigned int cpu;
 	struct percpu_counter *fbc;
 
-	compute_batch_value();
-	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
-		return NOTIFY_OK;
+	compute_batch_value(cpu);
 
-	cpu = (unsigned long)hcpu;
 	spin_lock_irq(&percpu_counters_lock);
 	list_for_each_entry(fbc, &percpu_counters, list) {
 		s32 *pcount;
@@ -190,7 +186,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb,
 	}
 	spin_unlock_irq(&percpu_counters_lock);
 #endif
-	return NOTIFY_OK;
+	return 0;
 }
 
 /*
@@ -222,8 +218,15 @@ EXPORT_SYMBOL(__percpu_counter_compare);
 
 static int __init percpu_counter_startup(void)
 {
-	compute_batch_value();
-	hotcpu_notifier(percpu_counter_hotcpu_callback, 0);
+	int ret;
+
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online",
+				compute_batch_value, NULL);
+	WARN_ON(ret < 0);
+	ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD,
+					"lib/percpu_cnt:dead", NULL,
+					percpu_counter_cpu_dead);
+	WARN_ON(ret < 0);
 	return 0;
 }
 module_init(percpu_counter_startup);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 91f0727e3cad..84812a9fb16f 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -22,6 +22,7 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -30,7 +31,6 @@
 #include <linux/percpu.h>
 #include <linux/slab.h>
 #include <linux/kmemleak.h>
-#include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/string.h>
 #include <linux/bitops.h>
@@ -69,6 +69,11 @@ struct radix_tree_preload {
 };
 static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
 
+static inline struct radix_tree_node *entry_to_node(void *ptr)
+{
+	return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
+}
+
 static inline void *node_to_entry(void *ptr)
 {
 	return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
@@ -191,13 +196,12 @@ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
  * Returns next bit offset, or size if nothing found.
  */
 static __always_inline unsigned long
-radix_tree_find_next_bit(const unsigned long *addr,
-			 unsigned long size, unsigned long offset)
+radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag,
+			 unsigned long offset)
 {
-	if (!__builtin_constant_p(size))
-		return find_next_bit(addr, size, offset);
+	const unsigned long *addr = node->tags[tag];
 
-	if (offset < size) {
+	if (offset < RADIX_TREE_MAP_SIZE) {
 		unsigned long tmp;
 
 		addr += offset / BITS_PER_LONG;
@@ -205,14 +209,32 @@ radix_tree_find_next_bit(const unsigned long *addr,
 		if (tmp)
 			return __ffs(tmp) + offset;
 		offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
-		while (offset < size) {
+		while (offset < RADIX_TREE_MAP_SIZE) {
 			tmp = *++addr;
 			if (tmp)
 				return __ffs(tmp) + offset;
 			offset += BITS_PER_LONG;
 		}
 	}
-	return size;
+	return RADIX_TREE_MAP_SIZE;
+}
+
+static unsigned int iter_offset(const struct radix_tree_iter *iter)
+{
+	return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK;
+}
+
+/*
+ * The maximum index which can be stored in a radix tree
+ */
+static inline unsigned long shift_maxindex(unsigned int shift)
+{
+	return (RADIX_TREE_MAP_SIZE << shift) - 1;
+}
+
+static inline unsigned long node_maxindex(struct radix_tree_node *node)
+{
+	return shift_maxindex(node->shift);
 }
 
 #ifndef __KERNEL__
@@ -220,10 +242,11 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
 {
 	unsigned long i;
 
-	pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n",
-		node, node->offset,
+	pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n",
+		node, node->offset, index, index | node_maxindex(node),
+		node->parent,
 		node->tags[0][0], node->tags[1][0], node->tags[2][0],
-		node->shift, node->count, node->parent);
+		node->shift, node->count, node->exceptional);
 
 	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
 		unsigned long first = index | (i << node->shift);
@@ -231,14 +254,16 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
 		void *entry = node->slots[i];
 		if (!entry)
 			continue;
-		if (is_sibling_entry(node, entry)) {
-			pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n",
-					entry, i,
-					*(void **)entry_to_node(entry),
-					first, last);
+		if (entry == RADIX_TREE_RETRY) {
+			pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n",
+					i, first, last, node);
 		} else if (!radix_tree_is_internal_node(entry)) {
-			pr_debug("radix entry %p offset %ld indices %ld-%ld\n",
-					entry, i, first, last);
+			pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n",
+					entry, i, first, last, node);
+		} else if (is_sibling_entry(node, entry)) {
+			pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n",
+					entry, i, first, last, node,
+					*(void **)entry_to_node(entry));
 		} else {
 			dump_node(entry_to_node(entry), first);
 		}
@@ -262,7 +287,10 @@ static void radix_tree_dump(struct radix_tree_root *root)
  * that the caller has pinned this thread of control to the current CPU.
  */
 static struct radix_tree_node *
-radix_tree_node_alloc(struct radix_tree_root *root)
+radix_tree_node_alloc(struct radix_tree_root *root,
+			struct radix_tree_node *parent,
+			unsigned int shift, unsigned int offset,
+			unsigned int count, unsigned int exceptional)
 {
 	struct radix_tree_node *ret = NULL;
 	gfp_t gfp_mask = root_gfp_mask(root);
@@ -307,6 +335,13 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 	ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
 out:
 	BUG_ON(radix_tree_is_internal_node(ret));
+	if (ret) {
+		ret->parent = parent;
+		ret->shift = shift;
+		ret->offset = offset;
+		ret->count = count;
+		ret->exceptional = exceptional;
+	}
 	return ret;
 }
 
@@ -314,18 +349,15 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
 {
 	struct radix_tree_node *node =
 			container_of(head, struct radix_tree_node, rcu_head);
-	int i;
 
 	/*
-	 * must only free zeroed nodes into the slab. radix_tree_shrink
-	 * can leave us with a non-NULL entry in the first slot, so clear
-	 * that here to make sure.
+	 * Must only free zeroed nodes into the slab.  We can be left with
+	 * non-NULL entries by radix_tree_free_nodes, so clear the entries
+	 * and tags here.
 	 */
-	for (i = 0; i < RADIX_TREE_MAX_TAGS; i++)
-		tag_clear(node, i, 0);
-
-	node->slots[0] = NULL;
-	node->count = 0;
+	memset(node->slots, 0, sizeof(node->slots));
+	memset(node->tags, 0, sizeof(node->tags));
+	INIT_LIST_HEAD(&node->private_list);
 
 	kmem_cache_free(radix_tree_node_cachep, node);
 }
@@ -345,7 +377,7 @@ radix_tree_node_free(struct radix_tree_node *node)
  * To make use of this facility, the radix tree must be initialised without
  * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
  */
-static int __radix_tree_preload(gfp_t gfp_mask, int nr)
+static int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
 {
 	struct radix_tree_preload *rtp;
 	struct radix_tree_node *node;
@@ -411,6 +443,28 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(radix_tree_maybe_preload);
 
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+/*
+ * Preload with enough objects to ensure that we can split a single entry
+ * of order @old_order into many entries of size @new_order
+ */
+int radix_tree_split_preload(unsigned int old_order, unsigned int new_order,
+							gfp_t gfp_mask)
+{
+	unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT);
+	unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) -
+				(new_order / RADIX_TREE_MAP_SHIFT);
+	unsigned nr = 0;
+
+	WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
+	BUG_ON(new_order >= old_order);
+
+	while (layers--)
+		nr = nr * RADIX_TREE_MAP_SIZE + 1;
+	return __radix_tree_preload(gfp_mask, top * nr);
+}
+#endif
+
 /*
  * The same as function above, but preload number of nodes required to insert
  * (1 << order) continuous naturally-aligned elements.
@@ -456,19 +510,6 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
 	return __radix_tree_preload(gfp_mask, nr_nodes);
 }
 
-/*
- * The maximum index which can be stored in a radix tree
- */
-static inline unsigned long shift_maxindex(unsigned int shift)
-{
-	return (RADIX_TREE_MAP_SIZE << shift) - 1;
-}
-
-static inline unsigned long node_maxindex(struct radix_tree_node *node)
-{
-	return shift_maxindex(node->shift);
-}
-
 static unsigned radix_tree_load_root(struct radix_tree_root *root,
 		struct radix_tree_node **nodep, unsigned long *maxindex)
 {
@@ -506,8 +547,8 @@ static int radix_tree_extend(struct radix_tree_root *root,
 		goto out;
 
 	do {
-		struct radix_tree_node *node = radix_tree_node_alloc(root);
-
+		struct radix_tree_node *node = radix_tree_node_alloc(root,
+							NULL, shift, 0, 1, 0);
 		if (!node)
 			return -ENOMEM;
 
@@ -518,12 +559,12 @@ static int radix_tree_extend(struct radix_tree_root *root,
 		}
 
 		BUG_ON(shift > BITS_PER_LONG);
-		node->shift = shift;
-		node->offset = 0;
-		node->count = 1;
-		node->parent = NULL;
-		if (radix_tree_is_internal_node(slot))
+		if (radix_tree_is_internal_node(slot)) {
 			entry_to_node(slot)->parent = node;
+		} else if (radix_tree_exceptional_entry(slot)) {
+			/* Moving an exceptional root->rnode to a node */
+			node->exceptional = 1;
+		}
 		node->slots[0] = slot;
 		slot = node_to_entry(node);
 		rcu_assign_pointer(root->rnode, slot);
@@ -534,6 +575,106 @@ out:
 }
 
 /**
+ *	radix_tree_shrink    -    shrink radix tree to minimum height
+ *	@root		radix tree root
+ */
+static inline void radix_tree_shrink(struct radix_tree_root *root,
+				     radix_tree_update_node_t update_node,
+				     void *private)
+{
+	for (;;) {
+		struct radix_tree_node *node = root->rnode;
+		struct radix_tree_node *child;
+
+		if (!radix_tree_is_internal_node(node))
+			break;
+		node = entry_to_node(node);
+
+		/*
+		 * The candidate node has more than one child, or its child
+		 * is not at the leftmost slot, or the child is a multiorder
+		 * entry, we cannot shrink.
+		 */
+		if (node->count != 1)
+			break;
+		child = node->slots[0];
+		if (!child)
+			break;
+		if (!radix_tree_is_internal_node(child) && node->shift)
+			break;
+
+		if (radix_tree_is_internal_node(child))
+			entry_to_node(child)->parent = NULL;
+
+		/*
+		 * We don't need rcu_assign_pointer(), since we are simply
+		 * moving the node from one part of the tree to another: if it
+		 * was safe to dereference the old pointer to it
+		 * (node->slots[0]), it will be safe to dereference the new
+		 * one (root->rnode) as far as dependent read barriers go.
+		 */
+		root->rnode = child;
+
+		/*
+		 * We have a dilemma here. The node's slot[0] must not be
+		 * NULLed in case there are concurrent lookups expecting to
+		 * find the item. However if this was a bottom-level node,
+		 * then it may be subject to the slot pointer being visible
+		 * to callers dereferencing it. If item corresponding to
+		 * slot[0] is subsequently deleted, these callers would expect
+		 * their slot to become empty sooner or later.
+		 *
+		 * For example, lockless pagecache will look up a slot, deref
+		 * the page pointer, and if the page has 0 refcount it means it
+		 * was concurrently deleted from pagecache so try the deref
+		 * again. Fortunately there is already a requirement for logic
+		 * to retry the entire slot lookup -- the indirect pointer
+		 * problem (replacing direct root node with an indirect pointer
+		 * also results in a stale slot). So tag the slot as indirect
+		 * to force callers to retry.
+		 */
+		node->count = 0;
+		if (!radix_tree_is_internal_node(child)) {
+			node->slots[0] = RADIX_TREE_RETRY;
+			if (update_node)
+				update_node(node, private);
+		}
+
+		WARN_ON_ONCE(!list_empty(&node->private_list));
+		radix_tree_node_free(node);
+	}
+}
+
+static void delete_node(struct radix_tree_root *root,
+			struct radix_tree_node *node,
+			radix_tree_update_node_t update_node, void *private)
+{
+	do {
+		struct radix_tree_node *parent;
+
+		if (node->count) {
+			if (node == entry_to_node(root->rnode))
+				radix_tree_shrink(root, update_node, private);
+			return;
+		}
+
+		parent = node->parent;
+		if (parent) {
+			parent->slots[node->offset] = NULL;
+			parent->count--;
+		} else {
+			root_tag_clear_all(root);
+			root->rnode = NULL;
+		}
+
+		WARN_ON_ONCE(!list_empty(&node->private_list));
+		radix_tree_node_free(node);
+
+		node = parent;
+	} while (node);
+}
+
+/**
  *	__radix_tree_create	-	create a slot in a radix tree
  *	@root:		radix tree root
  *	@index:		index key
@@ -563,26 +704,24 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 	shift = radix_tree_load_root(root, &child, &maxindex);
 
 	/* Make sure the tree is high enough.  */
+	if (order > 0 && max == ((1UL << order) - 1))
+		max++;
 	if (max > maxindex) {
 		int error = radix_tree_extend(root, max, shift);
 		if (error < 0)
 			return error;
 		shift = error;
 		child = root->rnode;
-		if (order == shift)
-			shift += RADIX_TREE_MAP_SHIFT;
 	}
 
 	while (shift > order) {
 		shift -= RADIX_TREE_MAP_SHIFT;
 		if (child == NULL) {
 			/* Have to add a child node.  */
-			child = radix_tree_node_alloc(root);
+			child = radix_tree_node_alloc(root, node, shift,
+							offset, 0, 0);
 			if (!child)
 				return -ENOMEM;
-			child->shift = shift;
-			child->offset = offset;
-			child->parent = node;
 			rcu_assign_pointer(*slot, node_to_entry(child));
 			if (node)
 				node->count++;
@@ -595,31 +734,126 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 		slot = &node->slots[offset];
 	}
 
+	if (nodep)
+		*nodep = node;
+	if (slotp)
+		*slotp = slot;
+	return 0;
+}
+
 #ifdef CONFIG_RADIX_TREE_MULTIORDER
-	/* Insert pointers to the canonical entry */
-	if (order > shift) {
-		unsigned i, n = 1 << (order - shift);
+/*
+ * Free any nodes below this node.  The tree is presumed to not need
+ * shrinking, and any user data in the tree is presumed to not need a
+ * destructor called on it.  If we need to add a destructor, we can
+ * add that functionality later.  Note that we may not clear tags or
+ * slots from the tree as an RCU walker may still have a pointer into
+ * this subtree.  We could replace the entries with RADIX_TREE_RETRY,
+ * but we'll still have to clear those in rcu_free.
+ */
+static void radix_tree_free_nodes(struct radix_tree_node *node)
+{
+	unsigned offset = 0;
+	struct radix_tree_node *child = entry_to_node(node);
+
+	for (;;) {
+		void *entry = child->slots[offset];
+		if (radix_tree_is_internal_node(entry) &&
+					!is_sibling_entry(child, entry)) {
+			child = entry_to_node(entry);
+			offset = 0;
+			continue;
+		}
+		offset++;
+		while (offset == RADIX_TREE_MAP_SIZE) {
+			struct radix_tree_node *old = child;
+			offset = child->offset + 1;
+			child = child->parent;
+			WARN_ON_ONCE(!list_empty(&old->private_list));
+			radix_tree_node_free(old);
+			if (old == entry_to_node(node))
+				return;
+		}
+	}
+}
+
+static inline int insert_entries(struct radix_tree_node *node, void **slot,
+				void *item, unsigned order, bool replace)
+{
+	struct radix_tree_node *child;
+	unsigned i, n, tag, offset, tags = 0;
+
+	if (node) {
+		if (order > node->shift)
+			n = 1 << (order - node->shift);
+		else
+			n = 1;
+		offset = get_slot_offset(node, slot);
+	} else {
+		n = 1;
+		offset = 0;
+	}
+
+	if (n > 1) {
 		offset = offset & ~(n - 1);
 		slot = &node->slots[offset];
-		child = node_to_entry(slot);
-		for (i = 0; i < n; i++) {
-			if (slot[i])
+	}
+	child = node_to_entry(slot);
+
+	for (i = 0; i < n; i++) {
+		if (slot[i]) {
+			if (replace) {
+				node->count--;
+				for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+					if (tag_get(node, tag, offset + i))
+						tags |= 1 << tag;
+			} else
 				return -EEXIST;
 		}
+	}
 
-		for (i = 1; i < n; i++) {
+	for (i = 0; i < n; i++) {
+		struct radix_tree_node *old = slot[i];
+		if (i) {
 			rcu_assign_pointer(slot[i], child);
-			node->count++;
+			for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+				if (tags & (1 << tag))
+					tag_clear(node, tag, offset + i);
+		} else {
+			rcu_assign_pointer(slot[i], item);
+			for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+				if (tags & (1 << tag))
+					tag_set(node, tag, offset);
 		}
+		if (radix_tree_is_internal_node(old) &&
+					!is_sibling_entry(node, old) &&
+					(old != RADIX_TREE_RETRY))
+			radix_tree_free_nodes(old);
+		if (radix_tree_exceptional_entry(old))
+			node->exceptional--;
 	}
-#endif
-
-	if (nodep)
-		*nodep = node;
-	if (slotp)
-		*slotp = slot;
-	return 0;
+	if (node) {
+		node->count += n;
+		if (radix_tree_exceptional_entry(item))
+			node->exceptional += n;
+	}
+	return n;
 }
+#else
+static inline int insert_entries(struct radix_tree_node *node, void **slot,
+				void *item, unsigned order, bool replace)
+{
+	if (*slot)
+		return -EEXIST;
+	rcu_assign_pointer(*slot, item);
+	if (node) {
+		node->count++;
+		if (radix_tree_exceptional_entry(item))
+			node->exceptional++;
+	}
+	return 1;
+}
+#endif
 
 /**
  *	__radix_tree_insert    -    insert into a radix tree
@@ -642,13 +876,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
 	error = __radix_tree_create(root, index, order, &node, &slot);
 	if (error)
 		return error;
-	if (*slot != NULL)
-		return -EEXIST;
-	rcu_assign_pointer(*slot, item);
+
+	error = insert_entries(node, slot, item, order, false);
+	if (error < 0)
+		return error;
 
 	if (node) {
 		unsigned offset = get_slot_offset(node, slot);
-		node->count++;
 		BUG_ON(tag_get(node, 0, offset));
 		BUG_ON(tag_get(node, 1, offset));
 		BUG_ON(tag_get(node, 2, offset));
@@ -746,6 +980,287 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
 }
 EXPORT_SYMBOL(radix_tree_lookup);
 
+static inline int slot_count(struct radix_tree_node *node,
+						void **slot)
+{
+	int n = 1;
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+	void *ptr = node_to_entry(slot);
+	unsigned offset = get_slot_offset(node, slot);
+	int i;
+
+	for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
+		if (node->slots[offset + i] != ptr)
+			break;
+		n++;
+	}
+#endif
+	return n;
+}
+
+static void replace_slot(struct radix_tree_root *root,
+			 struct radix_tree_node *node,
+			 void **slot, void *item,
+			 bool warn_typeswitch)
+{
+	void *old = rcu_dereference_raw(*slot);
+	int count, exceptional;
+
+	WARN_ON_ONCE(radix_tree_is_internal_node(item));
+
+	count = !!item - !!old;
+	exceptional = !!radix_tree_exceptional_entry(item) -
+		      !!radix_tree_exceptional_entry(old);
+
+	WARN_ON_ONCE(warn_typeswitch && (count || exceptional));
+
+	if (node) {
+		node->count += count;
+		if (exceptional) {
+			exceptional *= slot_count(node, slot);
+			node->exceptional += exceptional;
+		}
+	}
+
+	rcu_assign_pointer(*slot, item);
+}
+
+static inline void delete_sibling_entries(struct radix_tree_node *node,
+						void **slot)
+{
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+	bool exceptional = radix_tree_exceptional_entry(*slot);
+	void *ptr = node_to_entry(slot);
+	unsigned offset = get_slot_offset(node, slot);
+	int i;
+
+	for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
+		if (node->slots[offset + i] != ptr)
+			break;
+		node->slots[offset + i] = NULL;
+		node->count--;
+		if (exceptional)
+			node->exceptional--;
+	}
+#endif
+}
+
+/**
+ * __radix_tree_replace		- replace item in a slot
+ * @root:		radix tree root
+ * @node:		pointer to tree node
+ * @slot:		pointer to slot in @node
+ * @item:		new item to store in the slot.
+ * @update_node:	callback for changing leaf nodes
+ * @private:		private data to pass to @update_node
+ *
+ * For use with __radix_tree_lookup().  Caller must hold tree write locked
+ * across slot lookup and replacement.
+ */
+void __radix_tree_replace(struct radix_tree_root *root,
+			  struct radix_tree_node *node,
+			  void **slot, void *item,
+			  radix_tree_update_node_t update_node, void *private)
+{
+	if (!item)
+		delete_sibling_entries(node, slot);
+	/*
+	 * This function supports replacing exceptional entries and
+	 * deleting entries, but that needs accounting against the
+	 * node unless the slot is root->rnode.
+	 */
+	replace_slot(root, node, slot, item,
+		     !node && slot != (void **)&root->rnode);
+
+	if (!node)
+		return;
+
+	if (update_node)
+		update_node(node, private);
+
+	delete_node(root, node, update_node, private);
+}
+
+/**
+ * radix_tree_replace_slot	- replace item in a slot
+ * @root:	radix tree root
+ * @slot:	pointer to slot
+ * @item:	new item to store in the slot.
+ *
+ * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(),
+ * radix_tree_gang_lookup_tag_slot().  Caller must hold tree write locked
+ * across slot lookup and replacement.
+ *
+ * NOTE: This cannot be used to switch between non-entries (empty slots),
+ * regular entries, and exceptional entries, as that requires accounting
+ * inside the radix tree node. When switching from one type of entry or
+ * deleting, use __radix_tree_lookup() and __radix_tree_replace() or
+ * radix_tree_iter_replace().
+ */
+void radix_tree_replace_slot(struct radix_tree_root *root,
+			     void **slot, void *item)
+{
+	replace_slot(root, NULL, slot, item, true);
+}
+
+/**
+ * radix_tree_iter_replace - replace item in a slot
+ * @root:	radix tree root
+ * @slot:	pointer to slot
+ * @item:	new item to store in the slot.
+ *
+ * For use with radix_tree_split() and radix_tree_for_each_slot().
+ * Caller must hold tree write locked across split and replacement.
+ */
+void radix_tree_iter_replace(struct radix_tree_root *root,
+		const struct radix_tree_iter *iter, void **slot, void *item)
+{
+	__radix_tree_replace(root, iter->node, slot, item, NULL, NULL);
+}
+
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+/**
+ * radix_tree_join - replace multiple entries with one multiorder entry
+ * @root: radix tree root
+ * @index: an index inside the new entry
+ * @order: order of the new entry
+ * @item: new entry
+ *
+ * Call this function to replace several entries with one larger entry.
+ * The existing entries are presumed to not need freeing as a result of
+ * this call.
+ *
+ * The replacement entry will have all the tags set on it that were set
+ * on any of the entries it is replacing.
+ */
+int radix_tree_join(struct radix_tree_root *root, unsigned long index,
+			unsigned order, void *item)
+{
+	struct radix_tree_node *node;
+	void **slot;
+	int error;
+
+	BUG_ON(radix_tree_is_internal_node(item));
+
+	error = __radix_tree_create(root, index, order, &node, &slot);
+	if (!error)
+		error = insert_entries(node, slot, item, order, true);
+	if (error > 0)
+		error = 0;
+
+	return error;
+}
+
+/**
+ * radix_tree_split - Split an entry into smaller entries
+ * @root: radix tree root
+ * @index: An index within the large entry
+ * @order: Order of new entries
+ *
+ * Call this function as the first step in replacing a multiorder entry
+ * with several entries of lower order.  After this function returns,
+ * loop over the relevant portion of the tree using radix_tree_for_each_slot()
+ * and call radix_tree_iter_replace() to set up each new entry.
+ *
+ * The tags from this entry are replicated to all the new entries.
+ *
+ * The radix tree should be locked against modification during the entire
+ * replacement operation.  Lock-free lookups will see RADIX_TREE_RETRY which
+ * should prompt RCU walkers to restart the lookup from the root.
+ */
+int radix_tree_split(struct radix_tree_root *root, unsigned long index,
+				unsigned order)
+{
+	struct radix_tree_node *parent, *node, *child;
+	void **slot;
+	unsigned int offset, end;
+	unsigned n, tag, tags = 0;
+
+	if (!__radix_tree_lookup(root, index, &parent, &slot))
+		return -ENOENT;
+	if (!parent)
+		return -ENOENT;
+
+	offset = get_slot_offset(parent, slot);
+
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+		if (tag_get(parent, tag, offset))
+			tags |= 1 << tag;
+
+	for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) {
+		if (!is_sibling_entry(parent, parent->slots[end]))
+			break;
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+			if (tags & (1 << tag))
+				tag_set(parent, tag, end);
+		/* rcu_assign_pointer ensures tags are set before RETRY */
+		rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY);
+	}
+	rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY);
+	parent->exceptional -= (end - offset);
+
+	if (order == parent->shift)
+		return 0;
+	if (order > parent->shift) {
+		while (offset < end)
+			offset += insert_entries(parent, &parent->slots[offset],
+					RADIX_TREE_RETRY, order, true);
+		return 0;
+	}
+
+	node = parent;
+
+	for (;;) {
+		if (node->shift > order) {
+			child = radix_tree_node_alloc(root, node,
+					node->shift - RADIX_TREE_MAP_SHIFT,
+					offset, 0, 0);
+			if (!child)
+				goto nomem;
+			if (node != parent) {
+				node->count++;
+				node->slots[offset] = node_to_entry(child);
+				for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+					if (tags & (1 << tag))
+						tag_set(node, tag, offset);
+			}
+
+			node = child;
+			offset = 0;
+			continue;
+		}
+
+		n = insert_entries(node, &node->slots[offset],
+					RADIX_TREE_RETRY, order, false);
+		BUG_ON(n > RADIX_TREE_MAP_SIZE);
+
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+			if (tags & (1 << tag))
+				tag_set(node, tag, offset);
+		offset += n;
+
+		while (offset == RADIX_TREE_MAP_SIZE) {
+			if (node == parent)
+				break;
+			offset = node->offset;
+			child = node;
+			node = node->parent;
+			rcu_assign_pointer(node->slots[offset],
+						node_to_entry(child));
+			offset++;
+		}
+		if ((node == parent) && (offset == end))
+			return 0;
+	}
+
+ nomem:
+	/* Shouldn't happen; did user forget to preload? */
+	/* TODO: free all the allocated nodes */
+	WARN_ON(1);
+	return -ENOMEM;
+}
+#endif
+
 /**
  *	radix_tree_tag_set - set a tag on a radix tree node
  *	@root:		radix tree root
@@ -807,6 +1322,34 @@ static void node_tag_clear(struct radix_tree_root *root,
 		root_tag_clear(root, tag);
 }
 
+static void node_tag_set(struct radix_tree_root *root,
+				struct radix_tree_node *node,
+				unsigned int tag, unsigned int offset)
+{
+	while (node) {
+		if (tag_get(node, tag, offset))
+			return;
+		tag_set(node, tag, offset);
+		offset = node->offset;
+		node = node->parent;
+	}
+
+	if (!root_tag_get(root, tag))
+		root_tag_set(root, tag);
+}
+
+/**
+ * radix_tree_iter_tag_set - set a tag on the current iterator entry
+ * @root:	radix tree root
+ * @iter:	iterator state
+ * @tag:	tag to set
+ */
+void radix_tree_iter_tag_set(struct radix_tree_root *root,
+			const struct radix_tree_iter *iter, unsigned int tag)
+{
+	node_tag_set(root, iter->node, tag, iter_offset(iter));
+}
+
 /**
  *	radix_tree_tag_clear - clear a tag on a radix tree node
  *	@root:		radix tree root
@@ -902,6 +1445,121 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter,
 #endif
 }
 
+/* Construct iter->tags bit-mask from node->tags[tag] array */
+static void set_iter_tags(struct radix_tree_iter *iter,
+				struct radix_tree_node *node, unsigned offset,
+				unsigned tag)
+{
+	unsigned tag_long = offset / BITS_PER_LONG;
+	unsigned tag_bit  = offset % BITS_PER_LONG;
+
+	iter->tags = node->tags[tag][tag_long] >> tag_bit;
+
+	/* This never happens if RADIX_TREE_TAG_LONGS == 1 */
+	if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
+		/* Pick tags from next element */
+		if (tag_bit)
+			iter->tags |= node->tags[tag][tag_long + 1] <<
+						(BITS_PER_LONG - tag_bit);
+		/* Clip chunk size, here only BITS_PER_LONG tags */
+		iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG);
+	}
+}
+
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+static void **skip_siblings(struct radix_tree_node **nodep,
+			void **slot, struct radix_tree_iter *iter)
+{
+	void *sib = node_to_entry(slot - 1);
+
+	while (iter->index < iter->next_index) {
+		*nodep = rcu_dereference_raw(*slot);
+		if (*nodep && *nodep != sib)
+			return slot;
+		slot++;
+		iter->index = __radix_tree_iter_add(iter, 1);
+		iter->tags >>= 1;
+	}
+
+	*nodep = NULL;
+	return NULL;
+}
+
+void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
+					unsigned flags)
+{
+	unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
+	struct radix_tree_node *node = rcu_dereference_raw(*slot);
+
+	slot = skip_siblings(&node, slot, iter);
+
+	while (radix_tree_is_internal_node(node)) {
+		unsigned offset;
+		unsigned long next_index;
+
+		if (node == RADIX_TREE_RETRY)
+			return slot;
+		node = entry_to_node(node);
+		iter->node = node;
+		iter->shift = node->shift;
+
+		if (flags & RADIX_TREE_ITER_TAGGED) {
+			offset = radix_tree_find_next_bit(node, tag, 0);
+			if (offset == RADIX_TREE_MAP_SIZE)
+				return NULL;
+			slot = &node->slots[offset];
+			iter->index = __radix_tree_iter_add(iter, offset);
+			set_iter_tags(iter, node, offset, tag);
+			node = rcu_dereference_raw(*slot);
+		} else {
+			offset = 0;
+			slot = &node->slots[0];
+			for (;;) {
+				node = rcu_dereference_raw(*slot);
+				if (node)
+					break;
+				slot++;
+				offset++;
+				if (offset == RADIX_TREE_MAP_SIZE)
+					return NULL;
+			}
+			iter->index = __radix_tree_iter_add(iter, offset);
+		}
+		if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0))
+			goto none;
+		next_index = (iter->index | shift_maxindex(iter->shift)) + 1;
+		if (next_index < iter->next_index)
+			iter->next_index = next_index;
+	}
+
+	return slot;
+ none:
+	iter->next_index = 0;
+	return NULL;
+}
+EXPORT_SYMBOL(__radix_tree_next_slot);
+#else
+static void **skip_siblings(struct radix_tree_node **nodep,
+			void **slot, struct radix_tree_iter *iter)
+{
+	return slot;
+}
+#endif
+
+void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter)
+{
+	struct radix_tree_node *node;
+
+	slot++;
+	iter->index = __radix_tree_iter_add(iter, 1);
+	node = rcu_dereference_raw(*slot);
+	skip_siblings(&node, slot, iter);
+	iter->next_index = iter->index;
+	iter->tags = 0;
+	return NULL;
+}
+EXPORT_SYMBOL(radix_tree_iter_resume);
+
 /**
  * radix_tree_next_chunk - find next chunk of slots for iteration
  *
@@ -927,7 +1585,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 	 * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
 	 *
 	 * This condition also used by radix_tree_next_slot() to stop
-	 * contiguous iterating, and forbid swithing to the next chunk.
+	 * contiguous iterating, and forbid switching to the next chunk.
 	 */
 	index = iter->next_index;
 	if (!index && iter->index)
@@ -945,6 +1603,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 		iter->index = index;
 		iter->next_index = maxindex + 1;
 		iter->tags = 1;
+		iter->node = NULL;
 		__set_iter_shift(iter, 0);
 		return (void **)&root->rnode;
 	}
@@ -960,9 +1619,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 				return NULL;
 
 			if (flags & RADIX_TREE_ITER_TAGGED)
-				offset = radix_tree_find_next_bit(
-						node->tags[tag],
-						RADIX_TREE_MAP_SIZE,
+				offset = radix_tree_find_next_bit(node, tag,
 						offset + 1);
 			else
 				while (++offset	< RADIX_TREE_MAP_SIZE) {
@@ -982,154 +1639,26 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
 			child = rcu_dereference_raw(node->slots[offset]);
 		}
 
-		if ((child == NULL) || (child == RADIX_TREE_RETRY))
+		if (!child)
 			goto restart;
+		if (child == RADIX_TREE_RETRY)
+			break;
 	} while (radix_tree_is_internal_node(child));
 
 	/* Update the iterator state */
 	iter->index = (index &~ node_maxindex(node)) | (offset << node->shift);
 	iter->next_index = (index | node_maxindex(node)) + 1;
+	iter->node = node;
 	__set_iter_shift(iter, node->shift);
 
-	/* Construct iter->tags bit-mask from node->tags[tag] array */
-	if (flags & RADIX_TREE_ITER_TAGGED) {
-		unsigned tag_long, tag_bit;
-
-		tag_long = offset / BITS_PER_LONG;
-		tag_bit  = offset % BITS_PER_LONG;
-		iter->tags = node->tags[tag][tag_long] >> tag_bit;
-		/* This never happens if RADIX_TREE_TAG_LONGS == 1 */
-		if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
-			/* Pick tags from next element */
-			if (tag_bit)
-				iter->tags |= node->tags[tag][tag_long + 1] <<
-						(BITS_PER_LONG - tag_bit);
-			/* Clip chunk size, here only BITS_PER_LONG tags */
-			iter->next_index = index + BITS_PER_LONG;
-		}
-	}
+	if (flags & RADIX_TREE_ITER_TAGGED)
+		set_iter_tags(iter, node, offset, tag);
 
 	return node->slots + offset;
 }
 EXPORT_SYMBOL(radix_tree_next_chunk);
 
 /**
- * radix_tree_range_tag_if_tagged - for each item in given range set given
- *				   tag if item has another tag set
- * @root:		radix tree root
- * @first_indexp:	pointer to a starting index of a range to scan
- * @last_index:		last index of a range to scan
- * @nr_to_tag:		maximum number items to tag
- * @iftag:		tag index to test
- * @settag:		tag index to set if tested tag is set
- *
- * This function scans range of radix tree from first_index to last_index
- * (inclusive).  For each item in the range if iftag is set, the function sets
- * also settag. The function stops either after tagging nr_to_tag items or
- * after reaching last_index.
- *
- * The tags must be set from the leaf level only and propagated back up the
- * path to the root. We must do this so that we resolve the full path before
- * setting any tags on intermediate nodes. If we set tags as we descend, then
- * we can get to the leaf node and find that the index that has the iftag
- * set is outside the range we are scanning. This reults in dangling tags and
- * can lead to problems with later tag operations (e.g. livelocks on lookups).
- *
- * The function returns the number of leaves where the tag was set and sets
- * *first_indexp to the first unscanned index.
- * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must
- * be prepared to handle that.
- */
-unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
-		unsigned long *first_indexp, unsigned long last_index,
-		unsigned long nr_to_tag,
-		unsigned int iftag, unsigned int settag)
-{
-	struct radix_tree_node *parent, *node, *child;
-	unsigned long maxindex;
-	unsigned long tagged = 0;
-	unsigned long index = *first_indexp;
-
-	radix_tree_load_root(root, &child, &maxindex);
-	last_index = min(last_index, maxindex);
-	if (index > last_index)
-		return 0;
-	if (!nr_to_tag)
-		return 0;
-	if (!root_tag_get(root, iftag)) {
-		*first_indexp = last_index + 1;
-		return 0;
-	}
-	if (!radix_tree_is_internal_node(child)) {
-		*first_indexp = last_index + 1;
-		root_tag_set(root, settag);
-		return 1;
-	}
-
-	node = entry_to_node(child);
-
-	for (;;) {
-		unsigned offset = radix_tree_descend(node, &child, index);
-		if (!child)
-			goto next;
-		if (!tag_get(node, iftag, offset))
-			goto next;
-		/* Sibling slots never have tags set on them */
-		if (radix_tree_is_internal_node(child)) {
-			node = entry_to_node(child);
-			continue;
-		}
-
-		/* tag the leaf */
-		tagged++;
-		tag_set(node, settag, offset);
-
-		/* walk back up the path tagging interior nodes */
-		parent = node;
-		for (;;) {
-			offset = parent->offset;
-			parent = parent->parent;
-			if (!parent)
-				break;
-			/* stop if we find a node with the tag already set */
-			if (tag_get(parent, settag, offset))
-				break;
-			tag_set(parent, settag, offset);
-		}
- next:
-		/* Go to next entry in node */
-		index = ((index >> node->shift) + 1) << node->shift;
-		/* Overflow can happen when last_index is ~0UL... */
-		if (index > last_index || !index)
-			break;
-		offset = (index >> node->shift) & RADIX_TREE_MAP_MASK;
-		while (offset == 0) {
-			/*
-			 * We've fully scanned this node. Go up. Because
-			 * last_index is guaranteed to be in the tree, what
-			 * we do below cannot wander astray.
-			 */
-			node = node->parent;
-			offset = (index >> node->shift) & RADIX_TREE_MAP_MASK;
-		}
-		if (is_sibling_entry(node, node->slots[offset]))
-			goto next;
-		if (tagged >= nr_to_tag)
-			break;
-	}
-	/*
-	 * We need not to tag the root tag if there is no tag which is set with
-	 * settag within the range from *first_indexp to last_index.
-	 */
-	if (tagged > 0)
-		root_tag_set(root, settag);
-	*first_indexp = index;
-
-	return tagged;
-}
-EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
-
-/**
  *	radix_tree_gang_lookup - perform multiple lookup on a radix tree
  *	@root:		radix tree root
  *	@results:	where the results of the lookup are placed
@@ -1294,229 +1823,23 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
 }
 EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
 
-#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP)
-#include <linux/sched.h> /* for cond_resched() */
-
-struct locate_info {
-	unsigned long found_index;
-	bool stop;
-};
-
-/*
- * This linear search is at present only useful to shmem_unuse_inode().
- */
-static unsigned long __locate(struct radix_tree_node *slot, void *item,
-			      unsigned long index, struct locate_info *info)
-{
-	unsigned long i;
-
-	do {
-		unsigned int shift = slot->shift;
-
-		for (i = (index >> shift) & RADIX_TREE_MAP_MASK;
-		     i < RADIX_TREE_MAP_SIZE;
-		     i++, index += (1UL << shift)) {
-			struct radix_tree_node *node =
-					rcu_dereference_raw(slot->slots[i]);
-			if (node == RADIX_TREE_RETRY)
-				goto out;
-			if (!radix_tree_is_internal_node(node)) {
-				if (node == item) {
-					info->found_index = index;
-					info->stop = true;
-					goto out;
-				}
-				continue;
-			}
-			node = entry_to_node(node);
-			if (is_sibling_entry(slot, node))
-				continue;
-			slot = node;
-			break;
-		}
-	} while (i < RADIX_TREE_MAP_SIZE);
-
-out:
-	if ((index == 0) && (i == RADIX_TREE_MAP_SIZE))
-		info->stop = true;
-	return index;
-}
-
-/**
- *	radix_tree_locate_item - search through radix tree for item
- *	@root:		radix tree root
- *	@item:		item to be found
- *
- *	Returns index where item was found, or -1 if not found.
- *	Caller must hold no lock (since this time-consuming function needs
- *	to be preemptible), and must check afterwards if item is still there.
- */
-unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
-{
-	struct radix_tree_node *node;
-	unsigned long max_index;
-	unsigned long cur_index = 0;
-	struct locate_info info = {
-		.found_index = -1,
-		.stop = false,
-	};
-
-	do {
-		rcu_read_lock();
-		node = rcu_dereference_raw(root->rnode);
-		if (!radix_tree_is_internal_node(node)) {
-			rcu_read_unlock();
-			if (node == item)
-				info.found_index = 0;
-			break;
-		}
-
-		node = entry_to_node(node);
-
-		max_index = node_maxindex(node);
-		if (cur_index > max_index) {
-			rcu_read_unlock();
-			break;
-		}
-
-		cur_index = __locate(node, item, cur_index, &info);
-		rcu_read_unlock();
-		cond_resched();
-	} while (!info.stop && cur_index <= max_index);
-
-	return info.found_index;
-}
-#else
-unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
-{
-	return -1;
-}
-#endif /* CONFIG_SHMEM && CONFIG_SWAP */
-
-/**
- *	radix_tree_shrink    -    shrink radix tree to minimum height
- *	@root		radix tree root
- */
-static inline bool radix_tree_shrink(struct radix_tree_root *root)
-{
-	bool shrunk = false;
-
-	for (;;) {
-		struct radix_tree_node *node = root->rnode;
-		struct radix_tree_node *child;
-
-		if (!radix_tree_is_internal_node(node))
-			break;
-		node = entry_to_node(node);
-
-		/*
-		 * The candidate node has more than one child, or its child
-		 * is not at the leftmost slot, or the child is a multiorder
-		 * entry, we cannot shrink.
-		 */
-		if (node->count != 1)
-			break;
-		child = node->slots[0];
-		if (!child)
-			break;
-		if (!radix_tree_is_internal_node(child) && node->shift)
-			break;
-
-		if (radix_tree_is_internal_node(child))
-			entry_to_node(child)->parent = NULL;
-
-		/*
-		 * We don't need rcu_assign_pointer(), since we are simply
-		 * moving the node from one part of the tree to another: if it
-		 * was safe to dereference the old pointer to it
-		 * (node->slots[0]), it will be safe to dereference the new
-		 * one (root->rnode) as far as dependent read barriers go.
-		 */
-		root->rnode = child;
-
-		/*
-		 * We have a dilemma here. The node's slot[0] must not be
-		 * NULLed in case there are concurrent lookups expecting to
-		 * find the item. However if this was a bottom-level node,
-		 * then it may be subject to the slot pointer being visible
-		 * to callers dereferencing it. If item corresponding to
-		 * slot[0] is subsequently deleted, these callers would expect
-		 * their slot to become empty sooner or later.
-		 *
-		 * For example, lockless pagecache will look up a slot, deref
-		 * the page pointer, and if the page has 0 refcount it means it
-		 * was concurrently deleted from pagecache so try the deref
-		 * again. Fortunately there is already a requirement for logic
-		 * to retry the entire slot lookup -- the indirect pointer
-		 * problem (replacing direct root node with an indirect pointer
-		 * also results in a stale slot). So tag the slot as indirect
-		 * to force callers to retry.
-		 */
-		if (!radix_tree_is_internal_node(child))
-			node->slots[0] = RADIX_TREE_RETRY;
-
-		radix_tree_node_free(node);
-		shrunk = true;
-	}
-
-	return shrunk;
-}
-
 /**
  *	__radix_tree_delete_node    -    try to free node after clearing a slot
  *	@root:		radix tree root
  *	@node:		node containing @index
+ *	@update_node:	callback for changing leaf nodes
+ *	@private:	private data to pass to @update_node
  *
  *	After clearing the slot at @index in @node from radix tree
  *	rooted at @root, call this function to attempt freeing the
  *	node and shrinking the tree.
- *
- *	Returns %true if @node was freed, %false otherwise.
  */
-bool __radix_tree_delete_node(struct radix_tree_root *root,
-			      struct radix_tree_node *node)
-{
-	bool deleted = false;
-
-	do {
-		struct radix_tree_node *parent;
-
-		if (node->count) {
-			if (node == entry_to_node(root->rnode))
-				deleted |= radix_tree_shrink(root);
-			return deleted;
-		}
-
-		parent = node->parent;
-		if (parent) {
-			parent->slots[node->offset] = NULL;
-			parent->count--;
-		} else {
-			root_tag_clear_all(root);
-			root->rnode = NULL;
-		}
-
-		radix_tree_node_free(node);
-		deleted = true;
-
-		node = parent;
-	} while (node);
-
-	return deleted;
-}
-
-static inline void delete_sibling_entries(struct radix_tree_node *node,
-					void *ptr, unsigned offset)
+void __radix_tree_delete_node(struct radix_tree_root *root,
+			      struct radix_tree_node *node,
+			      radix_tree_update_node_t update_node,
+			      void *private)
 {
-#ifdef CONFIG_RADIX_TREE_MULTIORDER
-	int i;
-	for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
-		if (node->slots[offset + i] != ptr)
-			break;
-		node->slots[offset + i] = NULL;
-		node->count--;
-	}
-#endif
+	delete_node(root, node, update_node, private);
 }
 
 /**
@@ -1558,11 +1881,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
 		node_tag_clear(root, node, tag, offset);
 
-	delete_sibling_entries(node, node_to_entry(slot), offset);
-	node->slots[offset] = NULL;
-	node->count--;
-
-	__radix_tree_delete_node(root, node);
+	__radix_tree_replace(root, node, slot, NULL, NULL, NULL);
 
 	return entry;
 }
@@ -1583,15 +1902,10 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 }
 EXPORT_SYMBOL(radix_tree_delete);
 
-struct radix_tree_node *radix_tree_replace_clear_tags(
-			struct radix_tree_root *root,
-			unsigned long index, void *entry)
+void radix_tree_clear_tags(struct radix_tree_root *root,
+			   struct radix_tree_node *node,
+			   void **slot)
 {
-	struct radix_tree_node *node;
-	void **slot;
-
-	__radix_tree_lookup(root, index, &node, &slot);
-
 	if (node) {
 		unsigned int tag, offset = get_slot_offset(node, slot);
 		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
@@ -1600,9 +1914,6 @@ struct radix_tree_node *radix_tree_replace_clear_tags(
 		/* Clear root node tags */
 		root->gfp_mask &= __GFP_BITS_MASK;
 	}
-
-	radix_tree_replace_slot(slot, entry);
-	return node;
 }
 
 /**
@@ -1650,32 +1961,31 @@ static __init void radix_tree_init_maxnodes(void)
 	}
 }
 
-static int radix_tree_callback(struct notifier_block *nfb,
-				unsigned long action, void *hcpu)
+static int radix_tree_cpu_dead(unsigned int cpu)
 {
-	int cpu = (long)hcpu;
 	struct radix_tree_preload *rtp;
 	struct radix_tree_node *node;
 
 	/* Free per-cpu pool of preloaded nodes */
-	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-		rtp = &per_cpu(radix_tree_preloads, cpu);
-		while (rtp->nr) {
-			node = rtp->nodes;
-			rtp->nodes = node->private_data;
-			kmem_cache_free(radix_tree_node_cachep, node);
-			rtp->nr--;
-		}
+	rtp = &per_cpu(radix_tree_preloads, cpu);
+	while (rtp->nr) {
+		node = rtp->nodes;
+		rtp->nodes = node->private_data;
+		kmem_cache_free(radix_tree_node_cachep, node);
+		rtp->nr--;
 	}
-	return NOTIFY_OK;
+	return 0;
 }
 
 void __init radix_tree_init(void)
 {
+	int ret;
 	radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
 			sizeof(struct radix_tree_node), 0,
 			SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
 			radix_tree_node_ctor);
 	radix_tree_init_maxnodes();
-	hotcpu_notifier(radix_tree_callback, 0);
+	ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead",
+					NULL, radix_tree_cpu_dead);
+	WARN_ON(ret < 0);
 }
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 29f503ebfd60..3057011f5599 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -3,7 +3,7 @@ obj-$(CONFIG_RAID6_PQ)	+= raid6_pq.o
 raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
 		   int8.o int16.o int32.o
 
-raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
+raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 592ff49df47d..7857049fd7d3 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -49,6 +49,10 @@ const struct raid6_calls * const raid6_algos[] = {
 	&raid6_avx2x1,
 	&raid6_avx2x2,
 #endif
+#ifdef CONFIG_AS_AVX512
+	&raid6_avx512x1,
+	&raid6_avx512x2,
+#endif
 #endif
 #if defined(__x86_64__) && !defined(__arch_um__)
 	&raid6_sse2x1,
@@ -59,6 +63,11 @@ const struct raid6_calls * const raid6_algos[] = {
 	&raid6_avx2x2,
 	&raid6_avx2x4,
 #endif
+#ifdef CONFIG_AS_AVX512
+	&raid6_avx512x1,
+	&raid6_avx512x2,
+	&raid6_avx512x4,
+#endif
 #endif
 #ifdef CONFIG_ALTIVEC
 	&raid6_altivec1,
@@ -92,6 +101,9 @@ void (*raid6_datap_recov)(int, size_t, int, void **);
 EXPORT_SYMBOL_GPL(raid6_datap_recov);
 
 const struct raid6_recov_calls *const raid6_recov_algos[] = {
+#ifdef CONFIG_AS_AVX512
+	&raid6_recov_avx512,
+#endif
 #ifdef CONFIG_AS_AVX2
 	&raid6_recov_avx2,
 #endif
diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c
index 76734004358d..20bca3d44f67 100644
--- a/lib/raid6/avx2.c
+++ b/lib/raid6/avx2.c
@@ -87,9 +87,57 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
 	kernel_fpu_end();
 }
 
+static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
+				     size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
+
+	for (d = 0 ; d < bytes ; d += 32) {
+		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
+		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
+		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+			asm volatile("vpand %ymm0,%ymm5,%ymm5");
+			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
+			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+		}
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+			asm volatile("vpand %ymm0,%ymm5,%ymm5");
+			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+		}
+		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
+		/* Don't use movntdq for r/w memory area < cache line */
+		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
+		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_avx2x1 = {
 	raid6_avx21_gen_syndrome,
-	NULL,			/* XOR not yet implemented */
+	raid6_avx21_xor_syndrome,
 	raid6_have_avx2,
 	"avx2x1",
 	1			/* Has cache hints */
@@ -149,9 +197,77 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
 	kernel_fpu_end();
 }
 
+static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
+				     size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
+
+	for (d = 0 ; d < bytes ; d += 64) {
+		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
+		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
+		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
+		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
+		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
+		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+			asm volatile("vpand %ymm0,%ymm5,%ymm5");
+			asm volatile("vpand %ymm0,%ymm7,%ymm7");
+			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+			asm volatile("vmovdqa %0,%%ymm7"
+				     :: "m" (dptr[z][d+32]));
+			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
+			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
+			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+		}
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+			asm volatile("vpand %ymm0,%ymm5,%ymm5");
+			asm volatile("vpand %ymm0,%ymm7,%ymm7");
+			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+		}
+		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
+		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
+		/* Don't use movntdq for r/w memory area < cache line */
+		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
+		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
+		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
+		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_avx2x2 = {
 	raid6_avx22_gen_syndrome,
-	NULL,			/* XOR not yet implemented */
+	raid6_avx22_xor_syndrome,
 	raid6_have_avx2,
 	"avx2x2",
 	1			/* Has cache hints */
@@ -242,9 +358,119 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
 	kernel_fpu_end();
 }
 
+static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
+				     size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
+
+	for (d = 0 ; d < bytes ; d += 128) {
+		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
+		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
+		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
+		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
+		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
+		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
+		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
+		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
+		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
+		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
+		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
+		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
+			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
+			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
+			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
+			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
+			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
+			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
+			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
+			asm volatile("vpand %ymm0,%ymm5,%ymm5");
+			asm volatile("vpand %ymm0,%ymm7,%ymm7");
+			asm volatile("vpand %ymm0,%ymm13,%ymm13");
+			asm volatile("vpand %ymm0,%ymm15,%ymm15");
+			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+			asm volatile("vmovdqa %0,%%ymm7"
+				     :: "m" (dptr[z][d+32]));
+			asm volatile("vmovdqa %0,%%ymm13"
+				     :: "m" (dptr[z][d+64]));
+			asm volatile("vmovdqa %0,%%ymm15"
+				     :: "m" (dptr[z][d+96]));
+			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
+			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
+			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
+			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
+			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+		}
+		asm volatile("prefetchnta %0" :: "m" (q[d]));
+		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
+			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
+			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
+			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
+			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
+			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
+			asm volatile("vpand %ymm0,%ymm5,%ymm5");
+			asm volatile("vpand %ymm0,%ymm7,%ymm7");
+			asm volatile("vpand %ymm0,%ymm13,%ymm13");
+			asm volatile("vpand %ymm0,%ymm15,%ymm15");
+			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+		}
+		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
+		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
+		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
+		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
+		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
+		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
+		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
+		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
+		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
+		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
+		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
+		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
+	}
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_avx2x4 = {
 	raid6_avx24_gen_syndrome,
-	NULL,			/* XOR not yet implemented */
+	raid6_avx24_xor_syndrome,
 	raid6_have_avx2,
 	"avx2x4",
 	1			/* Has cache hints */
diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c
new file mode 100644
index 000000000000..f524a7972006
--- /dev/null
+++ b/lib/raid6/avx512.c
@@ -0,0 +1,569 @@
+/* -*- linux-c -*- --------------------------------------------------------
+ *
+ *   Copyright (C) 2016 Intel Corporation
+ *
+ *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
+ *   Author: Megha Dey <megha.dey@linux.intel.com>
+ *
+ *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
+ *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * -----------------------------------------------------------------------
+ */
+
+/*
+ * AVX512 implementation of RAID-6 syndrome functions
+ *
+ */
+
+#ifdef CONFIG_AS_AVX512
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static const struct raid6_avx512_constants {
+	u64 x1d[8];
+} raid6_avx512_constants __aligned(512) = {
+	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
+};
+
+static int raid6_have_avx512(void)
+{
+	return boot_cpu_has(X86_FEATURE_AVX2) &&
+		boot_cpu_has(X86_FEATURE_AVX) &&
+		boot_cpu_has(X86_FEATURE_AVX512F) &&
+		boot_cpu_has(X86_FEATURE_AVX512BW) &&
+		boot_cpu_has(X86_FEATURE_AVX512VL) &&
+		boot_cpu_has(X86_FEATURE_AVX512DQ);
+}
+
+static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;         /* Highest data disk */
+	p = dptr[z0+1];         /* XOR parity */
+	q = dptr[z0+2];         /* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
+		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
+		     :
+		     : "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0; d < bytes; d += 64) {
+		asm volatile("prefetchnta %0\n\t"
+			     "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
+			     "prefetchnta %1\n\t"
+			     "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
+			     "vmovdqa64 %1,%%zmm6"
+			     :
+			     : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
+		for (z = z0-2; z >= 0; z--) {
+			asm volatile("prefetchnta %0\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
+				     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
+				     "vmovdqa64 %0,%%zmm6"
+				     :
+				     : "m" (dptr[z][d]));
+		}
+		asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+			     "vpmovm2b %%k1,%%zmm5\n\t"
+			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+			     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
+			     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
+			     "vmovntdq %%zmm2,%0\n\t"
+			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
+			     "vmovntdq %%zmm4,%1\n\t"
+			     "vpxorq %%zmm4,%%zmm4,%%zmm4"
+			     :
+			     : "m" (p[d]), "m" (q[d]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
+static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
+				       size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0"
+		     : : "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0 ; d < bytes ; d += 64) {
+		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+			     "vmovdqa64 %1,%%zmm2\n\t"
+			     "vpxorq %%zmm4,%%zmm2,%%zmm2"
+			     :
+			     : "m" (dptr[z0][d]),  "m" (p[d]));
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vmovdqa64 %0,%%zmm5\n\t"
+				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
+				     :
+				     : "m" (dptr[z][d]));
+		}
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4"
+				     :
+				     : );
+		}
+		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
+		/* Don't use movntdq for r/w memory area < cache line */
+			     "vmovdqa64 %%zmm4,%0\n\t"
+			     "vmovdqa64 %%zmm2,%1"
+			     :
+			     : "m" (q[d]), "m" (p[d]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx512x1 = {
+	raid6_avx5121_gen_syndrome,
+	raid6_avx5121_xor_syndrome,
+	raid6_have_avx512,
+	"avx512x1",
+	1                       /* Has cache hints */
+};
+
+/*
+ * Unrolled-by-2 AVX512 implementation
+ */
+static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;         /* Highest data disk */
+	p = dptr[z0+1];         /* XOR parity */
+	q = dptr[z0+2];         /* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
+		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
+		     :
+		     : "m" (raid6_avx512_constants.x1d[0]));
+
+	/* We uniformly assume a single prefetch covers at least 64 bytes */
+	for (d = 0; d < bytes; d += 128) {
+		asm volatile("prefetchnta %0\n\t"
+			     "prefetchnta %1\n\t"
+			     "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
+			     "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
+			     "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
+			     "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
+			     :
+			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
+		for (z = z0-1; z >= 0; z--) {
+			asm volatile("prefetchnta %0\n\t"
+				     "prefetchnta %1\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vmovdqa64 %0,%%zmm5\n\t"
+				     "vmovdqa64 %1,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
+				     :
+				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
+		}
+		asm volatile("vmovntdq %%zmm2,%0\n\t"
+			     "vmovntdq %%zmm3,%1\n\t"
+			     "vmovntdq %%zmm4,%2\n\t"
+			     "vmovntdq %%zmm6,%3"
+			     :
+			     : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
+			       "m" (q[d+64]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
+static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
+				       size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0"
+		     : : "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0 ; d < bytes ; d += 128) {
+		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+			     "vmovdqa64 %1,%%zmm6\n\t"
+			     "vmovdqa64 %2,%%zmm2\n\t"
+			     "vmovdqa64 %3,%%zmm3\n\t"
+			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
+			     "vpxorq %%zmm6,%%zmm3,%%zmm3"
+			     :
+			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
+			       "m" (p[d]), "m" (p[d+64]));
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vmovdqa64 %0,%%zmm5\n\t"
+				     "vmovdqa64 %1,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
+				     :
+				     : "m" (dptr[z][d]),  "m" (dptr[z][d+64]));
+		}
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
+				     :
+				     : );
+		}
+		asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t"
+			     "vpxorq %1,%%zmm6,%%zmm6\n\t"
+			     /* Don't use movntdq for r/w
+			      * memory area < cache line
+			      */
+			     "vmovdqa64 %%zmm4,%0\n\t"
+			     "vmovdqa64 %%zmm6,%1\n\t"
+			     "vmovdqa64 %%zmm2,%2\n\t"
+			     "vmovdqa64 %%zmm3,%3"
+			     :
+			     : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
+			       "m" (p[d+64]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx512x2 = {
+	raid6_avx5122_gen_syndrome,
+	raid6_avx5122_xor_syndrome,
+	raid6_have_avx512,
+	"avx512x2",
+	1                       /* Has cache hints */
+};
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Unrolled-by-4 AVX2 implementation
+ */
+static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = disks - 3;         /* Highest data disk */
+	p = dptr[z0+1];         /* XOR parity */
+	q = dptr[z0+2];         /* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
+		     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
+		     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
+		     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
+		     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
+		     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
+		     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
+		     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
+		     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
+		     "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
+		     :
+		     : "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0; d < bytes; d += 256) {
+		for (z = z0; z >= 0; z--) {
+		asm volatile("prefetchnta %0\n\t"
+			     "prefetchnta %1\n\t"
+			     "prefetchnta %2\n\t"
+			     "prefetchnta %3\n\t"
+			     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
+			     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
+			     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
+			     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
+			     "vpmovm2b %%k1,%%zmm5\n\t"
+			     "vpmovm2b %%k2,%%zmm7\n\t"
+			     "vpmovm2b %%k3,%%zmm13\n\t"
+			     "vpmovm2b %%k4,%%zmm15\n\t"
+			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+			     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+			     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+			     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+			     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+			     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+			     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+			     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
+			     "vmovdqa64 %0,%%zmm5\n\t"
+			     "vmovdqa64 %1,%%zmm7\n\t"
+			     "vmovdqa64 %2,%%zmm13\n\t"
+			     "vmovdqa64 %3,%%zmm15\n\t"
+			     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+			     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+			     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
+			     "vpxorq %%zmm15,%%zmm11,%%zmm11\n"
+			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+			     "vpxorq %%zmm15,%%zmm14,%%zmm14"
+			     :
+			     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
+			       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
+		}
+		asm volatile("vmovntdq %%zmm2,%0\n\t"
+			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
+			     "vmovntdq %%zmm3,%1\n\t"
+			     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
+			     "vmovntdq %%zmm10,%2\n\t"
+			     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
+			     "vmovntdq %%zmm11,%3\n\t"
+			     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
+			     "vmovntdq %%zmm4,%4\n\t"
+			     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
+			     "vmovntdq %%zmm6,%5\n\t"
+			     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
+			     "vmovntdq %%zmm12,%6\n\t"
+			     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
+			     "vmovntdq %%zmm14,%7\n\t"
+			     "vpxorq %%zmm14,%%zmm14,%%zmm14"
+			     :
+			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
+			       "m" (q[d+128]), "m" (q[d+192]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
+static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
+				       size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa64 %0,%%zmm0"
+		     :: "m" (raid6_avx512_constants.x1d[0]));
+
+	for (d = 0 ; d < bytes ; d += 256) {
+		asm volatile("vmovdqa64 %0,%%zmm4\n\t"
+			     "vmovdqa64 %1,%%zmm6\n\t"
+			     "vmovdqa64 %2,%%zmm12\n\t"
+			     "vmovdqa64 %3,%%zmm14\n\t"
+			     "vmovdqa64 %4,%%zmm2\n\t"
+			     "vmovdqa64 %5,%%zmm3\n\t"
+			     "vmovdqa64 %6,%%zmm10\n\t"
+			     "vmovdqa64 %7,%%zmm11\n\t"
+			     "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t"
+			     "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
+			     "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
+			     "vpxorq %%zmm14,%%zmm11,%%zmm11"
+			     :
+			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
+			       "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
+			       "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+			       "m" (p[d+192]));
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
+				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
+				     "prefetchnta %0\n\t"
+				     "prefetchnta %2\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
+				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpmovm2b %%k3,%%zmm13\n\t"
+				     "vpmovm2b %%k4,%%zmm15\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+				     "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+				     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
+				     "vmovdqa64 %0,%%zmm5\n\t"
+				     "vmovdqa64 %1,%%zmm7\n\t"
+				     "vmovdqa64 %2,%%zmm13\n\t"
+				     "vmovdqa64 %3,%%zmm15\n\t"
+				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
+				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+				     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
+				     "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
+				     :
+				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
+				       "m" (dptr[z][d+128]),
+				       "m" (dptr[z][d+192]));
+		}
+		asm volatile("prefetchnta %0\n\t"
+			     "prefetchnta %1\n\t"
+			     :
+			     : "m" (q[d]), "m" (q[d+128]));
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+				     "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+				     "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
+				     "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
+				     "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+				     "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+				     "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
+				     "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
+				     "vpmovm2b %%k1,%%zmm5\n\t"
+				     "vpmovm2b %%k2,%%zmm7\n\t"
+				     "vpmovm2b %%k3,%%zmm13\n\t"
+				     "vpmovm2b %%k4,%%zmm15\n\t"
+				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+				     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+				     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+				     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+				     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+				     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+				     "vpxorq %%zmm15,%%zmm14,%%zmm14"
+				     :
+				     : );
+		}
+		asm volatile("vmovntdq %%zmm2,%0\n\t"
+			     "vmovntdq %%zmm3,%1\n\t"
+			     "vmovntdq %%zmm10,%2\n\t"
+			     "vmovntdq %%zmm11,%3\n\t"
+			     "vpxorq %4,%%zmm4,%%zmm4\n\t"
+			     "vpxorq %5,%%zmm6,%%zmm6\n\t"
+			     "vpxorq %6,%%zmm12,%%zmm12\n\t"
+			     "vpxorq %7,%%zmm14,%%zmm14\n\t"
+			     "vmovntdq %%zmm4,%4\n\t"
+			     "vmovntdq %%zmm6,%5\n\t"
+			     "vmovntdq %%zmm12,%6\n\t"
+			     "vmovntdq %%zmm14,%7"
+			     :
+			     : "m" (p[d]),  "m" (p[d+64]), "m" (p[d+128]),
+			       "m" (p[d+192]), "m" (q[d]),  "m" (q[d+64]),
+			       "m" (q[d+128]), "m" (q[d+192]));
+	}
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+const struct raid6_calls raid6_avx512x4 = {
+	raid6_avx5124_gen_syndrome,
+	raid6_avx5124_xor_syndrome,
+	raid6_have_avx512,
+	"avx512x4",
+	1                       /* Has cache hints */
+};
+#endif
+
+#endif /* CONFIG_AS_AVX512 */
diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c
new file mode 100644
index 000000000000..625aafa33b61
--- /dev/null
+++ b/lib/raid6/recov_avx512.c
@@ -0,0 +1,388 @@
+/*
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Gayatri Kammela <gayatri.kammela@intel.com>
+ * Author: Megha Dey <megha.dey@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ */
+
+#ifdef CONFIG_AS_AVX512
+
+#include <linux/raid/pq.h>
+#include "x86.h"
+
+static int raid6_has_avx512(void)
+{
+	return boot_cpu_has(X86_FEATURE_AVX2) &&
+		boot_cpu_has(X86_FEATURE_AVX) &&
+		boot_cpu_has(X86_FEATURE_AVX512F) &&
+		boot_cpu_has(X86_FEATURE_AVX512BW) &&
+		boot_cpu_has(X86_FEATURE_AVX512VL) &&
+		boot_cpu_has(X86_FEATURE_AVX512DQ);
+}
+
+static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila,
+				     int failb, void **ptrs)
+{
+	u8 *p, *q, *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+	const u8 x0f = 0x0f;
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/*
+	 * Compute syndrome with zero for the missing data pages
+	 * Use the dead data pages as temporary storage for
+	 * delta p and delta q
+	 */
+
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dp;
+	ptrs[failb]   = dq;
+	ptrs[disks-2] = p;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+		raid6_gfexp[failb]]];
+
+	kernel_fpu_begin();
+
+	/* zmm0 = x0f[16] */
+	asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));
+
+	while (bytes) {
+#ifdef CONFIG_X86_64
+		asm volatile("vmovdqa64 %0, %%zmm1\n\t"
+			     "vmovdqa64 %1, %%zmm9\n\t"
+			     "vmovdqa64 %2, %%zmm0\n\t"
+			     "vmovdqa64 %3, %%zmm8\n\t"
+			     "vpxorq %4, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %5, %%zmm9, %%zmm9\n\t"
+			     "vpxorq %6, %%zmm0, %%zmm0\n\t"
+			     "vpxorq %7, %%zmm8, %%zmm8"
+			     :
+			     : "m" (q[0]), "m" (q[64]), "m" (p[0]),
+			       "m" (p[64]), "m" (dq[0]), "m" (dq[64]),
+			       "m" (dp[0]), "m" (dp[64]));
+
+		/*
+		 * 1 = dq[0]  ^ q[0]
+		 * 9 = dq[64] ^ q[64]
+		 * 0 = dp[0]  ^ p[0]
+		 * 8 = dp[64] ^ p[64]
+		 */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+			     "vbroadcasti64x2 %1, %%zmm5"
+			     :
+			     : "m" (qmul[0]), "m" (qmul[16]));
+
+		asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
+			     "vpsraw $4, %%zmm9, %%zmm12\n\t"
+			     "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
+			     "vpandq %%zmm7, %%zmm9, %%zmm9\n\t"
+			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
+			     "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t"
+			     "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
+			     "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t"
+			     "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
+			     "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t"
+			     "vpxorq %%zmm4, %%zmm5, %%zmm5"
+			     :
+			     : );
+
+		/*
+		 * 5 = qx[0]
+		 * 15 = qx[64]
+		 */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+			     "vbroadcasti64x2 %1, %%zmm1\n\t"
+			     "vpsraw $4, %%zmm0, %%zmm2\n\t"
+			     "vpsraw $4, %%zmm8, %%zmm6\n\t"
+			     "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm8, %%zmm14\n\t"
+			     "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
+			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+			     "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t"
+			     "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
+			     "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t"
+			     "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm12, %%zmm13, %%zmm13"
+			     :
+			     : "m" (pbmul[0]), "m" (pbmul[16]));
+
+		/*
+		 * 1  = pbmul[px[0]]
+		 * 13 = pbmul[px[64]]
+		 */
+		asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm15, %%zmm13, %%zmm13"
+			     :
+			     : );
+
+		/*
+		 * 1 = db = DQ
+		 * 13 = db[64] = DQ[64]
+		 */
+		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+			     "vmovdqa64 %%zmm13,%1\n\t"
+			     "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
+			     "vpxorq %%zmm13, %%zmm8, %%zmm8"
+			     :
+			     : "m" (dq[0]), "m" (dq[64]));
+
+		asm volatile("vmovdqa64 %%zmm0, %0\n\t"
+			     "vmovdqa64 %%zmm8, %1"
+			     :
+			     : "m" (dp[0]), "m" (dp[64]));
+
+		bytes -= 128;
+		p += 128;
+		q += 128;
+		dp += 128;
+		dq += 128;
+#else
+		asm volatile("vmovdqa64 %0, %%zmm1\n\t"
+			     "vmovdqa64 %1, %%zmm0\n\t"
+			     "vpxorq %2, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %3, %%zmm0, %%zmm0"
+			     :
+			     : "m" (*q), "m" (*p), "m"(*dq), "m" (*dp));
+
+		/* 1 = dq ^ q;  0 = dp ^ p */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+			     "vbroadcasti64x2 %1, %%zmm5"
+			     :
+			     : "m" (qmul[0]), "m" (qmul[16]));
+
+		/*
+		 * 1 = dq ^ q
+		 * 3 = dq ^ p >> 4
+		 */
+		asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
+			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+			     "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
+			     "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
+			     "vpxorq %%zmm4, %%zmm5, %%zmm5"
+			     :
+			     : );
+
+		/* 5 = qx */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+			     "vbroadcasti64x2 %1, %%zmm1"
+			     :
+			     : "m" (pbmul[0]), "m" (pbmul[16]));
+
+		asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t"
+			     "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
+			     "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
+			     "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm4, %%zmm1, %%zmm1"
+			     :
+			     : );
+
+		/* 1 = pbmul[px] */
+		asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
+			     /* 1 = db = DQ */
+			     "vmovdqa64 %%zmm1, %0\n\t"
+			     :
+			     : "m" (dq[0]));
+
+		asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
+			     "vmovdqa64 %%zmm0, %0"
+			     :
+			     : "m" (dp[0]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dp += 64;
+		dq += 64;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila,
+				     void **ptrs)
+{
+	u8 *p, *q, *dq;
+	const u8 *qmul;		/* Q multiplier table */
+	const u8 x0f = 0x0f;
+
+	p = (u8 *)ptrs[disks-2];
+	q = (u8 *)ptrs[disks-1];
+
+	/*
+	 * Compute syndrome with zero for the missing data page
+	 * Use the dead data page as temporary storage for delta q
+	 */
+
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = (void *)raid6_empty_zero_page;
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Restore pointer table */
+	ptrs[faila]   = dq;
+	ptrs[disks-1] = q;
+
+	/* Now, pick the proper data tables */
+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	kernel_fpu_begin();
+
+	asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));
+
+	while (bytes) {
+#ifdef CONFIG_X86_64
+		asm volatile("vmovdqa64 %0, %%zmm3\n\t"
+			     "vmovdqa64 %1, %%zmm8\n\t"
+			     "vpxorq %2, %%zmm3, %%zmm3\n\t"
+			     "vpxorq %3, %%zmm8, %%zmm8"
+			     :
+			     : "m" (dq[0]), "m" (dq[64]), "m" (q[0]),
+			       "m" (q[64]));
+
+		/*
+		 * 3 = q[0] ^ dq[0]
+		 * 8 = q[64] ^ dq[64]
+		 */
+		asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
+			     "vmovapd %%zmm0, %%zmm13\n\t"
+			     "vbroadcasti64x2 %1, %%zmm1\n\t"
+			     "vmovapd %%zmm1, %%zmm14"
+			     :
+			     : "m" (qmul[0]), "m" (qmul[16]));
+
+		asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
+			     "vpsraw $4, %%zmm8, %%zmm12\n\t"
+			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm8, %%zmm8\n\t"
+			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+			     "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
+			     "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
+			     "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t"
+			     "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
+			     "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t"
+			     "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm13, %%zmm14, %%zmm14"
+			     :
+			     : );
+
+		/*
+		 * 1  = qmul[q[0]  ^ dq[0]]
+		 * 14 = qmul[q[64] ^ dq[64]]
+		 */
+		asm volatile("vmovdqa64 %0, %%zmm2\n\t"
+			     "vmovdqa64 %1, %%zmm12\n\t"
+			     "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t"
+			     "vpxorq %%zmm14, %%zmm12, %%zmm12"
+			     :
+			     : "m" (p[0]), "m" (p[64]));
+
+		/*
+		 * 2  = p[0]  ^ qmul[q[0]  ^ dq[0]]
+		 * 12 = p[64] ^ qmul[q[64] ^ dq[64]]
+		 */
+
+		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+			     "vmovdqa64 %%zmm14, %1\n\t"
+			     "vmovdqa64 %%zmm2, %2\n\t"
+			     "vmovdqa64 %%zmm12,%3"
+			     :
+			     : "m" (dq[0]), "m" (dq[64]), "m" (p[0]),
+			       "m" (p[64]));
+
+		bytes -= 128;
+		p += 128;
+		q += 128;
+		dq += 128;
+#else
+		asm volatile("vmovdqa64 %0, %%zmm3\n\t"
+			     "vpxorq %1, %%zmm3, %%zmm3"
+			     :
+			     : "m" (dq[0]), "m" (q[0]));
+
+		/* 3 = q ^ dq */
+
+		asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
+			     "vbroadcasti64x2 %1, %%zmm1"
+			     :
+			     : "m" (qmul[0]), "m" (qmul[16]));
+
+		asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
+			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+			     "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
+			     "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
+			     "vpxorq %%zmm0, %%zmm1, %%zmm1"
+			     :
+			     : );
+
+		/* 1 = qmul[q ^ dq] */
+
+		asm volatile("vmovdqa64 %0, %%zmm2\n\t"
+			     "vpxorq %%zmm1, %%zmm2, %%zmm2"
+			     :
+			     : "m" (p[0]));
+
+		/* 2 = p ^ qmul[q ^ dq] */
+
+		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+			     "vmovdqa64 %%zmm2, %1"
+			     :
+			     : "m" (dq[0]), "m" (p[0]));
+
+		bytes -= 64;
+		p += 64;
+		q += 64;
+		dq += 64;
+#endif
+	}
+
+	kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_avx512 = {
+	.data2 = raid6_2data_recov_avx512,
+	.datap = raid6_datap_recov_avx512,
+	.valid = raid6_has_avx512,
+#ifdef CONFIG_X86_64
+	.name = "avx512x2",
+#else
+	.name = "avx512x1",
+#endif
+	.priority = 3,
+};
+
+#else
+#warning "your version of binutils lacks AVX512 support"
+#endif
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 29090f3db677..2c7b60edea04 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -32,10 +32,13 @@ ifeq ($(ARCH),arm64)
 endif
 
 ifeq ($(IS_X86),yes)
-        OBJS   += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o
+        OBJS   += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o
         CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" |	\
                     gcc -c -x assembler - >&/dev/null &&	\
                     rm ./-.o && echo -DCONFIG_AS_AVX2=1)
+	CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" |          \
+		    gcc -c -x assembler - >&/dev/null &&        \
+		    rm ./-.o && echo -DCONFIG_AS_AVX512=1)
 else ifeq ($(HAS_NEON),yes)
         OBJS   += neon.o neon1.o neon2.o neon4.o neon8.o
         CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c
index 3bebbabdb510..b07f4d8e6b03 100644
--- a/lib/raid6/test/test.c
+++ b/lib/raid6/test/test.c
@@ -21,12 +21,13 @@
 
 #define NDISKS		16	/* Including P and Q */
 
-const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
+const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
 struct raid6_calls raid6_call;
 
 char *dataptrs[NDISKS];
-char data[NDISKS][PAGE_SIZE];
-char recovi[PAGE_SIZE], recovj[PAGE_SIZE];
+char data[NDISKS][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
+char recovi[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
+char recovj[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
 
 static void makedata(int start, int stop)
 {
diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h
index 8fe9d9662abb..834d268a4b05 100644
--- a/lib/raid6/x86.h
+++ b/lib/raid6/x86.h
@@ -46,6 +46,16 @@ static inline void kernel_fpu_end(void)
 #define X86_FEATURE_SSSE3	(4*32+ 9) /* Supplemental SSE-3 */
 #define X86_FEATURE_AVX	(4*32+28) /* Advanced Vector Extensions */
 #define X86_FEATURE_AVX2        (9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_AVX512F     (9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ    (9*32+17) /* AVX-512 DQ (Double/Quad granular)
+					   * Instructions
+					   */
+#define X86_FEATURE_AVX512BW    (9*32+30) /* AVX-512 BW (Byte/Word granular)
+					   * Instructions
+					   */
+#define X86_FEATURE_AVX512VL    (9*32+31) /* AVX-512 VL (128/256 Vector Length)
+					   * Extensions
+					   */
 #define X86_FEATURE_MMXEXT	(1*32+22) /* AMD MMX extensions */
 
 /* Should work well enough on modern CPUs for testing */
diff --git a/lib/random32.c b/lib/random32.c
index 69ed593aab07..fa594b1140e6 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -47,7 +47,7 @@ static inline void prandom_state_selftest(void)
 }
 #endif
 
-static DEFINE_PER_CPU(struct rnd_state, net_rand_state);
+static DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy;
 
 /**
  *	prandom_u32_state - seeded pseudo-random number generator.
@@ -81,7 +81,7 @@ u32 prandom_u32(void)
 	u32 res;
 
 	res = prandom_u32_state(state);
-	put_cpu_var(state);
+	put_cpu_var(net_rand_state);
 
 	return res;
 }
@@ -128,7 +128,7 @@ void prandom_bytes(void *buf, size_t bytes)
 	struct rnd_state *state = &get_cpu_var(net_rand_state);
 
 	prandom_bytes_state(state, buf, bytes);
-	put_cpu_var(state);
+	put_cpu_var(net_rand_state);
 }
 EXPORT_SYMBOL(prandom_bytes);
 
diff --git a/lib/rbtree.c b/lib/rbtree.c
index eb8a19fee110..1f8b112a7c35 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -296,11 +296,26 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 				 *
 				 *   (p)           (p)
 				 *   / \           / \
-				 *  N   S    -->  N   Sl
+				 *  N   S    -->  N   sl
 				 *     / \             \
-				 *    sl  Sr            s
+				 *    sl  Sr            S
 				 *                       \
 				 *                        Sr
+				 *
+				 * Note: p might be red, and then both
+				 * p and sl are red after rotation(which
+				 * breaks property 4). This is fixed in
+				 * Case 4 (in __rb_rotate_set_parents()
+				 *         which set sl the color of p
+				 *         and set p RB_BLACK)
+				 *
+				 *   (p)            (sl)
+				 *   / \            /  \
+				 *  N   sl   -->   P    S
+				 *       \        /      \
+				 *        S      N        Sr
+				 *         \
+				 *          Sr
 				 */
 				tmp1 = tmp2->rb_right;
 				WRITE_ONCE(sibling->rb_left, tmp1);
@@ -365,7 +380,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 					}
 					break;
 				}
-				/* Case 3 - right rotate at sibling */
+				/* Case 3 - left rotate at sibling */
 				tmp1 = tmp2->rb_left;
 				WRITE_ONCE(sibling->rb_right, tmp1);
 				WRITE_ONCE(tmp2->rb_left, sibling);
@@ -377,7 +392,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 				tmp1 = sibling;
 				sibling = tmp2;
 			}
-			/* Case 4 - left rotate at parent + color flips */
+			/* Case 4 - right rotate at parent + color flips */
 			tmp2 = sibling->rb_right;
 			WRITE_ONCE(parent->rb_left, tmp2);
 			WRITE_ONCE(sibling->rb_right, parent);
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 56054e541a0f..172454e6b979 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -32,6 +32,11 @@
 #define HASH_MIN_SIZE		4U
 #define BUCKET_LOCKS_PER_CPU	32UL
 
+union nested_table {
+	union nested_table __rcu *table;
+	struct rhash_head __rcu *bucket;
+};
+
 static u32 head_hashfn(struct rhashtable *ht,
 		       const struct bucket_table *tbl,
 		       const struct rhash_head *he)
@@ -76,6 +81,9 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
 	/* Never allocate more than 0.5 locks per bucket */
 	size = min_t(unsigned int, size, tbl->size >> 1);
 
+	if (tbl->nest)
+		size = min(size, 1U << tbl->nest);
+
 	if (sizeof(spinlock_t) != 0) {
 		tbl->locks = NULL;
 #ifdef CONFIG_NUMA
@@ -99,8 +107,45 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
 	return 0;
 }
 
+static void nested_table_free(union nested_table *ntbl, unsigned int size)
+{
+	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+	const unsigned int len = 1 << shift;
+	unsigned int i;
+
+	ntbl = rcu_dereference_raw(ntbl->table);
+	if (!ntbl)
+		return;
+
+	if (size > len) {
+		size >>= shift;
+		for (i = 0; i < len; i++)
+			nested_table_free(ntbl + i, size);
+	}
+
+	kfree(ntbl);
+}
+
+static void nested_bucket_table_free(const struct bucket_table *tbl)
+{
+	unsigned int size = tbl->size >> tbl->nest;
+	unsigned int len = 1 << tbl->nest;
+	union nested_table *ntbl;
+	unsigned int i;
+
+	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
+
+	for (i = 0; i < len; i++)
+		nested_table_free(ntbl + i, size);
+
+	kfree(ntbl);
+}
+
 static void bucket_table_free(const struct bucket_table *tbl)
 {
+	if (tbl->nest)
+		nested_bucket_table_free(tbl);
+
 	if (tbl)
 		kvfree(tbl->locks);
 
@@ -112,6 +157,59 @@ static void bucket_table_free_rcu(struct rcu_head *head)
 	bucket_table_free(container_of(head, struct bucket_table, rcu));
 }
 
+static union nested_table *nested_table_alloc(struct rhashtable *ht,
+					      union nested_table __rcu **prev,
+					      unsigned int shifted,
+					      unsigned int nhash)
+{
+	union nested_table *ntbl;
+	int i;
+
+	ntbl = rcu_dereference(*prev);
+	if (ntbl)
+		return ntbl;
+
+	ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
+
+	if (ntbl && shifted) {
+		for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0].bucket); i++)
+			INIT_RHT_NULLS_HEAD(ntbl[i].bucket, ht,
+					    (i << shifted) | nhash);
+	}
+
+	rcu_assign_pointer(*prev, ntbl);
+
+	return ntbl;
+}
+
+static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
+						      size_t nbuckets,
+						      gfp_t gfp)
+{
+	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+	struct bucket_table *tbl;
+	size_t size;
+
+	if (nbuckets < (1 << (shift + 1)))
+		return NULL;
+
+	size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
+
+	tbl = kzalloc(size, gfp);
+	if (!tbl)
+		return NULL;
+
+	if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
+				0, 0)) {
+		kfree(tbl);
+		return NULL;
+	}
+
+	tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;
+
+	return tbl;
+}
+
 static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 					       size_t nbuckets,
 					       gfp_t gfp)
@@ -126,10 +224,17 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 		tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
 	if (tbl == NULL && gfp == GFP_KERNEL)
 		tbl = vzalloc(size);
+
+	size = nbuckets;
+
+	if (tbl == NULL && gfp != GFP_KERNEL) {
+		tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
+		nbuckets = 0;
+	}
 	if (tbl == NULL)
 		return NULL;
 
-	tbl->size = nbuckets;
+	tbl->size = size;
 
 	if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
 		bucket_table_free(tbl);
@@ -164,12 +269,17 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 	struct bucket_table *new_tbl = rhashtable_last_table(ht,
 		rht_dereference_rcu(old_tbl->future_tbl, ht));
-	struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash];
-	int err = -ENOENT;
+	struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
+	int err = -EAGAIN;
 	struct rhash_head *head, *next, *entry;
 	spinlock_t *new_bucket_lock;
 	unsigned int new_hash;
 
+	if (new_tbl->nest)
+		goto out;
+
+	err = -ENOENT;
+
 	rht_for_each(entry, old_tbl, old_hash) {
 		err = 0;
 		next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
@@ -202,19 +312,26 @@ out:
 	return err;
 }
 
-static void rhashtable_rehash_chain(struct rhashtable *ht,
+static int rhashtable_rehash_chain(struct rhashtable *ht,
 				    unsigned int old_hash)
 {
 	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 	spinlock_t *old_bucket_lock;
+	int err;
 
 	old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);
 
 	spin_lock_bh(old_bucket_lock);
-	while (!rhashtable_rehash_one(ht, old_hash))
+	while (!(err = rhashtable_rehash_one(ht, old_hash)))
 		;
-	old_tbl->rehash++;
+
+	if (err == -ENOENT) {
+		old_tbl->rehash++;
+		err = 0;
+	}
 	spin_unlock_bh(old_bucket_lock);
+
+	return err;
 }
 
 static int rhashtable_rehash_attach(struct rhashtable *ht,
@@ -246,13 +363,17 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
 	struct bucket_table *new_tbl;
 	struct rhashtable_walker *walker;
 	unsigned int old_hash;
+	int err;
 
 	new_tbl = rht_dereference(old_tbl->future_tbl, ht);
 	if (!new_tbl)
 		return 0;
 
-	for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
-		rhashtable_rehash_chain(ht, old_hash);
+	for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
+		err = rhashtable_rehash_chain(ht, old_hash);
+		if (err)
+			return err;
+	}
 
 	/* Publish the new table pointer. */
 	rcu_assign_pointer(ht->tbl, new_tbl);
@@ -271,31 +392,16 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
 	return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
 }
 
-/**
- * rhashtable_expand - Expand hash table while allowing concurrent lookups
- * @ht:		the hash table to expand
- *
- * A secondary bucket array is allocated and the hash entries are migrated.
- *
- * This function may only be called in a context where it is safe to call
- * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
- *
- * The caller must ensure that no concurrent resizing occurs by holding
- * ht->mutex.
- *
- * It is valid to have concurrent insertions and deletions protected by per
- * bucket locks or concurrent RCU protected lookups and traversals.
- */
-static int rhashtable_expand(struct rhashtable *ht)
+static int rhashtable_rehash_alloc(struct rhashtable *ht,
+				   struct bucket_table *old_tbl,
+				   unsigned int size)
 {
-	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+	struct bucket_table *new_tbl;
 	int err;
 
 	ASSERT_RHT_MUTEX(ht);
 
-	old_tbl = rhashtable_last_table(ht, old_tbl);
-
-	new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);
+	new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
 	if (new_tbl == NULL)
 		return -ENOMEM;
 
@@ -324,12 +430,9 @@ static int rhashtable_expand(struct rhashtable *ht)
  */
 static int rhashtable_shrink(struct rhashtable *ht)
 {
-	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
+	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 	unsigned int nelems = atomic_read(&ht->nelems);
 	unsigned int size = 0;
-	int err;
-
-	ASSERT_RHT_MUTEX(ht);
 
 	if (nelems)
 		size = roundup_pow_of_two(nelems * 3 / 2);
@@ -342,15 +445,7 @@ static int rhashtable_shrink(struct rhashtable *ht)
 	if (rht_dereference(old_tbl->future_tbl, ht))
 		return -EEXIST;
 
-	new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
-	if (new_tbl == NULL)
-		return -ENOMEM;
-
-	err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
-	if (err)
-		bucket_table_free(new_tbl);
-
-	return err;
+	return rhashtable_rehash_alloc(ht, old_tbl, size);
 }
 
 static void rht_deferred_worker(struct work_struct *work)
@@ -366,11 +461,14 @@ static void rht_deferred_worker(struct work_struct *work)
 	tbl = rhashtable_last_table(ht, tbl);
 
 	if (rht_grow_above_75(ht, tbl))
-		rhashtable_expand(ht);
+		err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
 	else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
-		rhashtable_shrink(ht);
+		err = rhashtable_shrink(ht);
+	else if (tbl->nest)
+		err = rhashtable_rehash_alloc(ht, tbl, tbl->size);
 
-	err = rhashtable_rehash_table(ht);
+	if (!err)
+		err = rhashtable_rehash_table(ht);
 
 	mutex_unlock(&ht->mutex);
 
@@ -378,22 +476,8 @@ static void rht_deferred_worker(struct work_struct *work)
 		schedule_work(&ht->run_work);
 }
 
-static bool rhashtable_check_elasticity(struct rhashtable *ht,
-					struct bucket_table *tbl,
-					unsigned int hash)
-{
-	unsigned int elasticity = ht->elasticity;
-	struct rhash_head *head;
-
-	rht_for_each(head, tbl, hash)
-		if (!--elasticity)
-			return true;
-
-	return false;
-}
-
-int rhashtable_insert_rehash(struct rhashtable *ht,
-			     struct bucket_table *tbl)
+static int rhashtable_insert_rehash(struct rhashtable *ht,
+				    struct bucket_table *tbl)
 {
 	struct bucket_table *old_tbl;
 	struct bucket_table *new_tbl;
@@ -439,61 +523,177 @@ fail:
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(rhashtable_insert_rehash);
 
-struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
-					    const void *key,
-					    struct rhash_head *obj,
-					    struct bucket_table *tbl)
+static void *rhashtable_lookup_one(struct rhashtable *ht,
+				   struct bucket_table *tbl, unsigned int hash,
+				   const void *key, struct rhash_head *obj)
 {
+	struct rhashtable_compare_arg arg = {
+		.ht = ht,
+		.key = key,
+	};
+	struct rhash_head __rcu **pprev;
 	struct rhash_head *head;
-	unsigned int hash;
-	int err;
+	int elasticity;
 
-	tbl = rhashtable_last_table(ht, tbl);
-	hash = head_hashfn(ht, tbl, obj);
-	spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);
+	elasticity = ht->elasticity;
+	pprev = rht_bucket_var(tbl, hash);
+	rht_for_each_continue(head, *pprev, tbl, hash) {
+		struct rhlist_head *list;
+		struct rhlist_head *plist;
 
-	err = -EEXIST;
-	if (key && rhashtable_lookup_fast(ht, key, ht->p))
-		goto exit;
+		elasticity--;
+		if (!key ||
+		    (ht->p.obj_cmpfn ?
+		     ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
+		     rhashtable_compare(&arg, rht_obj(ht, head))))
+			continue;
+
+		if (!ht->rhlist)
+			return rht_obj(ht, head);
+
+		list = container_of(obj, struct rhlist_head, rhead);
+		plist = container_of(head, struct rhlist_head, rhead);
+
+		RCU_INIT_POINTER(list->next, plist);
+		head = rht_dereference_bucket(head->next, tbl, hash);
+		RCU_INIT_POINTER(list->rhead.next, head);
+		rcu_assign_pointer(*pprev, obj);
+
+		return NULL;
+	}
+
+	if (elasticity <= 0)
+		return ERR_PTR(-EAGAIN);
+
+	return ERR_PTR(-ENOENT);
+}
+
+static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
+						  struct bucket_table *tbl,
+						  unsigned int hash,
+						  struct rhash_head *obj,
+						  void *data)
+{
+	struct rhash_head __rcu **pprev;
+	struct bucket_table *new_tbl;
+	struct rhash_head *head;
+
+	if (!IS_ERR_OR_NULL(data))
+		return ERR_PTR(-EEXIST);
+
+	if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
+		return ERR_CAST(data);
+
+	new_tbl = rcu_dereference(tbl->future_tbl);
+	if (new_tbl)
+		return new_tbl;
+
+	if (PTR_ERR(data) != -ENOENT)
+		return ERR_CAST(data);
 
-	err = -E2BIG;
 	if (unlikely(rht_grow_above_max(ht, tbl)))
-		goto exit;
+		return ERR_PTR(-E2BIG);
 
-	err = -EAGAIN;
-	if (rhashtable_check_elasticity(ht, tbl, hash) ||
-	    rht_grow_above_100(ht, tbl))
-		goto exit;
+	if (unlikely(rht_grow_above_100(ht, tbl)))
+		return ERR_PTR(-EAGAIN);
 
-	err = 0;
+	pprev = rht_bucket_insert(ht, tbl, hash);
+	if (!pprev)
+		return ERR_PTR(-ENOMEM);
 
-	head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
+	head = rht_dereference_bucket(*pprev, tbl, hash);
 
 	RCU_INIT_POINTER(obj->next, head);
+	if (ht->rhlist) {
+		struct rhlist_head *list;
+
+		list = container_of(obj, struct rhlist_head, rhead);
+		RCU_INIT_POINTER(list->next, NULL);
+	}
 
-	rcu_assign_pointer(tbl->buckets[hash], obj);
+	rcu_assign_pointer(*pprev, obj);
 
 	atomic_inc(&ht->nelems);
+	if (rht_grow_above_75(ht, tbl))
+		schedule_work(&ht->run_work);
 
-exit:
-	spin_unlock(rht_bucket_lock(tbl, hash));
+	return NULL;
+}
 
-	if (err == 0)
-		return NULL;
-	else if (err == -EAGAIN)
-		return tbl;
-	else
-		return ERR_PTR(err);
+static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
+				   struct rhash_head *obj)
+{
+	struct bucket_table *new_tbl;
+	struct bucket_table *tbl;
+	unsigned int hash;
+	spinlock_t *lock;
+	void *data;
+
+	tbl = rcu_dereference(ht->tbl);
+
+	/* All insertions must grab the oldest table containing
+	 * the hashed bucket that is yet to be rehashed.
+	 */
+	for (;;) {
+		hash = rht_head_hashfn(ht, tbl, obj, ht->p);
+		lock = rht_bucket_lock(tbl, hash);
+		spin_lock_bh(lock);
+
+		if (tbl->rehash <= hash)
+			break;
+
+		spin_unlock_bh(lock);
+		tbl = rcu_dereference(tbl->future_tbl);
+	}
+
+	data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
+	new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
+	if (PTR_ERR(new_tbl) != -EEXIST)
+		data = ERR_CAST(new_tbl);
+
+	while (!IS_ERR_OR_NULL(new_tbl)) {
+		tbl = new_tbl;
+		hash = rht_head_hashfn(ht, tbl, obj, ht->p);
+		spin_lock_nested(rht_bucket_lock(tbl, hash),
+				 SINGLE_DEPTH_NESTING);
+
+		data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
+		new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
+		if (PTR_ERR(new_tbl) != -EEXIST)
+			data = ERR_CAST(new_tbl);
+
+		spin_unlock(rht_bucket_lock(tbl, hash));
+	}
+
+	spin_unlock_bh(lock);
+
+	if (PTR_ERR(data) == -EAGAIN)
+		data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
+			       -EAGAIN);
+
+	return data;
+}
+
+void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
+			     struct rhash_head *obj)
+{
+	void *data;
+
+	do {
+		rcu_read_lock();
+		data = rhashtable_try_insert(ht, key, obj);
+		rcu_read_unlock();
+	} while (PTR_ERR(data) == -EAGAIN);
+
+	return data;
 }
 EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
 
 /**
- * rhashtable_walk_init - Initialise an iterator
+ * rhashtable_walk_enter - Initialise an iterator
  * @ht:		Table to walk over
  * @iter:	Hash table Iterator
- * @gfp:	GFP flags for allocations
  *
  * This function prepares a hash table walk.
  *
@@ -508,30 +708,22 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  * This function may sleep so you must not call it from interrupt
  * context or with spin locks held.
  *
- * You must call rhashtable_walk_exit if this function returns
- * successfully.
+ * You must call rhashtable_walk_exit after this function returns.
  */
-int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter,
-			 gfp_t gfp)
+void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
 {
 	iter->ht = ht;
 	iter->p = NULL;
 	iter->slot = 0;
 	iter->skip = 0;
 
-	iter->walker = kmalloc(sizeof(*iter->walker), gfp);
-	if (!iter->walker)
-		return -ENOMEM;
-
 	spin_lock(&ht->lock);
-	iter->walker->tbl =
+	iter->walker.tbl =
 		rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
-	list_add(&iter->walker->list, &iter->walker->tbl->walkers);
+	list_add(&iter->walker.list, &iter->walker.tbl->walkers);
 	spin_unlock(&ht->lock);
-
-	return 0;
 }
-EXPORT_SYMBOL_GPL(rhashtable_walk_init);
+EXPORT_SYMBOL_GPL(rhashtable_walk_enter);
 
 /**
  * rhashtable_walk_exit - Free an iterator
@@ -542,10 +734,9 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init);
 void rhashtable_walk_exit(struct rhashtable_iter *iter)
 {
 	spin_lock(&iter->ht->lock);
-	if (iter->walker->tbl)
-		list_del(&iter->walker->list);
+	if (iter->walker.tbl)
+		list_del(&iter->walker.list);
 	spin_unlock(&iter->ht->lock);
-	kfree(iter->walker);
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
 
@@ -571,12 +762,12 @@ int rhashtable_walk_start(struct rhashtable_iter *iter)
 	rcu_read_lock();
 
 	spin_lock(&ht->lock);
-	if (iter->walker->tbl)
-		list_del(&iter->walker->list);
+	if (iter->walker.tbl)
+		list_del(&iter->walker.list);
 	spin_unlock(&ht->lock);
 
-	if (!iter->walker->tbl) {
-		iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht);
+	if (!iter->walker.tbl) {
+		iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
 		return -EAGAIN;
 	}
 
@@ -598,12 +789,17 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_start);
  */
 void *rhashtable_walk_next(struct rhashtable_iter *iter)
 {
-	struct bucket_table *tbl = iter->walker->tbl;
+	struct bucket_table *tbl = iter->walker.tbl;
+	struct rhlist_head *list = iter->list;
 	struct rhashtable *ht = iter->ht;
 	struct rhash_head *p = iter->p;
+	bool rhlist = ht->rhlist;
 
 	if (p) {
-		p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot);
+		if (!rhlist || !(list = rcu_dereference(list->next))) {
+			p = rcu_dereference(p->next);
+			list = container_of(p, struct rhlist_head, rhead);
+		}
 		goto next;
 	}
 
@@ -611,6 +807,18 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
 		int skip = iter->skip;
 
 		rht_for_each_rcu(p, tbl, iter->slot) {
+			if (rhlist) {
+				list = container_of(p, struct rhlist_head,
+						    rhead);
+				do {
+					if (!skip)
+						goto next;
+					skip--;
+					list = rcu_dereference(list->next);
+				} while (list);
+
+				continue;
+			}
 			if (!skip)
 				break;
 			skip--;
@@ -620,7 +828,8 @@ next:
 		if (!rht_is_a_nulls(p)) {
 			iter->skip++;
 			iter->p = p;
-			return rht_obj(ht, p);
+			iter->list = list;
+			return rht_obj(ht, rhlist ? &list->rhead : p);
 		}
 
 		iter->skip = 0;
@@ -631,8 +840,8 @@ next:
 	/* Ensure we see any new tables. */
 	smp_rmb();
 
-	iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht);
-	if (iter->walker->tbl) {
+	iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+	if (iter->walker.tbl) {
 		iter->slot = 0;
 		iter->skip = 0;
 		return ERR_PTR(-EAGAIN);
@@ -652,7 +861,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
 	__releases(RCU)
 {
 	struct rhashtable *ht;
-	struct bucket_table *tbl = iter->walker->tbl;
+	struct bucket_table *tbl = iter->walker.tbl;
 
 	if (!tbl)
 		goto out;
@@ -661,9 +870,9 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
 
 	spin_lock(&ht->lock);
 	if (tbl->rehash < tbl->size)
-		list_add(&iter->walker->list, &tbl->walkers);
+		list_add(&iter->walker.list, &tbl->walkers);
 	else
-		iter->walker->tbl = NULL;
+		iter->walker.tbl = NULL;
 	spin_unlock(&ht->lock);
 
 	iter->p = NULL;
@@ -809,6 +1018,48 @@ int rhashtable_init(struct rhashtable *ht,
 EXPORT_SYMBOL_GPL(rhashtable_init);
 
 /**
+ * rhltable_init - initialize a new hash list table
+ * @hlt:	hash list table to be initialized
+ * @params:	configuration parameters
+ *
+ * Initializes a new hash list table.
+ *
+ * See documentation for rhashtable_init.
+ */
+int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
+{
+	int err;
+
+	/* No rhlist NULLs marking for now. */
+	if (params->nulls_base)
+		return -EINVAL;
+
+	err = rhashtable_init(&hlt->ht, params);
+	hlt->ht.rhlist = true;
+	return err;
+}
+EXPORT_SYMBOL_GPL(rhltable_init);
+
+static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
+				void (*free_fn)(void *ptr, void *arg),
+				void *arg)
+{
+	struct rhlist_head *list;
+
+	if (!ht->rhlist) {
+		free_fn(rht_obj(ht, obj), arg);
+		return;
+	}
+
+	list = container_of(obj, struct rhlist_head, rhead);
+	do {
+		obj = &list->rhead;
+		list = rht_dereference(list->next, ht);
+		free_fn(rht_obj(ht, obj), arg);
+	} while (list);
+}
+
+/**
  * rhashtable_free_and_destroy - free elements and destroy hash table
  * @ht:		the hash table to destroy
  * @free_fn:	callback to release resources of element
@@ -827,7 +1078,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 				 void (*free_fn)(void *ptr, void *arg),
 				 void *arg)
 {
-	const struct bucket_table *tbl;
+	struct bucket_table *tbl;
 	unsigned int i;
 
 	cancel_work_sync(&ht->run_work);
@@ -838,14 +1089,14 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
 		for (i = 0; i < tbl->size; i++) {
 			struct rhash_head *pos, *next;
 
-			for (pos = rht_dereference(tbl->buckets[i], ht),
+			for (pos = rht_dereference(*rht_bucket(tbl, i), ht),
 			     next = !rht_is_a_nulls(pos) ?
 					rht_dereference(pos->next, ht) : NULL;
 			     !rht_is_a_nulls(pos);
 			     pos = next,
 			     next = !rht_is_a_nulls(pos) ?
 					rht_dereference(pos->next, ht) : NULL)
-				free_fn(rht_obj(ht, pos), arg);
+				rhashtable_free_one(ht, pos, free_fn, arg);
 		}
 	}
 
@@ -859,3 +1110,70 @@ void rhashtable_destroy(struct rhashtable *ht)
 	return rhashtable_free_and_destroy(ht, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(rhashtable_destroy);
+
+struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
+					    unsigned int hash)
+{
+	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+	static struct rhash_head __rcu *rhnull =
+		(struct rhash_head __rcu *)NULLS_MARKER(0);
+	unsigned int index = hash & ((1 << tbl->nest) - 1);
+	unsigned int size = tbl->size >> tbl->nest;
+	unsigned int subhash = hash;
+	union nested_table *ntbl;
+
+	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
+	ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash);
+	subhash >>= tbl->nest;
+
+	while (ntbl && size > (1 << shift)) {
+		index = subhash & ((1 << shift) - 1);
+		ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash);
+		size >>= shift;
+		subhash >>= shift;
+	}
+
+	if (!ntbl)
+		return &rhnull;
+
+	return &ntbl[subhash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested);
+
+struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
+						   struct bucket_table *tbl,
+						   unsigned int hash)
+{
+	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
+	unsigned int index = hash & ((1 << tbl->nest) - 1);
+	unsigned int size = tbl->size >> tbl->nest;
+	union nested_table *ntbl;
+	unsigned int shifted;
+	unsigned int nhash;
+
+	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
+	hash >>= tbl->nest;
+	nhash = index;
+	shifted = tbl->nest;
+	ntbl = nested_table_alloc(ht, &ntbl[index].table,
+				  size <= (1 << shift) ? shifted : 0, nhash);
+
+	while (ntbl && size > (1 << shift)) {
+		index = hash & ((1 << shift) - 1);
+		size >>= shift;
+		hash >>= shift;
+		nhash |= index << shifted;
+		shifted += shift;
+		ntbl = nested_table_alloc(ht, &ntbl[index].table,
+					  size <= (1 << shift) ? shifted : 0,
+					  nhash);
+	}
+
+	if (!ntbl)
+		return NULL;
+
+	return &ntbl[hash].bucket;
+
+}
+EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
new file mode 100644
index 000000000000..55e11c4b2f3b
--- /dev/null
+++ b/lib/sbitmap.c
@@ -0,0 +1,470 @@
+/*
+ * Copyright (C) 2016 Facebook
+ * Copyright (C) 2013-2014 Jens Axboe
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <linux/random.h>
+#include <linux/sbitmap.h>
+#include <linux/seq_file.h>
+
+int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
+		      gfp_t flags, int node)
+{
+	unsigned int bits_per_word;
+	unsigned int i;
+
+	if (shift < 0) {
+		shift = ilog2(BITS_PER_LONG);
+		/*
+		 * If the bitmap is small, shrink the number of bits per word so
+		 * we spread over a few cachelines, at least. If less than 4
+		 * bits, just forget about it, it's not going to work optimally
+		 * anyway.
+		 */
+		if (depth >= 4) {
+			while ((4U << shift) > depth)
+				shift--;
+		}
+	}
+	bits_per_word = 1U << shift;
+	if (bits_per_word > BITS_PER_LONG)
+		return -EINVAL;
+
+	sb->shift = shift;
+	sb->depth = depth;
+	sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
+
+	if (depth == 0) {
+		sb->map = NULL;
+		return 0;
+	}
+
+	sb->map = kzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node);
+	if (!sb->map)
+		return -ENOMEM;
+
+	for (i = 0; i < sb->map_nr; i++) {
+		sb->map[i].depth = min(depth, bits_per_word);
+		depth -= sb->map[i].depth;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sbitmap_init_node);
+
+void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
+{
+	unsigned int bits_per_word = 1U << sb->shift;
+	unsigned int i;
+
+	sb->depth = depth;
+	sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
+
+	for (i = 0; i < sb->map_nr; i++) {
+		sb->map[i].depth = min(depth, bits_per_word);
+		depth -= sb->map[i].depth;
+	}
+}
+EXPORT_SYMBOL_GPL(sbitmap_resize);
+
+static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
+			      bool wrap)
+{
+	unsigned int orig_hint = hint;
+	int nr;
+
+	while (1) {
+		nr = find_next_zero_bit(&word->word, word->depth, hint);
+		if (unlikely(nr >= word->depth)) {
+			/*
+			 * We started with an offset, and we didn't reset the
+			 * offset to 0 in a failure case, so start from 0 to
+			 * exhaust the map.
+			 */
+			if (orig_hint && hint && wrap) {
+				hint = orig_hint = 0;
+				continue;
+			}
+			return -1;
+		}
+
+		if (!test_and_set_bit(nr, &word->word))
+			break;
+
+		hint = nr + 1;
+		if (hint >= word->depth - 1)
+			hint = 0;
+	}
+
+	return nr;
+}
+
+int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
+{
+	unsigned int i, index;
+	int nr = -1;
+
+	index = SB_NR_TO_INDEX(sb, alloc_hint);
+
+	for (i = 0; i < sb->map_nr; i++) {
+		nr = __sbitmap_get_word(&sb->map[index],
+					SB_NR_TO_BIT(sb, alloc_hint),
+					!round_robin);
+		if (nr != -1) {
+			nr += index << sb->shift;
+			break;
+		}
+
+		/* Jump to next index. */
+		index++;
+		alloc_hint = index << sb->shift;
+
+		if (index >= sb->map_nr) {
+			index = 0;
+			alloc_hint = 0;
+		}
+	}
+
+	return nr;
+}
+EXPORT_SYMBOL_GPL(sbitmap_get);
+
+bool sbitmap_any_bit_set(const struct sbitmap *sb)
+{
+	unsigned int i;
+
+	for (i = 0; i < sb->map_nr; i++) {
+		if (sb->map[i].word)
+			return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(sbitmap_any_bit_set);
+
+bool sbitmap_any_bit_clear(const struct sbitmap *sb)
+{
+	unsigned int i;
+
+	for (i = 0; i < sb->map_nr; i++) {
+		const struct sbitmap_word *word = &sb->map[i];
+		unsigned long ret;
+
+		ret = find_first_zero_bit(&word->word, word->depth);
+		if (ret < word->depth)
+			return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear);
+
+unsigned int sbitmap_weight(const struct sbitmap *sb)
+{
+	unsigned int i, weight = 0;
+
+	for (i = 0; i < sb->map_nr; i++) {
+		const struct sbitmap_word *word = &sb->map[i];
+
+		weight += bitmap_weight(&word->word, word->depth);
+	}
+	return weight;
+}
+EXPORT_SYMBOL_GPL(sbitmap_weight);
+
+void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
+{
+	seq_printf(m, "depth=%u\n", sb->depth);
+	seq_printf(m, "busy=%u\n", sbitmap_weight(sb));
+	seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
+	seq_printf(m, "map_nr=%u\n", sb->map_nr);
+}
+EXPORT_SYMBOL_GPL(sbitmap_show);
+
+static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte)
+{
+	if ((offset & 0xf) == 0) {
+		if (offset != 0)
+			seq_putc(m, '\n');
+		seq_printf(m, "%08x:", offset);
+	}
+	if ((offset & 0x1) == 0)
+		seq_putc(m, ' ');
+	seq_printf(m, "%02x", byte);
+}
+
+void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m)
+{
+	u8 byte = 0;
+	unsigned int byte_bits = 0;
+	unsigned int offset = 0;
+	int i;
+
+	for (i = 0; i < sb->map_nr; i++) {
+		unsigned long word = READ_ONCE(sb->map[i].word);
+		unsigned int word_bits = READ_ONCE(sb->map[i].depth);
+
+		while (word_bits > 0) {
+			unsigned int bits = min(8 - byte_bits, word_bits);
+
+			byte |= (word & (BIT(bits) - 1)) << byte_bits;
+			byte_bits += bits;
+			if (byte_bits == 8) {
+				emit_byte(m, offset, byte);
+				byte = 0;
+				byte_bits = 0;
+				offset++;
+			}
+			word >>= bits;
+			word_bits -= bits;
+		}
+	}
+	if (byte_bits) {
+		emit_byte(m, offset, byte);
+		offset++;
+	}
+	if (offset)
+		seq_putc(m, '\n');
+}
+EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);
+
+static unsigned int sbq_calc_wake_batch(unsigned int depth)
+{
+	unsigned int wake_batch;
+
+	/*
+	 * For each batch, we wake up one queue. We need to make sure that our
+	 * batch size is small enough that the full depth of the bitmap is
+	 * enough to wake up all of the queues.
+	 */
+	wake_batch = SBQ_WAKE_BATCH;
+	if (wake_batch > depth / SBQ_WAIT_QUEUES)
+		wake_batch = max(1U, depth / SBQ_WAIT_QUEUES);
+
+	return wake_batch;
+}
+
+int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
+			    int shift, bool round_robin, gfp_t flags, int node)
+{
+	int ret;
+	int i;
+
+	ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node);
+	if (ret)
+		return ret;
+
+	sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags);
+	if (!sbq->alloc_hint) {
+		sbitmap_free(&sbq->sb);
+		return -ENOMEM;
+	}
+
+	if (depth && !round_robin) {
+		for_each_possible_cpu(i)
+			*per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth;
+	}
+
+	sbq->wake_batch = sbq_calc_wake_batch(depth);
+	atomic_set(&sbq->wake_index, 0);
+
+	sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
+	if (!sbq->ws) {
+		free_percpu(sbq->alloc_hint);
+		sbitmap_free(&sbq->sb);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+		init_waitqueue_head(&sbq->ws[i].wait);
+		atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch);
+	}
+
+	sbq->round_robin = round_robin;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
+
+void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
+{
+	unsigned int wake_batch = sbq_calc_wake_batch(depth);
+	int i;
+
+	if (sbq->wake_batch != wake_batch) {
+		WRITE_ONCE(sbq->wake_batch, wake_batch);
+		/*
+		 * Pairs with the memory barrier in sbq_wake_up() to ensure that
+		 * the batch size is updated before the wait counts.
+		 */
+		smp_mb__before_atomic();
+		for (i = 0; i < SBQ_WAIT_QUEUES; i++)
+			atomic_set(&sbq->ws[i].wait_cnt, 1);
+	}
+	sbitmap_resize(&sbq->sb, depth);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_resize);
+
+int __sbitmap_queue_get(struct sbitmap_queue *sbq)
+{
+	unsigned int hint, depth;
+	int nr;
+
+	hint = this_cpu_read(*sbq->alloc_hint);
+	depth = READ_ONCE(sbq->sb.depth);
+	if (unlikely(hint >= depth)) {
+		hint = depth ? prandom_u32() % depth : 0;
+		this_cpu_write(*sbq->alloc_hint, hint);
+	}
+	nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin);
+
+	if (nr == -1) {
+		/* If the map is full, a hint won't do us much good. */
+		this_cpu_write(*sbq->alloc_hint, 0);
+	} else if (nr == hint || unlikely(sbq->round_robin)) {
+		/* Only update the hint if we used it. */
+		hint = nr + 1;
+		if (hint >= depth - 1)
+			hint = 0;
+		this_cpu_write(*sbq->alloc_hint, hint);
+	}
+
+	return nr;
+}
+EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
+
+static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
+{
+	int i, wake_index;
+
+	wake_index = atomic_read(&sbq->wake_index);
+	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+		struct sbq_wait_state *ws = &sbq->ws[wake_index];
+
+		if (waitqueue_active(&ws->wait)) {
+			int o = atomic_read(&sbq->wake_index);
+
+			if (wake_index != o)
+				atomic_cmpxchg(&sbq->wake_index, o, wake_index);
+			return ws;
+		}
+
+		wake_index = sbq_index_inc(wake_index);
+	}
+
+	return NULL;
+}
+
+static void sbq_wake_up(struct sbitmap_queue *sbq)
+{
+	struct sbq_wait_state *ws;
+	unsigned int wake_batch;
+	int wait_cnt;
+
+	/*
+	 * Pairs with the memory barrier in set_current_state() to ensure the
+	 * proper ordering of clear_bit()/waitqueue_active() in the waker and
+	 * test_and_set_bit()/prepare_to_wait()/finish_wait() in the waiter. See
+	 * the comment on waitqueue_active(). This is __after_atomic because we
+	 * just did clear_bit() in the caller.
+	 */
+	smp_mb__after_atomic();
+
+	ws = sbq_wake_ptr(sbq);
+	if (!ws)
+		return;
+
+	wait_cnt = atomic_dec_return(&ws->wait_cnt);
+	if (wait_cnt <= 0) {
+		wake_batch = READ_ONCE(sbq->wake_batch);
+		/*
+		 * Pairs with the memory barrier in sbitmap_queue_resize() to
+		 * ensure that we see the batch size update before the wait
+		 * count is reset.
+		 */
+		smp_mb__before_atomic();
+		/*
+		 * If there are concurrent callers to sbq_wake_up(), the last
+		 * one to decrement the wait count below zero will bump it back
+		 * up. If there is a concurrent resize, the count reset will
+		 * either cause the cmpxchg to fail or overwrite after the
+		 * cmpxchg.
+		 */
+		atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch);
+		sbq_index_atomic_inc(&sbq->wake_index);
+		wake_up(&ws->wait);
+	}
+}
+
+void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
+			 unsigned int cpu)
+{
+	sbitmap_clear_bit(&sbq->sb, nr);
+	sbq_wake_up(sbq);
+	if (likely(!sbq->round_robin && nr < sbq->sb.depth))
+		*per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_clear);
+
+void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
+{
+	int i, wake_index;
+
+	/*
+	 * Pairs with the memory barrier in set_current_state() like in
+	 * sbq_wake_up().
+	 */
+	smp_mb();
+	wake_index = atomic_read(&sbq->wake_index);
+	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+		struct sbq_wait_state *ws = &sbq->ws[wake_index];
+
+		if (waitqueue_active(&ws->wait))
+			wake_up(&ws->wait);
+
+		wake_index = sbq_index_inc(wake_index);
+	}
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all);
+
+void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
+{
+	bool first;
+	int i;
+
+	sbitmap_show(&sbq->sb, m);
+
+	seq_puts(m, "alloc_hint={");
+	first = true;
+	for_each_possible_cpu(i) {
+		if (!first)
+			seq_puts(m, ", ");
+		first = false;
+		seq_printf(m, "%u", *per_cpu_ptr(sbq->alloc_hint, i));
+	}
+	seq_puts(m, "}\n");
+
+	seq_printf(m, "wake_batch=%u\n", sbq->wake_batch);
+	seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index));
+
+	seq_puts(m, "ws={\n");
+	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+		struct sbq_wait_state *ws = &sbq->ws[i];
+
+		seq_printf(m, "\t{.wait_cnt=%d, .wait=%s},\n",
+			   atomic_read(&ws->wait_cnt),
+			   waitqueue_active(&ws->wait) ? "active" : "inactive");
+	}
+	seq_puts(m, "}\n");
+
+	seq_printf(m, "round_robin=%d\n", sbq->round_robin);
+}
+EXPORT_SYMBOL_GPL(sbitmap_queue_show);
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 1feed6a2b12a..0beaa1d899aa 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -9,13 +9,13 @@
 #include <linux/quicklist.h>
 #include <linux/cma.h>
 
-void show_mem(unsigned int filter)
+void show_mem(unsigned int filter, nodemask_t *nodemask)
 {
 	pg_data_t *pgdat;
 	unsigned long total = 0, reserved = 0, highmem = 0;
 
 	printk("Mem-Info:\n");
-	show_free_areas(filter);
+	show_free_areas(filter, nodemask);
 
 	for_each_online_pgdat(pgdat) {
 		unsigned long flags;
diff --git a/lib/siphash.c b/lib/siphash.c
new file mode 100644
index 000000000000..3ae58b4edad6
--- /dev/null
+++ b/lib/siphash.c
@@ -0,0 +1,551 @@
+/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.
+ *
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+ * This implementation is specifically for SipHash2-4 for a secure PRF
+ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
+ * hashtables.
+ */
+
+#include <linux/siphash.h>
+#include <asm/unaligned.h>
+
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+#include <linux/dcache.h>
+#include <asm/word-at-a-time.h>
+#endif
+
+#define SIPROUND \
+	do { \
+	v0 += v1; v1 = rol64(v1, 13); v1 ^= v0; v0 = rol64(v0, 32); \
+	v2 += v3; v3 = rol64(v3, 16); v3 ^= v2; \
+	v0 += v3; v3 = rol64(v3, 21); v3 ^= v0; \
+	v2 += v1; v1 = rol64(v1, 17); v1 ^= v2; v2 = rol64(v2, 32); \
+	} while (0)
+
+#define PREAMBLE(len) \
+	u64 v0 = 0x736f6d6570736575ULL; \
+	u64 v1 = 0x646f72616e646f6dULL; \
+	u64 v2 = 0x6c7967656e657261ULL; \
+	u64 v3 = 0x7465646279746573ULL; \
+	u64 b = ((u64)(len)) << 56; \
+	v3 ^= key->key[1]; \
+	v2 ^= key->key[0]; \
+	v1 ^= key->key[1]; \
+	v0 ^= key->key[0];
+
+#define POSTAMBLE \
+	v3 ^= b; \
+	SIPROUND; \
+	SIPROUND; \
+	v0 ^= b; \
+	v2 ^= 0xff; \
+	SIPROUND; \
+	SIPROUND; \
+	SIPROUND; \
+	SIPROUND; \
+	return (v0 ^ v1) ^ (v2 ^ v3);
+
+u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key)
+{
+	const u8 *end = data + len - (len % sizeof(u64));
+	const u8 left = len & (sizeof(u64) - 1);
+	u64 m;
+	PREAMBLE(len)
+	for (; data != end; data += sizeof(u64)) {
+		m = le64_to_cpup(data);
+		v3 ^= m;
+		SIPROUND;
+		SIPROUND;
+		v0 ^= m;
+	}
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+	if (left)
+		b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+						  bytemask_from_count(left)));
+#else
+	switch (left) {
+	case 7: b |= ((u64)end[6]) << 48;
+	case 6: b |= ((u64)end[5]) << 40;
+	case 5: b |= ((u64)end[4]) << 32;
+	case 4: b |= le32_to_cpup(data); break;
+	case 3: b |= ((u64)end[2]) << 16;
+	case 2: b |= le16_to_cpup(data); break;
+	case 1: b |= end[0];
+	}
+#endif
+	POSTAMBLE
+}
+EXPORT_SYMBOL(__siphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key)
+{
+	const u8 *end = data + len - (len % sizeof(u64));
+	const u8 left = len & (sizeof(u64) - 1);
+	u64 m;
+	PREAMBLE(len)
+	for (; data != end; data += sizeof(u64)) {
+		m = get_unaligned_le64(data);
+		v3 ^= m;
+		SIPROUND;
+		SIPROUND;
+		v0 ^= m;
+	}
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+	if (left)
+		b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+						  bytemask_from_count(left)));
+#else
+	switch (left) {
+	case 7: b |= ((u64)end[6]) << 48;
+	case 6: b |= ((u64)end[5]) << 40;
+	case 5: b |= ((u64)end[4]) << 32;
+	case 4: b |= get_unaligned_le32(end); break;
+	case 3: b |= ((u64)end[2]) << 16;
+	case 2: b |= get_unaligned_le16(end); break;
+	case 1: b |= end[0];
+	}
+#endif
+	POSTAMBLE
+}
+EXPORT_SYMBOL(__siphash_unaligned);
+#endif
+
+/**
+ * siphash_1u64 - compute 64-bit siphash PRF value of a u64
+ * @first: first u64
+ * @key: the siphash key
+ */
+u64 siphash_1u64(const u64 first, const siphash_key_t *key)
+{
+	PREAMBLE(8)
+	v3 ^= first;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= first;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_1u64);
+
+/**
+ * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64
+ * @first: first u64
+ * @second: second u64
+ * @key: the siphash key
+ */
+u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key)
+{
+	PREAMBLE(16)
+	v3 ^= first;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= second;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_2u64);
+
+/**
+ * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64
+ * @first: first u64
+ * @second: second u64
+ * @third: third u64
+ * @key: the siphash key
+ */
+u64 siphash_3u64(const u64 first, const u64 second, const u64 third,
+		 const siphash_key_t *key)
+{
+	PREAMBLE(24)
+	v3 ^= first;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= second;
+	v3 ^= third;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= third;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_3u64);
+
+/**
+ * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64
+ * @first: first u64
+ * @second: second u64
+ * @third: third u64
+ * @forth: forth u64
+ * @key: the siphash key
+ */
+u64 siphash_4u64(const u64 first, const u64 second, const u64 third,
+		 const u64 forth, const siphash_key_t *key)
+{
+	PREAMBLE(32)
+	v3 ^= first;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= second;
+	v3 ^= third;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= third;
+	v3 ^= forth;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= forth;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_4u64);
+
+u64 siphash_1u32(const u32 first, const siphash_key_t *key)
+{
+	PREAMBLE(4)
+	b |= first;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_1u32);
+
+u64 siphash_3u32(const u32 first, const u32 second, const u32 third,
+		 const siphash_key_t *key)
+{
+	u64 combined = (u64)second << 32 | first;
+	PREAMBLE(12)
+	v3 ^= combined;
+	SIPROUND;
+	SIPROUND;
+	v0 ^= combined;
+	b |= third;
+	POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_3u32);
+
+#if BITS_PER_LONG == 64
+/* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for
+ * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3.
+ */
+
+#define HSIPROUND SIPROUND
+#define HPREAMBLE(len) PREAMBLE(len)
+#define HPOSTAMBLE \
+	v3 ^= b; \
+	HSIPROUND; \
+	v0 ^= b; \
+	v2 ^= 0xff; \
+	HSIPROUND; \
+	HSIPROUND; \
+	HSIPROUND; \
+	return (v0 ^ v1) ^ (v2 ^ v3);
+
+u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
+{
+	const u8 *end = data + len - (len % sizeof(u64));
+	const u8 left = len & (sizeof(u64) - 1);
+	u64 m;
+	HPREAMBLE(len)
+	for (; data != end; data += sizeof(u64)) {
+		m = le64_to_cpup(data);
+		v3 ^= m;
+		HSIPROUND;
+		v0 ^= m;
+	}
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+	if (left)
+		b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+						  bytemask_from_count(left)));
+#else
+	switch (left) {
+	case 7: b |= ((u64)end[6]) << 48;
+	case 6: b |= ((u64)end[5]) << 40;
+	case 5: b |= ((u64)end[4]) << 32;
+	case 4: b |= le32_to_cpup(data); break;
+	case 3: b |= ((u64)end[2]) << 16;
+	case 2: b |= le16_to_cpup(data); break;
+	case 1: b |= end[0];
+	}
+#endif
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len,
+			 const hsiphash_key_t *key)
+{
+	const u8 *end = data + len - (len % sizeof(u64));
+	const u8 left = len & (sizeof(u64) - 1);
+	u64 m;
+	HPREAMBLE(len)
+	for (; data != end; data += sizeof(u64)) {
+		m = get_unaligned_le64(data);
+		v3 ^= m;
+		HSIPROUND;
+		v0 ^= m;
+	}
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+	if (left)
+		b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+						  bytemask_from_count(left)));
+#else
+	switch (left) {
+	case 7: b |= ((u64)end[6]) << 48;
+	case 6: b |= ((u64)end[5]) << 40;
+	case 5: b |= ((u64)end[4]) << 32;
+	case 4: b |= get_unaligned_le32(end); break;
+	case 3: b |= ((u64)end[2]) << 16;
+	case 2: b |= get_unaligned_le16(end); break;
+	case 1: b |= end[0];
+	}
+#endif
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_unaligned);
+#endif
+
+/**
+ * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32
+ * @first: first u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
+{
+	HPREAMBLE(4)
+	b |= first;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_1u32);
+
+/**
+ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
+ * @first: first u32
+ * @second: second u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
+{
+	u64 combined = (u64)second << 32 | first;
+	HPREAMBLE(8)
+	v3 ^= combined;
+	HSIPROUND;
+	v0 ^= combined;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_2u32);
+
+/**
+ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
+		  const hsiphash_key_t *key)
+{
+	u64 combined = (u64)second << 32 | first;
+	HPREAMBLE(12)
+	v3 ^= combined;
+	HSIPROUND;
+	v0 ^= combined;
+	b |= third;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_3u32);
+
+/**
+ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @forth: forth u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
+		  const u32 forth, const hsiphash_key_t *key)
+{
+	u64 combined = (u64)second << 32 | first;
+	HPREAMBLE(16)
+	v3 ^= combined;
+	HSIPROUND;
+	v0 ^= combined;
+	combined = (u64)forth << 32 | third;
+	v3 ^= combined;
+	HSIPROUND;
+	v0 ^= combined;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_4u32);
+#else
+#define HSIPROUND \
+	do { \
+	v0 += v1; v1 = rol32(v1, 5); v1 ^= v0; v0 = rol32(v0, 16); \
+	v2 += v3; v3 = rol32(v3, 8); v3 ^= v2; \
+	v0 += v3; v3 = rol32(v3, 7); v3 ^= v0; \
+	v2 += v1; v1 = rol32(v1, 13); v1 ^= v2; v2 = rol32(v2, 16); \
+	} while (0)
+
+#define HPREAMBLE(len) \
+	u32 v0 = 0; \
+	u32 v1 = 0; \
+	u32 v2 = 0x6c796765U; \
+	u32 v3 = 0x74656462U; \
+	u32 b = ((u32)(len)) << 24; \
+	v3 ^= key->key[1]; \
+	v2 ^= key->key[0]; \
+	v1 ^= key->key[1]; \
+	v0 ^= key->key[0];
+
+#define HPOSTAMBLE \
+	v3 ^= b; \
+	HSIPROUND; \
+	v0 ^= b; \
+	v2 ^= 0xff; \
+	HSIPROUND; \
+	HSIPROUND; \
+	HSIPROUND; \
+	return v1 ^ v3;
+
+u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
+{
+	const u8 *end = data + len - (len % sizeof(u32));
+	const u8 left = len & (sizeof(u32) - 1);
+	u32 m;
+	HPREAMBLE(len)
+	for (; data != end; data += sizeof(u32)) {
+		m = le32_to_cpup(data);
+		v3 ^= m;
+		HSIPROUND;
+		v0 ^= m;
+	}
+	switch (left) {
+	case 3: b |= ((u32)end[2]) << 16;
+	case 2: b |= le16_to_cpup(data); break;
+	case 1: b |= end[0];
+	}
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len,
+			 const hsiphash_key_t *key)
+{
+	const u8 *end = data + len - (len % sizeof(u32));
+	const u8 left = len & (sizeof(u32) - 1);
+	u32 m;
+	HPREAMBLE(len)
+	for (; data != end; data += sizeof(u32)) {
+		m = get_unaligned_le32(data);
+		v3 ^= m;
+		HSIPROUND;
+		v0 ^= m;
+	}
+	switch (left) {
+	case 3: b |= ((u32)end[2]) << 16;
+	case 2: b |= get_unaligned_le16(end); break;
+	case 1: b |= end[0];
+	}
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_unaligned);
+#endif
+
+/**
+ * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
+ * @first: first u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
+{
+	HPREAMBLE(4)
+	v3 ^= first;
+	HSIPROUND;
+	v0 ^= first;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_1u32);
+
+/**
+ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
+ * @first: first u32
+ * @second: second u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
+{
+	HPREAMBLE(8)
+	v3 ^= first;
+	HSIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	HSIPROUND;
+	v0 ^= second;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_2u32);
+
+/**
+ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
+		  const hsiphash_key_t *key)
+{
+	HPREAMBLE(12)
+	v3 ^= first;
+	HSIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	HSIPROUND;
+	v0 ^= second;
+	v3 ^= third;
+	HSIPROUND;
+	v0 ^= third;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_3u32);
+
+/**
+ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @forth: forth u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
+		  const u32 forth, const hsiphash_key_t *key)
+{
+	HPREAMBLE(16)
+	v3 ^= first;
+	HSIPROUND;
+	v0 ^= first;
+	v3 ^= second;
+	HSIPROUND;
+	v0 ^= second;
+	v3 ^= third;
+	HSIPROUND;
+	v0 ^= third;
+	v3 ^= forth;
+	HSIPROUND;
+	v0 ^= forth;
+	HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_4u32);
+#endif
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 60f77f1d470a..f87d138e9672 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -50,7 +50,7 @@
 					STACK_ALLOC_ALIGN)
 #define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \
 		STACK_ALLOC_NULL_PROTECTION_BITS - STACK_ALLOC_OFFSET_BITS)
-#define STACK_ALLOC_SLABS_CAP 1024
+#define STACK_ALLOC_SLABS_CAP 8192
 #define STACK_ALLOC_MAX_SLABS \
 	(((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \
 	 (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP)
@@ -192,6 +192,7 @@ void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace)
 	trace->entries = stack->entries;
 	trace->skip = 0;
 }
+EXPORT_SYMBOL_GPL(depot_fetch_stack);
 
 /**
  * depot_save_stack - save stack in a stack depot.
@@ -283,3 +284,4 @@ exit:
 fast_exit:
 	return retval;
 }
+EXPORT_SYMBOL_GPL(depot_save_stack);
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index 9c5fe8110413..7e35fc450c5b 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -1,6 +1,7 @@
 #include <linux/compiler.h>
 #include <linux/export.h>
 #include <linux/kasan-checks.h>
+#include <linux/thread_info.h>
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
@@ -111,6 +112,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
 		long retval;
 
 		kasan_check_write(dst, count);
+		check_object_size(dst, count, false);
 		user_access_begin();
 		retval = do_strncpy_from_user(dst, src, count, max);
 		user_access_end();
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 22e13a0e19d7..a8d74a733a38 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -53,7 +53,7 @@
  */
 #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
 
-int swiotlb_force;
+enum swiotlb_force swiotlb_force;
 
 /*
  * Used to do a quick range check in swiotlb_tbl_unmap_single and
@@ -83,6 +83,12 @@ static unsigned int *io_tlb_list;
 static unsigned int io_tlb_index;
 
 /*
+ * Max segment that we can provide which (if pages are contingous) will
+ * not be bounced (unless SWIOTLB_FORCE is set).
+ */
+unsigned int max_segment;
+
+/*
  * We need to save away the original address corresponding to a mapped entry
  * for the sync operations.
  */
@@ -106,8 +112,12 @@ setup_io_tlb_npages(char *str)
 	}
 	if (*str == ',')
 		++str;
-	if (!strcmp(str, "force"))
-		swiotlb_force = 1;
+	if (!strcmp(str, "force")) {
+		swiotlb_force = SWIOTLB_FORCE;
+	} else if (!strcmp(str, "noforce")) {
+		swiotlb_force = SWIOTLB_NO_FORCE;
+		io_tlb_nslabs = 1;
+	}
 
 	return 0;
 }
@@ -120,6 +130,20 @@ unsigned long swiotlb_nr_tbl(void)
 }
 EXPORT_SYMBOL_GPL(swiotlb_nr_tbl);
 
+unsigned int swiotlb_max_segment(void)
+{
+	return max_segment;
+}
+EXPORT_SYMBOL_GPL(swiotlb_max_segment);
+
+void swiotlb_set_max_segment(unsigned int val)
+{
+	if (swiotlb_force == SWIOTLB_FORCE)
+		max_segment = 1;
+	else
+		max_segment = rounddown(val, PAGE_SIZE);
+}
+
 /* default to 64MB */
 #define IO_TLB_DEFAULT_SIZE (64UL<<20)
 unsigned long swiotlb_size_or_default(void)
@@ -201,6 +225,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
 	if (verbose)
 		swiotlb_print_info();
 
+	swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT);
 	return 0;
 }
 
@@ -279,6 +304,7 @@ swiotlb_late_init_with_default_size(size_t default_size)
 	rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs);
 	if (rc)
 		free_pages((unsigned long)vstart, order);
+
 	return rc;
 }
 
@@ -333,6 +359,8 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 
 	late_alloc = 1;
 
+	swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT);
+
 	return 0;
 
 cleanup4:
@@ -347,6 +375,7 @@ cleanup2:
 	io_tlb_end = 0;
 	io_tlb_start = 0;
 	io_tlb_nslabs = 0;
+	max_segment = 0;
 	return -ENOMEM;
 }
 
@@ -375,6 +404,7 @@ void __init swiotlb_free(void)
 				   PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
 	}
 	io_tlb_nslabs = 0;
+	max_segment = 0;
 }
 
 int is_swiotlb_buffer(phys_addr_t paddr)
@@ -425,7 +455,8 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
 phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
 				   dma_addr_t tbl_dma_addr,
 				   phys_addr_t orig_addr, size_t size,
-				   enum dma_data_direction dir)
+				   enum dma_data_direction dir,
+				   unsigned long attrs)
 {
 	unsigned long flags;
 	phys_addr_t tlb_addr;
@@ -452,11 +483,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
 		    : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
 
 	/*
-	 * For mappings greater than a page, we limit the stride (and
-	 * hence alignment) to a page size.
+	 * For mappings greater than or equal to a page, we limit the stride
+	 * (and hence alignment) to a page size.
 	 */
 	nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
-	if (size > PAGE_SIZE)
+	if (size >= PAGE_SIZE)
 		stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
 	else
 		stride = 1;
@@ -526,7 +557,8 @@ found:
 	 */
 	for (i = 0; i < nslots; i++)
 		io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
-	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+	    (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
 		swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE);
 
 	return tlb_addr;
@@ -539,18 +571,27 @@ EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single);
 
 static phys_addr_t
 map_single(struct device *hwdev, phys_addr_t phys, size_t size,
-	   enum dma_data_direction dir)
+	   enum dma_data_direction dir, unsigned long attrs)
 {
-	dma_addr_t start_dma_addr = phys_to_dma(hwdev, io_tlb_start);
+	dma_addr_t start_dma_addr;
 
-	return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, dir);
+	if (swiotlb_force == SWIOTLB_NO_FORCE) {
+		dev_warn_ratelimited(hwdev, "Cannot do DMA to address %pa\n",
+				     &phys);
+		return SWIOTLB_MAP_ERROR;
+	}
+
+	start_dma_addr = phys_to_dma(hwdev, io_tlb_start);
+	return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size,
+				      dir, attrs);
 }
 
 /*
  * dma_addr is the kernel virtual address of the bounce buffer to unmap.
  */
 void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
-			      size_t size, enum dma_data_direction dir)
+			      size_t size, enum dma_data_direction dir,
+			      unsigned long attrs)
 {
 	unsigned long flags;
 	int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
@@ -561,6 +602,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
 	 * First, sync the memory before unmapping the entry
 	 */
 	if (orig_addr != INVALID_PHYS_ADDR &&
+	    !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
 	    ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
 		swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE);
 
@@ -654,7 +696,8 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 		 * GFP_DMA memory; fall back on map_single(), which
 		 * will grab memory from the lowest available address range.
 		 */
-		phys_addr_t paddr = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
+		phys_addr_t paddr = map_single(hwdev, 0, size,
+					       DMA_FROM_DEVICE, 0);
 		if (paddr == SWIOTLB_MAP_ERROR)
 			goto err_warn;
 
@@ -667,9 +710,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 			       (unsigned long long)dma_mask,
 			       (unsigned long long)dev_addr);
 
-			/* DMA_TO_DEVICE to avoid memcpy in unmap_single */
+			/*
+			 * DMA_TO_DEVICE to avoid memcpy in unmap_single.
+			 * The DMA_ATTR_SKIP_CPU_SYNC is optional.
+			 */
 			swiotlb_tbl_unmap_single(hwdev, paddr,
-						 size, DMA_TO_DEVICE);
+						 size, DMA_TO_DEVICE,
+						 DMA_ATTR_SKIP_CPU_SYNC);
 			goto err_warn;
 		}
 	}
@@ -698,8 +745,12 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
 	if (!is_swiotlb_buffer(paddr))
 		free_pages((unsigned long)vaddr, get_order(size));
 	else
-		/* DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single */
-		swiotlb_tbl_unmap_single(hwdev, paddr, size, DMA_TO_DEVICE);
+		/*
+		 * DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single.
+		 * DMA_ATTR_SKIP_CPU_SYNC is optional.
+		 */
+		swiotlb_tbl_unmap_single(hwdev, paddr, size, DMA_TO_DEVICE,
+					 DMA_ATTR_SKIP_CPU_SYNC);
 }
 EXPORT_SYMBOL(swiotlb_free_coherent);
 
@@ -707,6 +758,9 @@ static void
 swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir,
 	     int do_panic)
 {
+	if (swiotlb_force == SWIOTLB_NO_FORCE)
+		return;
+
 	/*
 	 * Ran out of IOMMU space for this operation. This is very bad.
 	 * Unfortunately the drivers cannot handle this operation properly.
@@ -714,8 +768,8 @@ swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir,
 	 * When the mapping is small enough return a static buffer to limit
 	 * the damage, or panic when the transfer is too big.
 	 */
-	printk(KERN_ERR "DMA: Out of SW-IOMMU space for %zu bytes at "
-	       "device %s\n", size, dev ? dev_name(dev) : "?");
+	dev_err_ratelimited(dev, "DMA: Out of SW-IOMMU space for %zu bytes\n",
+			    size);
 
 	if (size <= io_tlb_overflow || !do_panic)
 		return;
@@ -749,13 +803,13 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 	 * we can safely return the device addr and not worry about bounce
 	 * buffering it.
 	 */
-	if (dma_capable(dev, dev_addr, size) && !swiotlb_force)
+	if (dma_capable(dev, dev_addr, size) && swiotlb_force != SWIOTLB_FORCE)
 		return dev_addr;
 
 	trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);
 
 	/* Oh well, have to allocate and map a bounce buffer. */
-	map = map_single(dev, phys, size, dir);
+	map = map_single(dev, phys, size, dir, attrs);
 	if (map == SWIOTLB_MAP_ERROR) {
 		swiotlb_full(dev, size, dir, 1);
 		return phys_to_dma(dev, io_tlb_overflow_buffer);
@@ -764,12 +818,13 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 	dev_addr = phys_to_dma(dev, map);
 
 	/* Ensure that the address returned is DMA'ble */
-	if (!dma_capable(dev, dev_addr, size)) {
-		swiotlb_tbl_unmap_single(dev, map, size, dir);
-		return phys_to_dma(dev, io_tlb_overflow_buffer);
-	}
+	if (dma_capable(dev, dev_addr, size))
+		return dev_addr;
 
-	return dev_addr;
+	attrs |= DMA_ATTR_SKIP_CPU_SYNC;
+	swiotlb_tbl_unmap_single(dev, map, size, dir, attrs);
+
+	return phys_to_dma(dev, io_tlb_overflow_buffer);
 }
 EXPORT_SYMBOL_GPL(swiotlb_map_page);
 
@@ -782,14 +837,15 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page);
  * whatever the device wrote there.
  */
 static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
-			 size_t size, enum dma_data_direction dir)
+			 size_t size, enum dma_data_direction dir,
+			 unsigned long attrs)
 {
 	phys_addr_t paddr = dma_to_phys(hwdev, dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
 
 	if (is_swiotlb_buffer(paddr)) {
-		swiotlb_tbl_unmap_single(hwdev, paddr, size, dir);
+		swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
 		return;
 	}
 
@@ -809,7 +865,7 @@ void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
 			size_t size, enum dma_data_direction dir,
 			unsigned long attrs)
 {
-	unmap_single(hwdev, dev_addr, size, dir);
+	unmap_single(hwdev, dev_addr, size, dir, attrs);
 }
 EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
 
@@ -888,14 +944,15 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 		phys_addr_t paddr = sg_phys(sg);
 		dma_addr_t dev_addr = phys_to_dma(hwdev, paddr);
 
-		if (swiotlb_force ||
+		if (swiotlb_force == SWIOTLB_FORCE ||
 		    !dma_capable(hwdev, dev_addr, sg->length)) {
 			phys_addr_t map = map_single(hwdev, sg_phys(sg),
-						     sg->length, dir);
+						     sg->length, dir, attrs);
 			if (map == SWIOTLB_MAP_ERROR) {
 				/* Don't panic here, we expect map_sg users
 				   to do proper error handling. */
 				swiotlb_full(hwdev, sg->length, dir, 0);
+				attrs |= DMA_ATTR_SKIP_CPU_SYNC;
 				swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
 						       attrs);
 				sg_dma_len(sgl) = 0;
@@ -910,14 +967,6 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
 }
 EXPORT_SYMBOL(swiotlb_map_sg_attrs);
 
-int
-swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
-	       enum dma_data_direction dir)
-{
-	return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, 0);
-}
-EXPORT_SYMBOL(swiotlb_map_sg);
-
 /*
  * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
  * concerning calls here are the same as for swiotlb_unmap_page() above.
@@ -933,19 +982,11 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
 	BUG_ON(dir == DMA_NONE);
 
 	for_each_sg(sgl, sg, nelems, i)
-		unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir);
-
+		unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir,
+			     attrs);
 }
 EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
 
-void
-swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
-		 enum dma_data_direction dir)
-{
-	return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, 0);
-}
-EXPORT_SYMBOL(swiotlb_unmap_sg);
-
 /*
  * Make physical memory consistent for a set of streaming mode DMA translations
  * after a transfer.
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 93f45011a59d..0362da0b66c3 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -4831,7 +4831,7 @@ static struct bpf_test tests[] = {
 		{ },
 		INTERNAL,
 		{ 0x34 },
-		{ { 1, 0xbef } },
+		{ { ETH_HLEN, 0xbef } },
 		.fill_helper = bpf_fill_ld_abs_vlan_push_pop,
 	},
 	/*
@@ -5485,6 +5485,7 @@ static struct sk_buff *populate_skb(char *buf, int size)
 	skb->hash = SKB_HASH;
 	skb->queue_mapping = SKB_QUEUE_MAP;
 	skb->vlan_tci = SKB_VLAN_TCI;
+	skb->vlan_proto = htons(ETH_P_IP);
 	skb->dev = &dev;
 	skb->dev->ifindex = SKB_DEV_IFINDEX;
 	skb->dev->type = SKB_DEV_TYPE;
diff --git a/lib/test_firmware.c b/lib/test_firmware.c
index a3e8ec3fb1c5..09371b0a9baf 100644
--- a/lib/test_firmware.c
+++ b/lib/test_firmware.c
@@ -42,12 +42,6 @@ static const struct file_operations test_fw_fops = {
 	.read           = test_fw_misc_read,
 };
 
-static struct miscdevice test_fw_misc_device = {
-	.minor          = MISC_DYNAMIC_MINOR,
-	.name           = "test_firmware",
-	.fops           = &test_fw_fops,
-};
-
 static ssize_t trigger_request_store(struct device *dev,
 				     struct device_attribute *attr,
 				     const char *buf, size_t count)
@@ -132,39 +126,81 @@ out:
 }
 static DEVICE_ATTR_WO(trigger_async_request);
 
-static int __init test_firmware_init(void)
+static ssize_t trigger_custom_fallback_store(struct device *dev,
+					     struct device_attribute *attr,
+					     const char *buf, size_t count)
 {
 	int rc;
+	char *name;
 
-	rc = misc_register(&test_fw_misc_device);
+	name = kstrndup(buf, count, GFP_KERNEL);
+	if (!name)
+		return -ENOSPC;
+
+	pr_info("loading '%s' using custom fallback mechanism\n", name);
+
+	mutex_lock(&test_fw_mutex);
+	release_firmware(test_firmware);
+	test_firmware = NULL;
+	rc = request_firmware_nowait(THIS_MODULE, FW_ACTION_NOHOTPLUG, name,
+				     dev, GFP_KERNEL, NULL,
+				     trigger_async_request_cb);
 	if (rc) {
-		pr_err("could not register misc device: %d\n", rc);
-		return rc;
+		pr_info("async load of '%s' failed: %d\n", name, rc);
+		kfree(name);
+		goto out;
 	}
-	rc = device_create_file(test_fw_misc_device.this_device,
-				&dev_attr_trigger_request);
-	if (rc) {
-		pr_err("could not create sysfs interface: %d\n", rc);
-		goto dereg;
+	/* Free 'name' ASAP, to test for race conditions */
+	kfree(name);
+
+	wait_for_completion(&async_fw_done);
+
+	if (test_firmware) {
+		pr_info("loaded: %zu\n", test_firmware->size);
+		rc = count;
+	} else {
+		pr_err("failed to async load firmware\n");
+		rc = -ENODEV;
 	}
 
-	rc = device_create_file(test_fw_misc_device.this_device,
-				&dev_attr_trigger_async_request);
+out:
+	mutex_unlock(&test_fw_mutex);
+
+	return rc;
+}
+static DEVICE_ATTR_WO(trigger_custom_fallback);
+
+#define TEST_FW_DEV_ATTR(name)          &dev_attr_##name.attr
+
+static struct attribute *test_dev_attrs[] = {
+	TEST_FW_DEV_ATTR(trigger_request),
+	TEST_FW_DEV_ATTR(trigger_async_request),
+	TEST_FW_DEV_ATTR(trigger_custom_fallback),
+	NULL,
+};
+
+ATTRIBUTE_GROUPS(test_dev);
+
+static struct miscdevice test_fw_misc_device = {
+	.minor          = MISC_DYNAMIC_MINOR,
+	.name           = "test_firmware",
+	.fops           = &test_fw_fops,
+	.groups 	= test_dev_groups,
+};
+
+static int __init test_firmware_init(void)
+{
+	int rc;
+
+	rc = misc_register(&test_fw_misc_device);
 	if (rc) {
-		pr_err("could not create async sysfs interface: %d\n", rc);
-		goto remove_file;
+		pr_err("could not register misc device: %d\n", rc);
+		return rc;
 	}
 
 	pr_warn("interface ready\n");
 
 	return 0;
-
-remove_file:
-	device_remove_file(test_fw_misc_device.this_device,
-			   &dev_attr_trigger_async_request);
-dereg:
-	misc_deregister(&test_fw_misc_device);
-	return rc;
 }
 
 module_init(test_firmware_init);
@@ -172,10 +208,6 @@ module_init(test_firmware_init);
 static void __exit test_firmware_exit(void)
 {
 	release_firmware(test_firmware);
-	device_remove_file(test_fw_misc_device.this_device,
-			   &dev_attr_trigger_async_request);
-	device_remove_file(test_fw_misc_device.this_device,
-			   &dev_attr_trigger_request);
 	misc_deregister(&test_fw_misc_device);
 	pr_warn("removed interface\n");
 }
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 5e51872b3fc1..fbdf87920093 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -20,6 +20,11 @@
 #include <linux/uaccess.h>
 #include <linux/module.h>
 
+/*
+ * Note: test functions are marked noinline so that their names appear in
+ * reports.
+ */
+
 static noinline void __init kmalloc_oob_right(void)
 {
 	char *ptr;
@@ -411,6 +416,29 @@ static noinline void __init copy_user_test(void)
 	kfree(kmem);
 }
 
+static noinline void __init use_after_scope_test(void)
+{
+	volatile char *volatile p;
+
+	pr_info("use-after-scope on int\n");
+	{
+		int local = 0;
+
+		p = (char *)&local;
+	}
+	p[0] = 1;
+	p[3] = 1;
+
+	pr_info("use-after-scope on array\n");
+	{
+		char local[1024] = {0};
+
+		p = local;
+	}
+	p[0] = 1;
+	p[1023] = 1;
+}
+
 static int __init kmalloc_tests_init(void)
 {
 	kmalloc_oob_right();
@@ -436,6 +464,7 @@ static int __init kmalloc_tests_init(void)
 	kasan_global_oob();
 	ksize_unpoisons_memory();
 	copy_user_test();
+	use_after_scope_test();
 	return -EAGAIN;
 }
 
diff --git a/lib/test_parman.c b/lib/test_parman.c
new file mode 100644
index 000000000000..fe9f3a785804
--- /dev/null
+++ b/lib/test_parman.c
@@ -0,0 +1,395 @@
+/*
+ * lib/test_parman.c - Test module for parman
+ * Copyright (c) 2017 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2017 Jiri Pirko <jiri@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/parman.h>
+
+#define TEST_PARMAN_PRIO_SHIFT 7 /* defines number of prios for testing */
+#define TEST_PARMAN_PRIO_COUNT BIT(TEST_PARMAN_PRIO_SHIFT)
+#define TEST_PARMAN_PRIO_MASK (TEST_PARMAN_PRIO_COUNT - 1)
+
+#define TEST_PARMAN_ITEM_SHIFT 13 /* defines a total number
+				   * of items for testing
+				   */
+#define TEST_PARMAN_ITEM_COUNT BIT(TEST_PARMAN_ITEM_SHIFT)
+#define TEST_PARMAN_ITEM_MASK (TEST_PARMAN_ITEM_COUNT - 1)
+
+#define TEST_PARMAN_BASE_SHIFT 8
+#define TEST_PARMAN_BASE_COUNT BIT(TEST_PARMAN_BASE_SHIFT)
+#define TEST_PARMAN_RESIZE_STEP_SHIFT 7
+#define TEST_PARMAN_RESIZE_STEP_COUNT BIT(TEST_PARMAN_RESIZE_STEP_SHIFT)
+
+#define TEST_PARMAN_BULK_MAX_SHIFT (2 + TEST_PARMAN_RESIZE_STEP_SHIFT)
+#define TEST_PARMAN_BULK_MAX_COUNT BIT(TEST_PARMAN_BULK_MAX_SHIFT)
+#define TEST_PARMAN_BULK_MAX_MASK (TEST_PARMAN_BULK_MAX_COUNT - 1)
+
+#define TEST_PARMAN_RUN_BUDGET (TEST_PARMAN_ITEM_COUNT * 256)
+
+struct test_parman_prio {
+	struct parman_prio parman_prio;
+	unsigned long priority;
+};
+
+struct test_parman_item {
+	struct parman_item parman_item;
+	struct test_parman_prio *prio;
+	bool used;
+};
+
+struct test_parman {
+	struct parman *parman;
+	struct test_parman_item **prio_array;
+	unsigned long prio_array_limit;
+	struct test_parman_prio prios[TEST_PARMAN_PRIO_COUNT];
+	struct test_parman_item items[TEST_PARMAN_ITEM_COUNT];
+	struct rnd_state rnd;
+	unsigned long run_budget;
+	unsigned long bulk_budget;
+	bool bulk_noop;
+	unsigned int used_items;
+};
+
+#define ITEM_PTRS_SIZE(count) (sizeof(struct test_parman_item *) * (count))
+
+static int test_parman_resize(void *priv, unsigned long new_count)
+{
+	struct test_parman *test_parman = priv;
+	struct test_parman_item **prio_array;
+	unsigned long old_count;
+
+	prio_array = krealloc(test_parman->prio_array,
+			      ITEM_PTRS_SIZE(new_count), GFP_KERNEL);
+	if (new_count == 0)
+		return 0;
+	if (!prio_array)
+		return -ENOMEM;
+	old_count = test_parman->prio_array_limit;
+	if (new_count > old_count)
+		memset(&prio_array[old_count], 0,
+		       ITEM_PTRS_SIZE(new_count - old_count));
+	test_parman->prio_array = prio_array;
+	test_parman->prio_array_limit = new_count;
+	return 0;
+}
+
+static void test_parman_move(void *priv, unsigned long from_index,
+			     unsigned long to_index, unsigned long count)
+{
+	struct test_parman *test_parman = priv;
+	struct test_parman_item **prio_array = test_parman->prio_array;
+
+	memmove(&prio_array[to_index], &prio_array[from_index],
+		ITEM_PTRS_SIZE(count));
+	memset(&prio_array[from_index], 0, ITEM_PTRS_SIZE(count));
+}
+
+static const struct parman_ops test_parman_lsort_ops = {
+	.base_count	= TEST_PARMAN_BASE_COUNT,
+	.resize_step	= TEST_PARMAN_RESIZE_STEP_COUNT,
+	.resize		= test_parman_resize,
+	.move		= test_parman_move,
+	.algo		= PARMAN_ALGO_TYPE_LSORT,
+};
+
+static void test_parman_rnd_init(struct test_parman *test_parman)
+{
+	prandom_seed_state(&test_parman->rnd, 3141592653589793238ULL);
+}
+
+static u32 test_parman_rnd_get(struct test_parman *test_parman)
+{
+	return prandom_u32_state(&test_parman->rnd);
+}
+
+static unsigned long test_parman_priority_gen(struct test_parman *test_parman)
+{
+	unsigned long priority;
+	int i;
+
+again:
+	priority = test_parman_rnd_get(test_parman);
+	if (priority == 0)
+		goto again;
+
+	for (i = 0; i < TEST_PARMAN_PRIO_COUNT; i++) {
+		struct test_parman_prio *prio = &test_parman->prios[i];
+
+		if (prio->priority == 0)
+			break;
+		if (prio->priority == priority)
+			goto again;
+	}
+	return priority;
+}
+
+static void test_parman_prios_init(struct test_parman *test_parman)
+{
+	int i;
+
+	for (i = 0; i < TEST_PARMAN_PRIO_COUNT; i++) {
+		struct test_parman_prio *prio = &test_parman->prios[i];
+
+		/* Assign random uniqueue priority to each prio structure */
+		prio->priority = test_parman_priority_gen(test_parman);
+		parman_prio_init(test_parman->parman, &prio->parman_prio,
+				 prio->priority);
+	}
+}
+
+static void test_parman_prios_fini(struct test_parman *test_parman)
+{
+	int i;
+
+	for (i = 0; i < TEST_PARMAN_PRIO_COUNT; i++) {
+		struct test_parman_prio *prio = &test_parman->prios[i];
+
+		parman_prio_fini(&prio->parman_prio);
+	}
+}
+
+static void test_parman_items_init(struct test_parman *test_parman)
+{
+	int i;
+
+	for (i = 0; i < TEST_PARMAN_ITEM_COUNT; i++) {
+		struct test_parman_item *item = &test_parman->items[i];
+		unsigned int prio_index = test_parman_rnd_get(test_parman) &
+					  TEST_PARMAN_PRIO_MASK;
+
+		/* Assign random prio to each item structure */
+		item->prio = &test_parman->prios[prio_index];
+	}
+}
+
+static void test_parman_items_fini(struct test_parman *test_parman)
+{
+	int i;
+
+	for (i = 0; i < TEST_PARMAN_ITEM_COUNT; i++) {
+		struct test_parman_item *item = &test_parman->items[i];
+
+		if (!item->used)
+			continue;
+		parman_item_remove(test_parman->parman,
+				   &item->prio->parman_prio,
+				   &item->parman_item);
+	}
+}
+
+static struct test_parman *test_parman_create(const struct parman_ops *ops)
+{
+	struct test_parman *test_parman;
+	int err;
+
+	test_parman = kzalloc(sizeof(*test_parman), GFP_KERNEL);
+	if (!test_parman)
+		return ERR_PTR(-ENOMEM);
+	err = test_parman_resize(test_parman, TEST_PARMAN_BASE_COUNT);
+	if (err)
+		goto err_resize;
+	test_parman->parman = parman_create(ops, test_parman);
+	if (!test_parman->parman) {
+		err = -ENOMEM;
+		goto err_parman_create;
+	}
+	test_parman_rnd_init(test_parman);
+	test_parman_prios_init(test_parman);
+	test_parman_items_init(test_parman);
+	test_parman->run_budget = TEST_PARMAN_RUN_BUDGET;
+	return test_parman;
+
+err_parman_create:
+	test_parman_resize(test_parman, 0);
+err_resize:
+	kfree(test_parman);
+	return ERR_PTR(err);
+}
+
+static void test_parman_destroy(struct test_parman *test_parman)
+{
+	test_parman_items_fini(test_parman);
+	test_parman_prios_fini(test_parman);
+	parman_destroy(test_parman->parman);
+	test_parman_resize(test_parman, 0);
+	kfree(test_parman);
+}
+
+static bool test_parman_run_check_budgets(struct test_parman *test_parman)
+{
+	if (test_parman->run_budget-- == 0)
+		return false;
+	if (test_parman->bulk_budget-- != 0)
+		return true;
+
+	test_parman->bulk_budget = test_parman_rnd_get(test_parman) &
+				   TEST_PARMAN_BULK_MAX_MASK;
+	test_parman->bulk_noop = test_parman_rnd_get(test_parman) & 1;
+	return true;
+}
+
+static int test_parman_run(struct test_parman *test_parman)
+{
+	unsigned int i = test_parman_rnd_get(test_parman);
+	int err;
+
+	while (test_parman_run_check_budgets(test_parman)) {
+		unsigned int item_index = i++ & TEST_PARMAN_ITEM_MASK;
+		struct test_parman_item *item = &test_parman->items[item_index];
+
+		if (test_parman->bulk_noop)
+			continue;
+
+		if (!item->used) {
+			err = parman_item_add(test_parman->parman,
+					      &item->prio->parman_prio,
+					      &item->parman_item);
+			if (err)
+				return err;
+			test_parman->prio_array[item->parman_item.index] = item;
+			test_parman->used_items++;
+		} else {
+			test_parman->prio_array[item->parman_item.index] = NULL;
+			parman_item_remove(test_parman->parman,
+					   &item->prio->parman_prio,
+					   &item->parman_item);
+			test_parman->used_items--;
+		}
+		item->used = !item->used;
+	}
+	return 0;
+}
+
+static int test_parman_check_array(struct test_parman *test_parman,
+				   bool gaps_allowed)
+{
+	unsigned int last_unused_items = 0;
+	unsigned long last_priority = 0;
+	unsigned int used_items = 0;
+	int i;
+
+	if (test_parman->prio_array_limit < TEST_PARMAN_BASE_COUNT) {
+		pr_err("Array limit is lower than the base count (%lu < %lu)\n",
+		       test_parman->prio_array_limit, TEST_PARMAN_BASE_COUNT);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < test_parman->prio_array_limit; i++) {
+		struct test_parman_item *item = test_parman->prio_array[i];
+
+		if (!item) {
+			last_unused_items++;
+			continue;
+		}
+		if (last_unused_items && !gaps_allowed) {
+			pr_err("Gap found in array even though they are forbidden\n");
+			return -EINVAL;
+		}
+
+		last_unused_items = 0;
+		used_items++;
+
+		if (item->prio->priority < last_priority) {
+			pr_err("Item belongs under higher priority then the last one (current: %lu, previous: %lu)\n",
+			       item->prio->priority, last_priority);
+			return -EINVAL;
+		}
+		last_priority = item->prio->priority;
+
+		if (item->parman_item.index != i) {
+			pr_err("Item has different index in compare to where it actualy is (%lu != %d)\n",
+			       item->parman_item.index, i);
+			return -EINVAL;
+		}
+	}
+
+	if (used_items != test_parman->used_items) {
+		pr_err("Number of used items in array does not match (%u != %u)\n",
+		       used_items, test_parman->used_items);
+		return -EINVAL;
+	}
+
+	if (last_unused_items >= TEST_PARMAN_RESIZE_STEP_COUNT) {
+		pr_err("Number of unused item at the end of array is bigger than resize step (%u >= %lu)\n",
+		       last_unused_items, TEST_PARMAN_RESIZE_STEP_COUNT);
+		return -EINVAL;
+	}
+
+	pr_info("Priority array check successful\n");
+
+	return 0;
+}
+
+static int test_parman_lsort(void)
+{
+	struct test_parman *test_parman;
+	int err;
+
+	test_parman = test_parman_create(&test_parman_lsort_ops);
+	if (IS_ERR(test_parman))
+		return PTR_ERR(test_parman);
+
+	err = test_parman_run(test_parman);
+	if (err)
+		goto out;
+
+	err = test_parman_check_array(test_parman, false);
+	if (err)
+		goto out;
+out:
+	test_parman_destroy(test_parman);
+	return err;
+}
+
+static int __init test_parman_init(void)
+{
+	return test_parman_lsort();
+}
+
+static void __exit test_parman_exit(void)
+{
+}
+
+module_init(test_parman_init);
+module_exit(test_parman_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Test module for parman");
diff --git a/lib/test_siphash.c b/lib/test_siphash.c
new file mode 100644
index 000000000000..a6d854d933bf
--- /dev/null
+++ b/lib/test_siphash.c
@@ -0,0 +1,223 @@
+/* Test cases for siphash.c
+ *
+ * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.
+ *
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+ * This implementation is specifically for SipHash2-4 for a secure PRF
+ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
+ * hashtables.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/siphash.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+/* Test vectors taken from reference source available at:
+ *     https://github.com/veorq/SipHash
+ */
+
+static const siphash_key_t test_key_siphash =
+	{{ 0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL }};
+
+static const u64 test_vectors_siphash[64] = {
+	0x726fdb47dd0e0e31ULL, 0x74f839c593dc67fdULL, 0x0d6c8009d9a94f5aULL,
+	0x85676696d7fb7e2dULL, 0xcf2794e0277187b7ULL, 0x18765564cd99a68dULL,
+	0xcbc9466e58fee3ceULL, 0xab0200f58b01d137ULL, 0x93f5f5799a932462ULL,
+	0x9e0082df0ba9e4b0ULL, 0x7a5dbbc594ddb9f3ULL, 0xf4b32f46226bada7ULL,
+	0x751e8fbc860ee5fbULL, 0x14ea5627c0843d90ULL, 0xf723ca908e7af2eeULL,
+	0xa129ca6149be45e5ULL, 0x3f2acc7f57c29bdbULL, 0x699ae9f52cbe4794ULL,
+	0x4bc1b3f0968dd39cULL, 0xbb6dc91da77961bdULL, 0xbed65cf21aa2ee98ULL,
+	0xd0f2cbb02e3b67c7ULL, 0x93536795e3a33e88ULL, 0xa80c038ccd5ccec8ULL,
+	0xb8ad50c6f649af94ULL, 0xbce192de8a85b8eaULL, 0x17d835b85bbb15f3ULL,
+	0x2f2e6163076bcfadULL, 0xde4daaaca71dc9a5ULL, 0xa6a2506687956571ULL,
+	0xad87a3535c49ef28ULL, 0x32d892fad841c342ULL, 0x7127512f72f27cceULL,
+	0xa7f32346f95978e3ULL, 0x12e0b01abb051238ULL, 0x15e034d40fa197aeULL,
+	0x314dffbe0815a3b4ULL, 0x027990f029623981ULL, 0xcadcd4e59ef40c4dULL,
+	0x9abfd8766a33735cULL, 0x0e3ea96b5304a7d0ULL, 0xad0c42d6fc585992ULL,
+	0x187306c89bc215a9ULL, 0xd4a60abcf3792b95ULL, 0xf935451de4f21df2ULL,
+	0xa9538f0419755787ULL, 0xdb9acddff56ca510ULL, 0xd06c98cd5c0975ebULL,
+	0xe612a3cb9ecba951ULL, 0xc766e62cfcadaf96ULL, 0xee64435a9752fe72ULL,
+	0xa192d576b245165aULL, 0x0a8787bf8ecb74b2ULL, 0x81b3e73d20b49b6fULL,
+	0x7fa8220ba3b2eceaULL, 0x245731c13ca42499ULL, 0xb78dbfaf3a8d83bdULL,
+	0xea1ad565322a1a0bULL, 0x60e61c23a3795013ULL, 0x6606d7e446282b93ULL,
+	0x6ca4ecb15c5f91e1ULL, 0x9f626da15c9625f3ULL, 0xe51b38608ef25f57ULL,
+	0x958a324ceb064572ULL
+};
+
+#if BITS_PER_LONG == 64
+static const hsiphash_key_t test_key_hsiphash =
+	{{ 0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL }};
+
+static const u32 test_vectors_hsiphash[64] = {
+	0x050fc4dcU, 0x7d57ca93U, 0x4dc7d44dU,
+	0xe7ddf7fbU, 0x88d38328U, 0x49533b67U,
+	0xc59f22a7U, 0x9bb11140U, 0x8d299a8eU,
+	0x6c063de4U, 0x92ff097fU, 0xf94dc352U,
+	0x57b4d9a2U, 0x1229ffa7U, 0xc0f95d34U,
+	0x2a519956U, 0x7d908b66U, 0x63dbd80cU,
+	0xb473e63eU, 0x8d297d1cU, 0xa6cce040U,
+	0x2b45f844U, 0xa320872eU, 0xdae6c123U,
+	0x67349c8cU, 0x705b0979U, 0xca9913a5U,
+	0x4ade3b35U, 0xef6cd00dU, 0x4ab1e1f4U,
+	0x43c5e663U, 0x8c21d1bcU, 0x16a7b60dU,
+	0x7a8ff9bfU, 0x1f2a753eU, 0xbf186b91U,
+	0xada26206U, 0xa3c33057U, 0xae3a36a1U,
+	0x7b108392U, 0x99e41531U, 0x3f1ad944U,
+	0xc8138825U, 0xc28949a6U, 0xfaf8876bU,
+	0x9f042196U, 0x68b1d623U, 0x8b5114fdU,
+	0xdf074c46U, 0x12cc86b3U, 0x0a52098fU,
+	0x9d292f9aU, 0xa2f41f12U, 0x43a71ed0U,
+	0x73f0bce6U, 0x70a7e980U, 0x243c6d75U,
+	0xfdb71513U, 0xa67d8a08U, 0xb7e8f148U,
+	0xf7a644eeU, 0x0f1837f2U, 0x4b6694e0U,
+	0xb7bbb3a8U
+};
+#else
+static const hsiphash_key_t test_key_hsiphash =
+	{{ 0x03020100U, 0x07060504U }};
+
+static const u32 test_vectors_hsiphash[64] = {
+	0x5814c896U, 0xe7e864caU, 0xbc4b0e30U,
+	0x01539939U, 0x7e059ea6U, 0x88e3d89bU,
+	0xa0080b65U, 0x9d38d9d6U, 0x577999b1U,
+	0xc839caedU, 0xe4fa32cfU, 0x959246eeU,
+	0x6b28096cU, 0x66dd9cd6U, 0x16658a7cU,
+	0xd0257b04U, 0x8b31d501U, 0x2b1cd04bU,
+	0x06712339U, 0x522aca67U, 0x911bb605U,
+	0x90a65f0eU, 0xf826ef7bU, 0x62512debU,
+	0x57150ad7U, 0x5d473507U, 0x1ec47442U,
+	0xab64afd3U, 0x0a4100d0U, 0x6d2ce652U,
+	0x2331b6a3U, 0x08d8791aU, 0xbc6dda8dU,
+	0xe0f6c934U, 0xb0652033U, 0x9b9851ccU,
+	0x7c46fb7fU, 0x732ba8cbU, 0xf142997aU,
+	0xfcc9aa1bU, 0x05327eb2U, 0xe110131cU,
+	0xf9e5e7c0U, 0xa7d708a6U, 0x11795ab1U,
+	0x65671619U, 0x9f5fff91U, 0xd89c5267U,
+	0x007783ebU, 0x95766243U, 0xab639262U,
+	0x9c7e1390U, 0xc368dda6U, 0x38ddc455U,
+	0xfa13d379U, 0x979ea4e8U, 0x53ecd77eU,
+	0x2ee80657U, 0x33dbb66aU, 0xae3f0577U,
+	0x88b4c4ccU, 0x3e7f480bU, 0x74c1ebf8U,
+	0x87178304U
+};
+#endif
+
+static int __init siphash_test_init(void)
+{
+	u8 in[64] __aligned(SIPHASH_ALIGNMENT);
+	u8 in_unaligned[65] __aligned(SIPHASH_ALIGNMENT);
+	u8 i;
+	int ret = 0;
+
+	for (i = 0; i < 64; ++i) {
+		in[i] = i;
+		in_unaligned[i + 1] = i;
+		if (siphash(in, i, &test_key_siphash) !=
+						test_vectors_siphash[i]) {
+			pr_info("siphash self-test aligned %u: FAIL\n", i + 1);
+			ret = -EINVAL;
+		}
+		if (siphash(in_unaligned + 1, i, &test_key_siphash) !=
+						test_vectors_siphash[i]) {
+			pr_info("siphash self-test unaligned %u: FAIL\n", i + 1);
+			ret = -EINVAL;
+		}
+		if (hsiphash(in, i, &test_key_hsiphash) !=
+						test_vectors_hsiphash[i]) {
+			pr_info("hsiphash self-test aligned %u: FAIL\n", i + 1);
+			ret = -EINVAL;
+		}
+		if (hsiphash(in_unaligned + 1, i, &test_key_hsiphash) !=
+						test_vectors_hsiphash[i]) {
+			pr_info("hsiphash self-test unaligned %u: FAIL\n", i + 1);
+			ret = -EINVAL;
+		}
+	}
+	if (siphash_1u64(0x0706050403020100ULL, &test_key_siphash) !=
+						test_vectors_siphash[8]) {
+		pr_info("siphash self-test 1u64: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_2u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
+			 &test_key_siphash) != test_vectors_siphash[16]) {
+		pr_info("siphash self-test 2u64: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_3u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
+			 0x1716151413121110ULL, &test_key_siphash) !=
+						test_vectors_siphash[24]) {
+		pr_info("siphash self-test 3u64: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_4u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL,
+			 0x1716151413121110ULL, 0x1f1e1d1c1b1a1918ULL,
+			 &test_key_siphash) != test_vectors_siphash[32]) {
+		pr_info("siphash self-test 4u64: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_1u32(0x03020100U, &test_key_siphash) !=
+						test_vectors_siphash[4]) {
+		pr_info("siphash self-test 1u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_2u32(0x03020100U, 0x07060504U, &test_key_siphash) !=
+						test_vectors_siphash[8]) {
+		pr_info("siphash self-test 2u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_3u32(0x03020100U, 0x07060504U,
+			 0x0b0a0908U, &test_key_siphash) !=
+						test_vectors_siphash[12]) {
+		pr_info("siphash self-test 3u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (siphash_4u32(0x03020100U, 0x07060504U,
+			 0x0b0a0908U, 0x0f0e0d0cU, &test_key_siphash) !=
+						test_vectors_siphash[16]) {
+		pr_info("siphash self-test 4u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (hsiphash_1u32(0x03020100U, &test_key_hsiphash) !=
+						test_vectors_hsiphash[4]) {
+		pr_info("hsiphash self-test 1u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (hsiphash_2u32(0x03020100U, 0x07060504U, &test_key_hsiphash) !=
+						test_vectors_hsiphash[8]) {
+		pr_info("hsiphash self-test 2u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (hsiphash_3u32(0x03020100U, 0x07060504U,
+			  0x0b0a0908U, &test_key_hsiphash) !=
+						test_vectors_hsiphash[12]) {
+		pr_info("hsiphash self-test 3u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (hsiphash_4u32(0x03020100U, 0x07060504U,
+			  0x0b0a0908U, 0x0f0e0d0cU, &test_key_hsiphash) !=
+						test_vectors_hsiphash[16]) {
+		pr_info("hsiphash self-test 4u32: FAIL\n");
+		ret = -EINVAL;
+	}
+	if (!ret)
+		pr_info("self-tests: pass\n");
+	return ret;
+}
+
+static void __exit siphash_test_exit(void)
+{
+}
+
+module_init(siphash_test_init);
+module_exit(siphash_test_exit);
+
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c
index 0ecef3e4690e..6f335a3d4ae2 100644
--- a/lib/test_user_copy.c
+++ b/lib/test_user_copy.c
@@ -25,6 +25,23 @@
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 
+/*
+ * Several 32-bit architectures support 64-bit {get,put}_user() calls.
+ * As there doesn't appear to be anything that can safely determine
+ * their capability at compile-time, we just have to opt-out certain archs.
+ */
+#if BITS_PER_LONG == 64 || (!defined(CONFIG_AVR32)  &&		\
+			    !defined(CONFIG_BLACKFIN) &&	\
+			    !defined(CONFIG_M32R) &&		\
+			    !defined(CONFIG_M68K) &&		\
+			    !defined(CONFIG_MICROBLAZE) &&	\
+			    !defined(CONFIG_MN10300) &&		\
+			    !defined(CONFIG_NIOS2) &&		\
+			    !defined(CONFIG_PPC32) &&		\
+			    !defined(CONFIG_SUPERH))
+# define TEST_U64
+#endif
+
 #define test(condition, msg)		\
 ({					\
 	int cond = (condition);		\
@@ -40,7 +57,12 @@ static int __init test_user_copy_init(void)
 	char __user *usermem;
 	char *bad_usermem;
 	unsigned long user_addr;
-	unsigned long value = 0x5A;
+	u8 val_u8;
+	u16 val_u16;
+	u32 val_u32;
+#ifdef TEST_U64
+	u64 val_u64;
+#endif
 
 	kmem = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
 	if (!kmem)
@@ -58,33 +80,100 @@ static int __init test_user_copy_init(void)
 	usermem = (char __user *)user_addr;
 	bad_usermem = (char *)user_addr;
 
-	/* Legitimate usage: none of these should fail. */
-	ret |= test(copy_from_user(kmem, usermem, PAGE_SIZE),
-		    "legitimate copy_from_user failed");
+	/*
+	 * Legitimate usage: none of these copies should fail.
+	 */
+	memset(kmem, 0x3a, PAGE_SIZE * 2);
 	ret |= test(copy_to_user(usermem, kmem, PAGE_SIZE),
 		    "legitimate copy_to_user failed");
-	ret |= test(get_user(value, (unsigned long __user *)usermem),
-		    "legitimate get_user failed");
-	ret |= test(put_user(value, (unsigned long __user *)usermem),
-		    "legitimate put_user failed");
-
-	/* Invalid usage: none of these should succeed. */
+	memset(kmem, 0x0, PAGE_SIZE);
+	ret |= test(copy_from_user(kmem, usermem, PAGE_SIZE),
+		    "legitimate copy_from_user failed");
+	ret |= test(memcmp(kmem, kmem + PAGE_SIZE, PAGE_SIZE),
+		    "legitimate usercopy failed to copy data");
+
+#define test_legit(size, check)						  \
+	do {								  \
+		val_##size = check;					  \
+		ret |= test(put_user(val_##size, (size __user *)usermem), \
+		    "legitimate put_user (" #size ") failed");		  \
+		val_##size = 0;						  \
+		ret |= test(get_user(val_##size, (size __user *)usermem), \
+		    "legitimate get_user (" #size ") failed");		  \
+		ret |= test(val_##size != check,			  \
+		    "legitimate get_user (" #size ") failed to do copy"); \
+		if (val_##size != check) {				  \
+			pr_info("0x%llx != 0x%llx\n",			  \
+				(unsigned long long)val_##size,		  \
+				(unsigned long long)check);		  \
+		}							  \
+	} while (0)
+
+	test_legit(u8,  0x5a);
+	test_legit(u16, 0x5a5b);
+	test_legit(u32, 0x5a5b5c5d);
+#ifdef TEST_U64
+	test_legit(u64, 0x5a5b5c5d6a6b6c6d);
+#endif
+#undef test_legit
+
+	/*
+	 * Invalid usage: none of these copies should succeed.
+	 */
+
+	/* Prepare kernel memory with check values. */
+	memset(kmem, 0x5a, PAGE_SIZE);
+	memset(kmem + PAGE_SIZE, 0, PAGE_SIZE);
+
+	/* Reject kernel-to-kernel copies through copy_from_user(). */
 	ret |= test(!copy_from_user(kmem, (char __user *)(kmem + PAGE_SIZE),
 				    PAGE_SIZE),
 		    "illegal all-kernel copy_from_user passed");
+
+	/* Destination half of buffer should have been zeroed. */
+	ret |= test(memcmp(kmem + PAGE_SIZE, kmem, PAGE_SIZE),
+		    "zeroing failure for illegal all-kernel copy_from_user");
+
+#if 0
+	/*
+	 * When running with SMAP/PAN/etc, this will Oops the kernel
+	 * due to the zeroing of userspace memory on failure. This needs
+	 * to be tested in LKDTM instead, since this test module does not
+	 * expect to explode.
+	 */
 	ret |= test(!copy_from_user(bad_usermem, (char __user *)kmem,
 				    PAGE_SIZE),
 		    "illegal reversed copy_from_user passed");
+#endif
 	ret |= test(!copy_to_user((char __user *)kmem, kmem + PAGE_SIZE,
 				  PAGE_SIZE),
 		    "illegal all-kernel copy_to_user passed");
 	ret |= test(!copy_to_user((char __user *)kmem, bad_usermem,
 				  PAGE_SIZE),
 		    "illegal reversed copy_to_user passed");
-	ret |= test(!get_user(value, (unsigned long __user *)kmem),
-		    "illegal get_user passed");
-	ret |= test(!put_user(value, (unsigned long __user *)kmem),
-		    "illegal put_user passed");
+
+#define test_illegal(size, check)					    \
+	do {								    \
+		val_##size = (check);					    \
+		ret |= test(!get_user(val_##size, (size __user *)kmem),	    \
+		    "illegal get_user (" #size ") passed");		    \
+		ret |= test(val_##size != (size)0,			    \
+		    "zeroing failure for illegal get_user (" #size ")");    \
+		if (val_##size != (size)0) {				    \
+			pr_info("0x%llx != 0\n",			    \
+				(unsigned long long)val_##size);	    \
+		}							    \
+		ret |= test(!put_user(val_##size, (size __user *)kmem),	    \
+		    "illegal put_user (" #size ") passed");		    \
+	} while (0)
+
+	test_illegal(u8,  0x5a);
+	test_illegal(u16, 0x5a5b);
+	test_illegal(u32, 0x5a5b5c5d);
+#ifdef TEST_U64
+	test_illegal(u64, 0x5a5b5c5d6a6b6c6d);
+#endif
+#undef test_illegal
 
 	vm_munmap(user_addr, PAGE_SIZE * 2);
 	kfree(kmem);
diff --git a/lib/timerqueue.c b/lib/timerqueue.c
index 782ae8ca2c06..4a720ed4fdaf 100644
--- a/lib/timerqueue.c
+++ b/lib/timerqueue.c
@@ -48,7 +48,7 @@ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
 	while (*p) {
 		parent = *p;
 		ptr = rb_entry(parent, struct timerqueue_node, node);
-		if (node->expires.tv64 < ptr->expires.tv64)
+		if (node->expires < ptr->expires)
 			p = &(*p)->rb_left;
 		else
 			p = &(*p)->rb_right;
@@ -56,7 +56,7 @@ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
 	rb_link_node(&node->node, parent, p);
 	rb_insert_color(&node->node, &head->head);
 
-	if (!head->next || node->expires.tv64 < head->next->expires.tv64) {
+	if (!head->next || node->expires < head->next->expires) {
 		head->next = node;
 		return true;
 	}
@@ -80,8 +80,7 @@ bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
 	if (head->next == node) {
 		struct rb_node *rbn = rb_next(&node->node);
 
-		head->next = rbn ?
-			rb_entry(rbn, struct timerqueue_node, node) : NULL;
+		head->next = rb_entry_safe(rbn, struct timerqueue_node, node);
 	}
 	rb_erase(&node->node, &head->head);
 	RB_CLEAR_NODE(&node->node);
diff --git a/lib/win_minmax.c b/lib/win_minmax.c
new file mode 100644
index 000000000000..c8420d404926
--- /dev/null
+++ b/lib/win_minmax.c
@@ -0,0 +1,98 @@
+/**
+ * lib/minmax.c: windowed min/max tracker
+ *
+ * Kathleen Nichols' algorithm for tracking the minimum (or maximum)
+ * value of a data stream over some fixed time interval.  (E.g.,
+ * the minimum RTT over the past five minutes.) It uses constant
+ * space and constant time per update yet almost always delivers
+ * the same minimum as an implementation that has to keep all the
+ * data in the window.
+ *
+ * The algorithm keeps track of the best, 2nd best & 3rd best min
+ * values, maintaining an invariant that the measurement time of
+ * the n'th best >= n-1'th best. It also makes sure that the three
+ * values are widely separated in the time window since that bounds
+ * the worse case error when that data is monotonically increasing
+ * over the window.
+ *
+ * Upon getting a new min, we can forget everything earlier because
+ * it has no value - the new min is <= everything else in the window
+ * by definition and it's the most recent. So we restart fresh on
+ * every new min and overwrites 2nd & 3rd choices. The same property
+ * holds for 2nd & 3rd best.
+ */
+#include <linux/module.h>
+#include <linux/win_minmax.h>
+
+/* As time advances, update the 1st, 2nd, and 3rd choices. */
+static u32 minmax_subwin_update(struct minmax *m, u32 win,
+				const struct minmax_sample *val)
+{
+	u32 dt = val->t - m->s[0].t;
+
+	if (unlikely(dt > win)) {
+		/*
+		 * Passed entire window without a new val so make 2nd
+		 * choice the new val & 3rd choice the new 2nd choice.
+		 * we may have to iterate this since our 2nd choice
+		 * may also be outside the window (we checked on entry
+		 * that the third choice was in the window).
+		 */
+		m->s[0] = m->s[1];
+		m->s[1] = m->s[2];
+		m->s[2] = *val;
+		if (unlikely(val->t - m->s[0].t > win)) {
+			m->s[0] = m->s[1];
+			m->s[1] = m->s[2];
+			m->s[2] = *val;
+		}
+	} else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) {
+		/*
+		 * We've passed a quarter of the window without a new val
+		 * so take a 2nd choice from the 2nd quarter of the window.
+		 */
+		m->s[2] = m->s[1] = *val;
+	} else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) {
+		/*
+		 * We've passed half the window without finding a new val
+		 * so take a 3rd choice from the last half of the window
+		 */
+		m->s[2] = *val;
+	}
+	return m->s[0].v;
+}
+
+/* Check if new measurement updates the 1st, 2nd or 3rd choice max. */
+u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas)
+{
+	struct minmax_sample val = { .t = t, .v = meas };
+
+	if (unlikely(val.v >= m->s[0].v) ||	  /* found new max? */
+	    unlikely(val.t - m->s[2].t > win))	  /* nothing left in window? */
+		return minmax_reset(m, t, meas);  /* forget earlier samples */
+
+	if (unlikely(val.v >= m->s[1].v))
+		m->s[2] = m->s[1] = val;
+	else if (unlikely(val.v >= m->s[2].v))
+		m->s[2] = val;
+
+	return minmax_subwin_update(m, win, &val);
+}
+EXPORT_SYMBOL(minmax_running_max);
+
+/* Check if new measurement updates the 1st, 2nd or 3rd choice min. */
+u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas)
+{
+	struct minmax_sample val = { .t = t, .v = meas };
+
+	if (unlikely(val.v <= m->s[0].v) ||	  /* found new min? */
+	    unlikely(val.t - m->s[2].t > win))	  /* nothing left in window? */
+		return minmax_reset(m, t, meas);  /* forget earlier samples */
+
+	if (unlikely(val.v <= m->s[1].v))
+		m->s[2] = m->s[1] = val;
+	else if (unlikely(val.v <= m->s[2].v))
+		m->s[2] = val;
+
+	return minmax_subwin_update(m, win, &val);
+}