diff options
Diffstat (limited to 'lib')
54 files changed, 5946 insertions, 1186 deletions
diff --git a/lib/Kconfig b/lib/Kconfig index d79909dc01ec..5d644f180fe5 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -457,9 +457,6 @@ config NLATTR config GENERIC_ATOMIC64 bool -config ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - def_bool y if GENERIC_ATOMIC64 - config LRU_CACHE tristate @@ -550,4 +547,10 @@ config STACKDEPOT bool select STACKTRACE +config SBITMAP + bool + +config PARMAN + tristate "parman" + endmenu diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cab7405f48d2..66fb4389f05c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -13,7 +13,22 @@ config PRINTK_TIME be included, not that the timestamp is recorded. The behavior is also controlled by the kernel command line - parameter printk.time=1. See Documentation/kernel-parameters.txt + parameter printk.time=1. See Documentation/admin-guide/kernel-parameters.rst + +config CONSOLE_LOGLEVEL_DEFAULT + int "Default console loglevel (1-15)" + range 1 15 + default "7" + help + Default loglevel to determine what will be printed on the console. + + Setting a default here is equivalent to passing in loglevel=<x> in + the kernel bootargs. loglevel=<x> continues to override whatever + value is specified here as well. + + Note: This does not affect the log level of un-prefixed printk() + usage in the kernel. That is controlled by the MESSAGE_LOGLEVEL_DEFAULT + option. config MESSAGE_LOGLEVEL_DEFAULT int "Default message log level (1-7)" @@ -26,6 +41,10 @@ config MESSAGE_LOGLEVEL_DEFAULT that are auditing their logs closely may want to set it to a lower priority. + Note: This does not affect what message level gets printed on the console + by default. To change that, use loglevel=<x> in the kernel bootargs, + or pick a different CONSOLE_LOGLEVEL_DEFAULT configuration value. + config BOOT_PRINTK_DELAY bool "Delay each boot printk message by N milliseconds" depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY @@ -145,7 +164,7 @@ config DEBUG_INFO_REDUCED config DEBUG_INFO_SPLIT bool "Produce split debuginfo in .dwo files" - depends on DEBUG_INFO + depends on DEBUG_INFO && !FRV help Generate debug info into separate .dwo files. This significantly reduces the build directory size for builds with DEBUG_INFO, @@ -175,8 +194,8 @@ config GDB_SCRIPTS build directory. If you load vmlinux into gdb, the helper scripts will be automatically imported by gdb as well, and additional functions are available to analyze a Linux kernel - instance. See Documentation/gdb-kernel-debugging.txt for further - details. + instance. See Documentation/dev-tools/gdb-kernel-debugging.rst + for further details. config ENABLE_WARN_DEPRECATED bool "Enable __deprecated logic" @@ -198,6 +217,7 @@ config FRAME_WARN int "Warn for stack frames larger than (needs gcc 4.4)" range 0 8192 default 0 if KASAN + default 2048 if GCC_PLUGIN_LATENT_ENTROPY default 1024 if !64BIT default 2048 if 64BIT help @@ -305,7 +325,7 @@ config DEBUG_SECTION_MISMATCH a larger kernel). - Run the section mismatch analysis for each module/built-in.o file. When we run the section mismatch analysis on vmlinux.o, we - lose valueble information about where the mismatch was + lose valuable information about where the mismatch was introduced. Running the analysis for each module/built-in.o file tells where the mismatch happens much closer to the @@ -396,6 +416,16 @@ config MAGIC_SYSRQ_DEFAULT_ENABLE This may be set to 1 or 0 to enable or disable them all, or to a bitmask as described in Documentation/sysrq.txt. +config MAGIC_SYSRQ_SERIAL + bool "Enable magic SysRq key over serial" + depends on MAGIC_SYSRQ + default y + help + Many embedded boards have a disconnected TTL level serial which can + generate some garbage that can lead to spurious false sysrq detects. + This option allows you to decide whether you want to enable the + magic SysRq key. + config DEBUG_KERNEL bool "Kernel debugging" help @@ -522,7 +552,7 @@ config DEBUG_KMEMLEAK difference being that the orphan objects are not freed but only shown in /sys/kernel/debug/kmemleak. Enabling this feature will introduce an overhead to memory - allocations. See Documentation/kmemleak.txt for more + allocations. See Documentation/dev-tools/kmemleak.rst for more details. Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances @@ -602,9 +632,12 @@ config DEBUG_VM_PGFLAGS If unsure, say N. +config ARCH_HAS_DEBUG_VIRTUAL + bool + config DEBUG_VIRTUAL bool "Debug VM translations" - depends on DEBUG_KERNEL && X86 + depends on DEBUG_KERNEL && ARCH_HAS_DEBUG_VIRTUAL help Enable some costly sanity checks in virtual to page code. This can catch mistakes with virt_to_page() and friends. @@ -696,6 +729,19 @@ source "lib/Kconfig.kmemcheck" source "lib/Kconfig.kasan" +config DEBUG_REFCOUNT + bool "Verbose refcount checks" + help + Say Y here if you want reference counters (refcount_t and kref) to + generate WARNs on dubious usage. Without this refcount_t will still + be a saturating counter and avoid Use-After-Free by turning it into + a resource leak Denial-Of-Service. + + Use of this option will increase kernel text size but will alert the + admin of potential abuse. + + If in doubt, say "N". + endmenu # "Memory Debugging" config ARCH_HAS_KCOV @@ -719,7 +765,7 @@ config KCOV different machines and across reboots. If you need stable PC values, disable RANDOMIZE_BASE. - For more details, see Documentation/kcov.txt. + For more details, see Documentation/dev-tools/kcov.rst. config KCOV_INSTRUMENT_ALL bool "Instrument all code by default" @@ -960,20 +1006,6 @@ config DEBUG_TIMEKEEPING If unsure, say N. -config TIMER_STATS - bool "Collect kernel timers statistics" - depends on DEBUG_KERNEL && PROC_FS - help - If you say Y here, additional code will be inserted into the - timer routines to collect statistics about kernel timers being - reprogrammed. The statistics can be read from /proc/timer_stats. - The statistics collection is started by writing 1 to /proc/timer_stats, - writing 0 stops it. This feature is useful to collect information - about timer usage patterns in kernel and userspace. This feature - is lightweight if enabled in the kernel config but not activated - (it defaults to deactivated on bootup and will only be activated - if some application like powertop activates it explicitly). - config DEBUG_PREEMPT bool "Debug preemptible kernel" depends on DEBUG_KERNEL && PREEMPT && TRACE_IRQFLAGS_SUPPORT @@ -1084,6 +1116,9 @@ config PROVE_LOCKING For more details, see Documentation/locking/lockdep-design.txt. +config PROVE_LOCKING_SMALL + bool + config LOCKDEP bool depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT @@ -1157,6 +1192,18 @@ config LOCK_TORTURE_TEST Say M if you want these torture tests to build as a module. Say N if you are unsure. +config WW_MUTEX_SELFTEST + tristate "Wait/wound mutex selftests" + help + This option provides a kernel module that runs tests on the + on the struct ww_mutex locking API. + + It is recommended to enable DEBUG_WW_MUTEX_SLOWPATH in conjunction + with this test harness. + + Say M if you want these self tests to build as a module. + Say N if you are unsure. + endmenu # lock debugging config TRACE_IRQFLAGS @@ -1214,7 +1261,7 @@ config DEBUG_BUGVERBOSE config DEBUG_LIST bool "Debug linked list manipulation" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION help Enable this to turn on extended checks in the linked-list walking routines. @@ -1427,10 +1474,12 @@ config RCU_CPU_STALL_TIMEOUT config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL + default y if TREE_RCU select TRACE_CLOCK help This option provides tracing in RCU which presents stats - in debugfs for debugging RCU implementation. + in debugfs for debugging RCU implementation. It also enables + additional tracepoints for ftrace-style event tracing. Say Y here if you want to enable RCU tracing Say N if you are unsure. @@ -1514,30 +1563,6 @@ config NOTIFIER_ERROR_INJECTION Say N if unsure. -config CPU_NOTIFIER_ERROR_INJECT - tristate "CPU notifier error injection module" - depends on HOTPLUG_CPU && NOTIFIER_ERROR_INJECTION - help - This option provides a kernel module that can be used to test - the error handling of the cpu notifiers by injecting artificial - errors to CPU notifier chain callbacks. It is controlled through - debugfs interface under /sys/kernel/debug/notifier-error-inject/cpu - - If the notifier call chain should be failed with some events - notified, write the error code to "actions/<notifier event>/error". - - Example: Inject CPU offline error (-1 == -EPERM) - - # cd /sys/kernel/debug/notifier-error-inject/cpu - # echo -1 > actions/CPU_DOWN_PREPARE/error - # echo 0 > /sys/devices/system/cpu/cpu1/online - bash: echo: write error: Operation not permitted - - To compile this code as a module, choose M here: the module will - be called cpu-notifier-error-inject. - - If unsure, say N. - config PM_NOTIFIER_ERROR_INJECT tristate "PM notifier error injection module" depends on PM && NOTIFIER_ERROR_INJECTION @@ -1819,13 +1844,23 @@ config TEST_HASH tristate "Perform selftest on hash functions" default n help - Enable this option to test the kernel's integer (<linux/hash,h>) - and string (<linux/stringhash.h>) hash functions on boot - (or module load). + Enable this option to test the kernel's integer (<linux/hash.h>), + string (<linux/stringhash.h>), and siphash (<linux/siphash.h>) + hash functions on boot (or module load). This is intended to help people writing architecture-specific optimized versions. If unsure, say N. +config TEST_PARMAN + tristate "Perform selftest on priority array manager" + default n + depends on PARMAN + help + Enable this option to test priority array manager on boot + (or module load). + + If unsure, say N. + endmenu # runtime tests config PROVIDE_OHCI1394_DMA_INIT @@ -1857,15 +1892,6 @@ config PROVIDE_OHCI1394_DMA_INIT See Documentation/debugging-via-ohci1394.txt for more information. -config BUILD_DOCSRC - bool "Build targets in Documentation/ tree" - depends on HEADERS_CHECK - help - This option attempts to build objects from the source files in the - kernel Documentation/ tree. - - Say N if you are unsure. - config DMA_API_DEBUG bool "Enable debugging of DMA-API usage" depends on HAVE_DMA_API_DEBUG @@ -1969,6 +1995,16 @@ config TEST_STATIC_KEYS If unsure, say N. +config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select DEBUG_LIST + help + Select this option if the kernel should BUG when it encounters + data corruption in kernel memory structures when they get checked + for validity. + + If unsure, say N. + source "samples/Kconfig" source "lib/Kconfig.kgdb" @@ -1980,7 +2016,7 @@ config ARCH_HAS_DEVMEM_IS_ALLOWED config STRICT_DEVMEM bool "Filter access to /dev/mem" - depends on MMU + depends on MMU && DEVMEM depends on ARCH_HAS_DEVMEM_IS_ALLOWED default y if TILE || PPC ---help--- diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index bc6e651df68c..a669c193b878 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -10,7 +10,8 @@ config UBSAN This option enables undefined behaviour sanity checker Compile-time instrumentation is used to detect various undefined behaviours in runtime. Various types of checks may be enabled - via boot parameter ubsan_handle (see: Documentation/ubsan.txt). + via boot parameter ubsan_handle + (see: Documentation/dev-tools/ubsan.rst). config UBSAN_SANITIZE_ALL bool "Enable instrumentation for the entire kernel" diff --git a/lib/Makefile b/lib/Makefile index 5dc77a8ec297..6b768b58a38d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -22,7 +22,8 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o + earlycpio.o seq_buf.o siphash.o \ + nmi_backtrace.o nodemask.o win_minmax.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o @@ -31,7 +32,7 @@ lib-$(CONFIG_HAS_DMA) += dma-noop.o lib-y += kobject.o klist.o obj-y += lockref.o -obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ +obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \ bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ @@ -44,7 +45,7 @@ obj-$(CONFIG_TEST_HEXDUMP) += test_hexdump.o obj-y += kstrtox.o obj-$(CONFIG_TEST_BPF) += test_bpf.o obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o -obj-$(CONFIG_TEST_HASH) += test_hash.o +obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o obj-$(CONFIG_TEST_LKM) += test_module.o @@ -55,6 +56,7 @@ obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o obj-$(CONFIG_TEST_PRINTF) += test_printf.o obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o obj-$(CONFIG_TEST_UUID) += test_uuid.o +obj-$(CONFIG_TEST_PARMAN) += test_parman.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG @@ -128,7 +130,6 @@ obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o iommu-common.o obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o -obj-$(CONFIG_CPU_NOTIFIER_ERROR_INJECT) += cpu-notifier-error-inject.o obj-$(CONFIG_PM_NOTIFIER_ERROR_INJECT) += pm-notifier-error-inject.o obj-$(CONFIG_NETDEV_NOTIFIER_ERROR_INJECT) += netdev-notifier-error-inject.o obj-$(CONFIG_MEMORY_NOTIFIER_ERROR_INJECT) += memory-notifier-error-inject.o @@ -180,6 +181,7 @@ obj-$(CONFIG_IRQ_POLL) += irq_poll.o obj-$(CONFIG_STACKDEPOT) += stackdepot.o KASAN_SANITIZE_stackdepot.o := n +KCOV_INSTRUMENT_stackdepot.o := n libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ fdt_empty_tree.o @@ -227,3 +229,7 @@ obj-$(CONFIG_UCS2_STRING) += ucs2_string.o obj-$(CONFIG_UBSAN) += ubsan.o UBSAN_SANITIZE_ubsan.o := n + +obj-$(CONFIG_SBITMAP) += sbitmap.o + +obj-$(CONFIG_PARMAN) += parman.o diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index dbb369145dda..46042901130f 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -213,7 +213,6 @@ static __init void test_atomic64(void) r += one; BUG_ON(v.counter != r); -#ifdef CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE INIT(onestwos); BUG_ON(atomic64_dec_if_positive(&v) != (onestwos - 1)); r -= one; @@ -226,9 +225,6 @@ static __init void test_atomic64(void) INIT(-one); BUG_ON(atomic64_dec_if_positive(&v) != (-one - one)); BUG_ON(v.counter != r); -#else -#warning Please implement atomic64_dec_if_positive for your architecture and select the above Kconfig symbol -#endif INIT(onestwos); BUG_ON(!atomic64_inc_not_zero(&v)); diff --git a/lib/bitmap.c b/lib/bitmap.c index eca88087fa8a..0b66f0e5eb6b 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -496,6 +496,11 @@ EXPORT_SYMBOL(bitmap_print_to_pagebuf); * ranges. Consecutively set bits are shown as two hyphen-separated * decimal numbers, the smallest and largest bit numbers set in * the range. + * Optionally each range can be postfixed to denote that only parts of it + * should be set. The range will divided to groups of specific size. + * From each group will be used only defined amount of bits. + * Syntax: range:used_size/group_size + * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769 * * Returns 0 on success, -errno on invalid input strings. * Error values: @@ -507,16 +512,20 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, int is_user, unsigned long *maskp, int nmaskbits) { - unsigned a, b; + unsigned int a, b, old_a, old_b; + unsigned int group_size, used_size; int c, old_c, totaldigits, ndigits; const char __user __force *ubuf = (const char __user __force *)buf; - int at_start, in_range; + int at_start, in_range, in_partial_range; totaldigits = c = 0; + old_a = old_b = 0; + group_size = used_size = 0; bitmap_zero(maskp, nmaskbits); do { at_start = 1; in_range = 0; + in_partial_range = 0; a = b = 0; ndigits = totaldigits; @@ -547,6 +556,24 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, if ((totaldigits != ndigits) && isspace(old_c)) return -EINVAL; + if (c == '/') { + used_size = a; + at_start = 1; + in_range = 0; + a = b = 0; + continue; + } + + if (c == ':') { + old_a = a; + old_b = b; + at_start = 1; + in_range = 0; + in_partial_range = 1; + a = b = 0; + continue; + } + if (c == '-') { if (at_start || in_range) return -EINVAL; @@ -567,15 +594,30 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, } if (ndigits == totaldigits) continue; + if (in_partial_range) { + group_size = a; + a = old_a; + b = old_b; + old_a = old_b = 0; + } /* if no digit is after '-', it's wrong*/ if (at_start && in_range) return -EINVAL; - if (!(a <= b)) + if (!(a <= b) || !(used_size <= group_size)) return -EINVAL; if (b >= nmaskbits) return -ERANGE; while (a <= b) { - set_bit(a, maskp); + if (in_partial_range) { + static int pos_in_group = 1; + + if (pos_in_group <= used_size) + set_bit(a, maskp); + + if (a == b || ++pos_in_group > group_size) + pos_in_group = 1; + } else + set_bit(a, maskp); a++; } } while (buflen && c == ','); diff --git a/lib/cpu-notifier-error-inject.c b/lib/cpu-notifier-error-inject.c deleted file mode 100644 index 0e2c9a1e958a..000000000000 --- a/lib/cpu-notifier-error-inject.c +++ /dev/null @@ -1,84 +0,0 @@ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/cpu.h> - -#include "notifier-error-inject.h" - -static int priority; -module_param(priority, int, 0); -MODULE_PARM_DESC(priority, "specify cpu notifier priority"); - -#define UP_PREPARE 0 -#define UP_PREPARE_FROZEN 0 -#define DOWN_PREPARE 0 -#define DOWN_PREPARE_FROZEN 0 - -static struct notifier_err_inject cpu_notifier_err_inject = { - .actions = { - { NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE) }, - { NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE_FROZEN) }, - { NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE) }, - { NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE_FROZEN) }, - {} - } -}; - -static int notf_err_handle(struct notifier_err_inject_action *action) -{ - int ret; - - ret = action->error; - if (ret) - pr_info("Injecting error (%d) to %s\n", ret, action->name); - return ret; -} - -static int notf_err_inj_up_prepare(unsigned int cpu) -{ - if (!cpuhp_tasks_frozen) - return notf_err_handle(&cpu_notifier_err_inject.actions[0]); - else - return notf_err_handle(&cpu_notifier_err_inject.actions[1]); -} - -static int notf_err_inj_dead(unsigned int cpu) -{ - if (!cpuhp_tasks_frozen) - return notf_err_handle(&cpu_notifier_err_inject.actions[2]); - else - return notf_err_handle(&cpu_notifier_err_inject.actions[3]); -} - -static struct dentry *dir; - -static int err_inject_init(void) -{ - int err; - - dir = notifier_err_inject_init("cpu", notifier_err_inject_dir, - &cpu_notifier_err_inject, priority); - if (IS_ERR(dir)) - return PTR_ERR(dir); - - err = cpuhp_setup_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE, - "cpu-err-notif:prepare", - notf_err_inj_up_prepare, - notf_err_inj_dead); - if (err) - debugfs_remove_recursive(dir); - - return err; -} - -static void err_inject_exit(void) -{ - cpuhp_remove_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE); - debugfs_remove_recursive(dir); -} - -module_init(err_inject_init); -module_exit(err_inject_exit); - -MODULE_DESCRIPTION("CPU notifier error injection module"); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Akinobu Mita <akinobu.mita@gmail.com>"); diff --git a/lib/debugobjects.c b/lib/debugobjects.c index a8e12601eb37..8c28cbd7e104 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -52,9 +52,18 @@ static int debug_objects_fixups __read_mostly; static int debug_objects_warnings __read_mostly; static int debug_objects_enabled __read_mostly = CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT; - +static int debug_objects_pool_size __read_mostly + = ODEBUG_POOL_SIZE; +static int debug_objects_pool_min_level __read_mostly + = ODEBUG_POOL_MIN_LEVEL; static struct debug_obj_descr *descr_test __read_mostly; +/* + * Track numbers of kmem_cache_alloc()/free() calls done. + */ +static int debug_objects_allocated; +static int debug_objects_freed; + static void free_obj_work(struct work_struct *work); static DECLARE_WORK(debug_obj_work, free_obj_work); @@ -88,13 +97,13 @@ static void fill_pool(void) struct debug_obj *new; unsigned long flags; - if (likely(obj_pool_free >= ODEBUG_POOL_MIN_LEVEL)) + if (likely(obj_pool_free >= debug_objects_pool_min_level)) return; if (unlikely(!obj_cache)) return; - while (obj_pool_free < ODEBUG_POOL_MIN_LEVEL) { + while (obj_pool_free < debug_objects_pool_min_level) { new = kmem_cache_zalloc(obj_cache, gfp); if (!new) @@ -102,6 +111,7 @@ static void fill_pool(void) raw_spin_lock_irqsave(&pool_lock, flags); hlist_add_head(&new->node, &obj_pool); + debug_objects_allocated++; obj_pool_free++; raw_spin_unlock_irqrestore(&pool_lock, flags); } @@ -162,24 +172,39 @@ alloc_object(void *addr, struct debug_bucket *b, struct debug_obj_descr *descr) /* * workqueue function to free objects. + * + * To reduce contention on the global pool_lock, the actual freeing of + * debug objects will be delayed if the pool_lock is busy. We also free + * the objects in a batch of 4 for each lock/unlock cycle. */ +#define ODEBUG_FREE_BATCH 4 + static void free_obj_work(struct work_struct *work) { - struct debug_obj *obj; + struct debug_obj *objs[ODEBUG_FREE_BATCH]; unsigned long flags; + int i; - raw_spin_lock_irqsave(&pool_lock, flags); - while (obj_pool_free > ODEBUG_POOL_SIZE) { - obj = hlist_entry(obj_pool.first, typeof(*obj), node); - hlist_del(&obj->node); - obj_pool_free--; + if (!raw_spin_trylock_irqsave(&pool_lock, flags)) + return; + while (obj_pool_free >= debug_objects_pool_size + ODEBUG_FREE_BATCH) { + for (i = 0; i < ODEBUG_FREE_BATCH; i++) { + objs[i] = hlist_entry(obj_pool.first, + typeof(*objs[0]), node); + hlist_del(&objs[i]->node); + } + + obj_pool_free -= ODEBUG_FREE_BATCH; + debug_objects_freed += ODEBUG_FREE_BATCH; /* * We release pool_lock across kmem_cache_free() to * avoid contention on pool_lock. */ raw_spin_unlock_irqrestore(&pool_lock, flags); - kmem_cache_free(obj_cache, obj); - raw_spin_lock_irqsave(&pool_lock, flags); + for (i = 0; i < ODEBUG_FREE_BATCH; i++) + kmem_cache_free(obj_cache, objs[i]); + if (!raw_spin_trylock_irqsave(&pool_lock, flags)) + return; } raw_spin_unlock_irqrestore(&pool_lock, flags); } @@ -198,8 +223,8 @@ static void free_object(struct debug_obj *obj) * schedule work when the pool is filled and the cache is * initialized: */ - if (obj_pool_free > ODEBUG_POOL_SIZE && obj_cache) - sched = keventd_up(); + if (obj_pool_free > debug_objects_pool_size && obj_cache) + sched = 1; hlist_add_head(&obj->node, &obj_pool); obj_pool_free++; obj_pool_used--; @@ -362,6 +387,7 @@ void debug_object_init(void *addr, struct debug_obj_descr *descr) __debug_object_init(addr, descr, 0); } +EXPORT_SYMBOL_GPL(debug_object_init); /** * debug_object_init_on_stack - debug checks when an object on stack is @@ -376,6 +402,7 @@ void debug_object_init_on_stack(void *addr, struct debug_obj_descr *descr) __debug_object_init(addr, descr, 1); } +EXPORT_SYMBOL_GPL(debug_object_init_on_stack); /** * debug_object_activate - debug checks when an object is activated @@ -449,6 +476,7 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr) } return 0; } +EXPORT_SYMBOL_GPL(debug_object_activate); /** * debug_object_deactivate - debug checks when an object is deactivated @@ -496,6 +524,7 @@ void debug_object_deactivate(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); } +EXPORT_SYMBOL_GPL(debug_object_deactivate); /** * debug_object_destroy - debug checks when an object is destroyed @@ -542,6 +571,7 @@ void debug_object_destroy(void *addr, struct debug_obj_descr *descr) out_unlock: raw_spin_unlock_irqrestore(&db->lock, flags); } +EXPORT_SYMBOL_GPL(debug_object_destroy); /** * debug_object_free - debug checks when an object is freed @@ -582,6 +612,7 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr) out_unlock: raw_spin_unlock_irqrestore(&db->lock, flags); } +EXPORT_SYMBOL_GPL(debug_object_free); /** * debug_object_assert_init - debug checks when object should be init-ed @@ -626,6 +657,7 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr) raw_spin_unlock_irqrestore(&db->lock, flags); } +EXPORT_SYMBOL_GPL(debug_object_assert_init); /** * debug_object_active_state - debug checks object usage state machine @@ -673,6 +705,7 @@ debug_object_active_state(void *addr, struct debug_obj_descr *descr, raw_spin_unlock_irqrestore(&db->lock, flags); } +EXPORT_SYMBOL_GPL(debug_object_active_state); #ifdef CONFIG_DEBUG_OBJECTS_FREE static void __debug_check_no_obj_freed(const void *address, unsigned long size) @@ -750,6 +783,8 @@ static int debug_stats_show(struct seq_file *m, void *v) seq_printf(m, "pool_min_free :%d\n", obj_pool_min_free); seq_printf(m, "pool_used :%d\n", obj_pool_used); seq_printf(m, "pool_max_used :%d\n", obj_pool_max_used); + seq_printf(m, "objs_allocated:%d\n", debug_objects_allocated); + seq_printf(m, "objs_freed :%d\n", debug_objects_freed); return 0; } @@ -1108,4 +1143,11 @@ void __init debug_objects_mem_init(void) pr_warn("out of memory.\n"); } else debug_objects_selftest(); + + /* + * Increase the thresholds for allocating and freeing objects + * according to the number of possible CPUs available in the system. + */ + debug_objects_pool_size += num_possible_cpus() * 32; + debug_objects_pool_min_level += num_possible_cpus() * 4; } diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 06f02f6aecd2..60c57ec936db 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -44,6 +44,7 @@ enum { dma_debug_page, dma_debug_sg, dma_debug_coherent, + dma_debug_resource, }; enum map_err_types { @@ -151,8 +152,9 @@ static const char *const maperr2str[] = { [MAP_ERR_CHECKED] = "dma map error checked", }; -static const char *type2name[4] = { "single", "page", - "scather-gather", "coherent" }; +static const char *type2name[5] = { "single", "page", + "scather-gather", "coherent", + "resource" }; static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", "DMA_FROM_DEVICE", "DMA_NONE" }; @@ -400,6 +402,9 @@ static void hash_bucket_del(struct dma_debug_entry *entry) static unsigned long long phys_addr(struct dma_debug_entry *entry) { + if (entry->type == dma_debug_resource) + return __pfn_to_phys(entry->pfn) + entry->offset; + return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset; } @@ -1150,6 +1155,11 @@ static void check_unmap(struct dma_debug_entry *ref) dir2name[ref->direction]); } + /* + * Drivers should use dma_mapping_error() to check the returned + * addresses of dma_map_single() and dma_map_page(). + * If not, print this warning message. See Documentation/DMA-API.txt. + */ if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { err_printk(ref->dev, entry, "DMA-API: device driver failed to check map error" @@ -1519,6 +1529,49 @@ void debug_dma_free_coherent(struct device *dev, size_t size, } EXPORT_SYMBOL(debug_dma_free_coherent); +void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, + int direction, dma_addr_t dma_addr) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_resource; + entry->dev = dev; + entry->pfn = PHYS_PFN(addr); + entry->offset = offset_in_page(addr); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + entry->map_err_type = MAP_ERR_NOT_CHECKED; + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_map_resource); + +void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction) +{ + struct dma_debug_entry ref = { + .type = dma_debug_resource, + .dev = dev, + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_unmap_resource); + void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) { diff --git a/lib/extable.c b/lib/extable.c index 0be02ad561e9..62968daa66a9 100644 --- a/lib/extable.c +++ b/lib/extable.c @@ -12,7 +12,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/sort.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifndef ARCH_HAS_RELATIVE_EXTABLE #define ex_to_insn(x) ((x)->insn) diff --git a/lib/genalloc.c b/lib/genalloc.c index 0a1139644d32..144fe6b1a03e 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -292,7 +292,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, struct gen_pool_chunk *chunk; unsigned long addr = 0; int order = pool->min_alloc_order; - int nbits, start_bit = 0, end_bit, remain; + int nbits, start_bit, end_bit, remain; #ifndef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG BUG_ON(in_nmi()); @@ -307,6 +307,7 @@ unsigned long gen_pool_alloc_algo(struct gen_pool *pool, size_t size, if (size > atomic_read(&chunk->avail)) continue; + start_bit = 0; end_bit = chunk_size(chunk) >> order; retry: start_bit = algo(chunk->bits, end_bit, start_bit, diff --git a/lib/halfmd4.c b/lib/halfmd4.c deleted file mode 100644 index 137e861d9690..000000000000 --- a/lib/halfmd4.c +++ /dev/null @@ -1,67 +0,0 @@ -#include <linux/compiler.h> -#include <linux/export.h> -#include <linux/cryptohash.h> -#include <linux/bitops.h> - -/* F, G and H are basic MD4 functions: selection, majority, parity */ -#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) -#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) -#define H(x, y, z) ((x) ^ (y) ^ (z)) - -/* - * The generic round function. The application is so specific that - * we don't bother protecting all the arguments with parens, as is generally - * good macro practice, in favor of extra legibility. - * Rotation is separate from addition to prevent recomputation - */ -#define ROUND(f, a, b, c, d, x, s) \ - (a += f(b, c, d) + x, a = rol32(a, s)) -#define K1 0 -#define K2 013240474631UL -#define K3 015666365641UL - -/* - * Basic cut-down MD4 transform. Returns only 32 bits of result. - */ -__u32 half_md4_transform(__u32 buf[4], __u32 const in[8]) -{ - __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; - - /* Round 1 */ - ROUND(F, a, b, c, d, in[0] + K1, 3); - ROUND(F, d, a, b, c, in[1] + K1, 7); - ROUND(F, c, d, a, b, in[2] + K1, 11); - ROUND(F, b, c, d, a, in[3] + K1, 19); - ROUND(F, a, b, c, d, in[4] + K1, 3); - ROUND(F, d, a, b, c, in[5] + K1, 7); - ROUND(F, c, d, a, b, in[6] + K1, 11); - ROUND(F, b, c, d, a, in[7] + K1, 19); - - /* Round 2 */ - ROUND(G, a, b, c, d, in[1] + K2, 3); - ROUND(G, d, a, b, c, in[3] + K2, 5); - ROUND(G, c, d, a, b, in[5] + K2, 9); - ROUND(G, b, c, d, a, in[7] + K2, 13); - ROUND(G, a, b, c, d, in[0] + K2, 3); - ROUND(G, d, a, b, c, in[2] + K2, 5); - ROUND(G, c, d, a, b, in[4] + K2, 9); - ROUND(G, b, c, d, a, in[6] + K2, 13); - - /* Round 3 */ - ROUND(H, a, b, c, d, in[3] + K3, 3); - ROUND(H, d, a, b, c, in[7] + K3, 9); - ROUND(H, c, d, a, b, in[2] + K3, 11); - ROUND(H, b, c, d, a, in[6] + K3, 15); - ROUND(H, a, b, c, d, in[1] + K3, 3); - ROUND(H, d, a, b, c, in[5] + K3, 9); - ROUND(H, c, d, a, b, in[0] + K3, 11); - ROUND(H, b, c, d, a, in[4] + K3, 15); - - buf[0] += a; - buf[1] += b; - buf[2] += c; - buf[3] += d; - - return buf[1]; /* "most hashed" word */ -} -EXPORT_SYMBOL(half_md4_transform); diff --git a/lib/idr.c b/lib/idr.c index 6098336df267..52d2979a05e8 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -927,6 +927,9 @@ EXPORT_SYMBOL(ida_pre_get); * and go back to the ida_pre_get() call. If the ida is full, it will * return %-ENOSPC. * + * Note that callers must ensure that concurrent access to @ida is not possible. + * See ida_simple_get() for a varaint which takes care of locking. + * * @p_id returns a value in the range @starting_id ... %0x7fffffff. */ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) @@ -1073,6 +1076,9 @@ EXPORT_SYMBOL(ida_destroy); * Allocates an id in the range start <= id < end, or returns -ENOSPC. * On memory allocation failure, returns -ENOMEM. * + * Compared to ida_get_new_above() this function does its own locking, and + * should be used unless there are special requirements. + * * Use ida_simple_remove() to get rid of an id. */ int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end, @@ -1119,6 +1125,11 @@ EXPORT_SYMBOL(ida_simple_get); * ida_simple_remove - remove an allocated id. * @ida: the (initialized) ida. * @id: the id returned by ida_simple_get. + * + * Use to release an id allocated with ida_simple_get(). + * + * Compared to ida_remove() this function does its own locking, and should be + * used unless there are special requirements. */ void ida_simple_remove(struct ida *ida, unsigned int id) { diff --git a/lib/ioremap.c b/lib/ioremap.c index 86c8911b0e3a..a3e14ce92a56 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -144,4 +144,3 @@ int ioremap_page_range(unsigned long addr, return err; } -EXPORT_SYMBOL_GPL(ioremap_page_range); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 7e3138cfc8c9..e68604ae3ced 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1,10 +1,14 @@ #include <linux/export.h> +#include <linux/bvec.h> #include <linux/uio.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/splice.h> #include <net/checksum.h> +#define PIPE_PARANOIA /* for now */ + #define iterate_iovec(i, n, __v, __p, skip, STEP) { \ size_t left; \ size_t wanted = n; \ @@ -69,19 +73,21 @@ } #define iterate_all_kinds(i, n, v, I, B, K) { \ - size_t skip = i->iov_offset; \ - if (unlikely(i->type & ITER_BVEC)) { \ - struct bio_vec v; \ - struct bvec_iter __bi; \ - iterate_bvec(i, n, v, __bi, skip, (B)) \ - } else if (unlikely(i->type & ITER_KVEC)) { \ - const struct kvec *kvec; \ - struct kvec v; \ - iterate_kvec(i, n, v, kvec, skip, (K)) \ - } else { \ - const struct iovec *iov; \ - struct iovec v; \ - iterate_iovec(i, n, v, iov, skip, (I)) \ + if (likely(n)) { \ + size_t skip = i->iov_offset; \ + if (unlikely(i->type & ITER_BVEC)) { \ + struct bio_vec v; \ + struct bvec_iter __bi; \ + iterate_bvec(i, n, v, __bi, skip, (B)) \ + } else if (unlikely(i->type & ITER_KVEC)) { \ + const struct kvec *kvec; \ + struct kvec v; \ + iterate_kvec(i, n, v, kvec, skip, (K)) \ + } else { \ + const struct iovec *iov; \ + struct iovec v; \ + iterate_iovec(i, n, v, iov, skip, (I)) \ + } \ } \ } @@ -290,6 +296,93 @@ done: return wanted - bytes; } +#ifdef PIPE_PARANOIA +static bool sanity(const struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + int idx = i->idx; + int next = pipe->curbuf + pipe->nrbufs; + if (i->iov_offset) { + struct pipe_buffer *p; + if (unlikely(!pipe->nrbufs)) + goto Bad; // pipe must be non-empty + if (unlikely(idx != ((next - 1) & (pipe->buffers - 1)))) + goto Bad; // must be at the last buffer... + + p = &pipe->bufs[idx]; + if (unlikely(p->offset + p->len != i->iov_offset)) + goto Bad; // ... at the end of segment + } else { + if (idx != (next & (pipe->buffers - 1))) + goto Bad; // must be right after the last buffer + } + return true; +Bad: + printk(KERN_ERR "idx = %d, offset = %zd\n", i->idx, i->iov_offset); + printk(KERN_ERR "curbuf = %d, nrbufs = %d, buffers = %d\n", + pipe->curbuf, pipe->nrbufs, pipe->buffers); + for (idx = 0; idx < pipe->buffers; idx++) + printk(KERN_ERR "[%p %p %d %d]\n", + pipe->bufs[idx].ops, + pipe->bufs[idx].page, + pipe->bufs[idx].offset, + pipe->bufs[idx].len); + WARN_ON(1); + return false; +} +#else +#define sanity(i) true +#endif + +static inline int next_idx(int idx, struct pipe_inode_info *pipe) +{ + return (idx + 1) & (pipe->buffers - 1); +} + +static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + struct pipe_buffer *buf; + size_t off; + int idx; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + if (!sanity(i)) + return 0; + + off = i->iov_offset; + idx = i->idx; + buf = &pipe->bufs[idx]; + if (off) { + if (offset == off && buf->page == page) { + /* merge with the last one */ + buf->len += bytes; + i->iov_offset += bytes; + goto out; + } + idx = next_idx(idx, pipe); + buf = &pipe->bufs[idx]; + } + if (idx == pipe->curbuf && pipe->nrbufs) + return 0; + pipe->nrbufs++; + buf->ops = &page_cache_pipe_buf_ops; + get_page(buf->page = page); + buf->offset = offset; + buf->len = bytes; + i->iov_offset = offset + bytes; + i->idx = idx; +out: + i->count -= bytes; + return bytes; +} + /* * Fault in one or more iovecs of the given iov_iter, to a maximum length of * bytes. For each iovec, fault in each page that constitutes the iovec. @@ -306,8 +399,7 @@ int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) if (!(i->type & (ITER_BVEC|ITER_KVEC))) { iterate_iovec(i, bytes, v, iov, skip, ({ - err = fault_in_multipages_readable(v.iov_base, - v.iov_len); + err = fault_in_pages_readable(v.iov_base, v.iov_len); if (unlikely(err)) return err; 0;})) @@ -356,9 +448,98 @@ static void memzero_page(struct page *page, size_t offset, size_t len) kunmap_atomic(addr); } +static inline bool allocated(struct pipe_buffer *buf) +{ + return buf->ops == &default_pipe_buf_ops; +} + +static inline void data_start(const struct iov_iter *i, int *idxp, size_t *offp) +{ + size_t off = i->iov_offset; + int idx = i->idx; + if (off && (!allocated(&i->pipe->bufs[idx]) || off == PAGE_SIZE)) { + idx = next_idx(idx, i->pipe); + off = 0; + } + *idxp = idx; + *offp = off; +} + +static size_t push_pipe(struct iov_iter *i, size_t size, + int *idxp, size_t *offp) +{ + struct pipe_inode_info *pipe = i->pipe; + size_t off; + int idx; + ssize_t left; + + if (unlikely(size > i->count)) + size = i->count; + if (unlikely(!size)) + return 0; + + left = size; + data_start(i, &idx, &off); + *idxp = idx; + *offp = off; + if (off) { + left -= PAGE_SIZE - off; + if (left <= 0) { + pipe->bufs[idx].len += size; + return size; + } + pipe->bufs[idx].len = PAGE_SIZE; + idx = next_idx(idx, pipe); + } + while (idx != pipe->curbuf || !pipe->nrbufs) { + struct page *page = alloc_page(GFP_USER); + if (!page) + break; + pipe->nrbufs++; + pipe->bufs[idx].ops = &default_pipe_buf_ops; + pipe->bufs[idx].page = page; + pipe->bufs[idx].offset = 0; + if (left <= PAGE_SIZE) { + pipe->bufs[idx].len = left; + return size; + } + pipe->bufs[idx].len = PAGE_SIZE; + left -= PAGE_SIZE; + idx = next_idx(idx, pipe); + } + return size - left; +} + +static size_t copy_pipe_to_iter(const void *addr, size_t bytes, + struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + size_t n, off; + int idx; + + if (!sanity(i)) + return 0; + + bytes = n = push_pipe(i, bytes, &idx, &off); + if (unlikely(!n)) + return 0; + for ( ; n; idx = next_idx(idx, pipe), off = 0) { + size_t chunk = min_t(size_t, n, PAGE_SIZE - off); + memcpy_to_page(pipe->bufs[idx].page, off, addr, chunk); + i->idx = idx; + i->iov_offset = off + chunk; + n -= chunk; + addr += chunk; + } + i->count -= bytes; + return bytes; +} + size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { const char *from = addr; + if (unlikely(i->type & ITER_PIPE)) + return copy_pipe_to_iter(addr, bytes, i); iterate_and_advance(i, bytes, v, __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len), @@ -374,6 +555,10 @@ EXPORT_SYMBOL(copy_to_iter); size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } iterate_and_advance(i, bytes, v, __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -386,9 +571,38 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) } EXPORT_SYMBOL(copy_from_iter); +bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) +{ + char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return false; + } + if (unlikely(i->count < bytes)) + return false; + + iterate_all_kinds(i, bytes, v, ({ + if (__copy_from_user((to += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len)) + return false; + 0;}), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + + iov_iter_advance(i, bytes); + return true; +} +EXPORT_SYMBOL(copy_from_iter_full); + size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } iterate_and_advance(i, bytes, v, __copy_from_user_nocache((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -401,6 +615,30 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) } EXPORT_SYMBOL(copy_from_iter_nocache); +bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) +{ + char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return false; + } + if (unlikely(i->count < bytes)) + return false; + iterate_all_kinds(i, bytes, v, ({ + if (__copy_from_user_nocache((to += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len)) + return false; + 0;}), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + + iov_iter_advance(i, bytes); + return true; +} +EXPORT_SYMBOL(copy_from_iter_full_nocache); + size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { @@ -409,14 +647,20 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, size_t wanted = copy_to_iter(kaddr + offset, bytes, i); kunmap_atomic(kaddr); return wanted; - } else + } else if (likely(!(i->type & ITER_PIPE))) return copy_page_to_iter_iovec(page, offset, bytes, i); + else + return copy_page_to_iter_pipe(page, offset, bytes, i); } EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } if (i->type & (ITER_BVEC|ITER_KVEC)) { void *kaddr = kmap_atomic(page); size_t wanted = copy_from_iter(kaddr + offset, bytes, i); @@ -427,8 +671,34 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, } EXPORT_SYMBOL(copy_page_from_iter); +static size_t pipe_zero(size_t bytes, struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + size_t n, off; + int idx; + + if (!sanity(i)) + return 0; + + bytes = n = push_pipe(i, bytes, &idx, &off); + if (unlikely(!n)) + return 0; + + for ( ; n; idx = next_idx(idx, pipe), off = 0) { + size_t chunk = min_t(size_t, n, PAGE_SIZE - off); + memzero_page(pipe->bufs[idx].page, off, chunk); + i->idx = idx; + i->iov_offset = off + chunk; + n -= chunk; + } + i->count -= bytes; + return bytes; +} + size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { + if (unlikely(i->type & ITER_PIPE)) + return pipe_zero(bytes, i); iterate_and_advance(i, bytes, v, __clear_user(v.iov_base, v.iov_len), memzero_page(v.bv_page, v.bv_offset, v.bv_len), @@ -443,6 +713,11 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes) { char *kaddr = kmap_atomic(page), *p = kaddr + offset; + if (unlikely(i->type & ITER_PIPE)) { + kunmap_atomic(kaddr); + WARN_ON(1); + return 0; + } iterate_all_kinds(i, bytes, v, __copy_from_user_inatomic((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), @@ -455,8 +730,58 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, } EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); +static inline void pipe_truncate(struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + if (pipe->nrbufs) { + size_t off = i->iov_offset; + int idx = i->idx; + int nrbufs = (idx - pipe->curbuf) & (pipe->buffers - 1); + if (off) { + pipe->bufs[idx].len = off - pipe->bufs[idx].offset; + idx = next_idx(idx, pipe); + nrbufs++; + } + while (pipe->nrbufs > nrbufs) { + pipe_buf_release(pipe, &pipe->bufs[idx]); + idx = next_idx(idx, pipe); + pipe->nrbufs--; + } + } +} + +static void pipe_advance(struct iov_iter *i, size_t size) +{ + struct pipe_inode_info *pipe = i->pipe; + if (unlikely(i->count < size)) + size = i->count; + if (size) { + struct pipe_buffer *buf; + size_t off = i->iov_offset, left = size; + int idx = i->idx; + if (off) /* make it relative to the beginning of buffer */ + left += off - pipe->bufs[idx].offset; + while (1) { + buf = &pipe->bufs[idx]; + if (left <= buf->len) + break; + left -= buf->len; + idx = next_idx(idx, pipe); + } + i->idx = idx; + i->iov_offset = buf->offset + left; + } + i->count -= size; + /* ... and discard everything past that point */ + pipe_truncate(i); +} + void iov_iter_advance(struct iov_iter *i, size_t size) { + if (unlikely(i->type & ITER_PIPE)) { + pipe_advance(i, size); + return; + } iterate_and_advance(i, size, v, 0, 0, 0) } EXPORT_SYMBOL(iov_iter_advance); @@ -466,6 +791,8 @@ EXPORT_SYMBOL(iov_iter_advance); */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { + if (unlikely(i->type & ITER_PIPE)) + return i->count; // it is a silly place, anyway if (i->nr_segs == 1) return i->count; else if (i->type & ITER_BVEC) @@ -501,14 +828,30 @@ void iov_iter_bvec(struct iov_iter *i, int direction, } EXPORT_SYMBOL(iov_iter_bvec); +void iov_iter_pipe(struct iov_iter *i, int direction, + struct pipe_inode_info *pipe, + size_t count) +{ + BUG_ON(direction != ITER_PIPE); + WARN_ON(pipe->nrbufs == pipe->buffers); + i->type = direction; + i->pipe = pipe; + i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_pipe); + unsigned long iov_iter_alignment(const struct iov_iter *i) { unsigned long res = 0; size_t size = i->count; - if (!size) - return 0; - + if (unlikely(i->type & ITER_PIPE)) { + if (size && i->iov_offset && allocated(&i->pipe->bufs[i->idx])) + return size | i->iov_offset; + return size; + } iterate_all_kinds(i, size, v, (res |= (unsigned long)v.iov_base | v.iov_len, 0), res |= v.bv_offset | v.bv_len, @@ -520,10 +863,13 @@ EXPORT_SYMBOL(iov_iter_alignment); unsigned long iov_iter_gap_alignment(const struct iov_iter *i) { - unsigned long res = 0; + unsigned long res = 0; size_t size = i->count; - if (!size) - return 0; + + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return ~0U; + } iterate_all_kinds(i, size, v, (res |= (!res ? 0 : (unsigned long)v.iov_base) | @@ -533,20 +879,63 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) (res |= (!res ? 0 : (unsigned long)v.iov_base) | (size != v.iov_len ? size : 0)) ); - return res; + return res; } EXPORT_SYMBOL(iov_iter_gap_alignment); -ssize_t iov_iter_get_pages(struct iov_iter *i, +static inline size_t __pipe_get_pages(struct iov_iter *i, + size_t maxsize, + struct page **pages, + int idx, + size_t *start) +{ + struct pipe_inode_info *pipe = i->pipe; + ssize_t n = push_pipe(i, maxsize, &idx, start); + if (!n) + return -EFAULT; + + maxsize = n; + n += *start; + while (n > 0) { + get_page(*pages++ = pipe->bufs[idx].page); + idx = next_idx(idx, pipe); + n -= PAGE_SIZE; + } + + return maxsize; +} + +static ssize_t pipe_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { - if (maxsize > i->count) - maxsize = i->count; + unsigned npages; + size_t capacity; + int idx; if (!maxsize) return 0; + if (!sanity(i)) + return -EFAULT; + + data_start(i, &idx, start); + /* some of this one + all after this one */ + npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1; + capacity = min(npages,maxpages) * PAGE_SIZE - *start; + + return __pipe_get_pages(i, min(maxsize, capacity), pages, idx, start); +} + +ssize_t iov_iter_get_pages(struct iov_iter *i, + struct page **pages, size_t maxsize, unsigned maxpages, + size_t *start) +{ + if (maxsize > i->count) + maxsize = i->count; + + if (unlikely(i->type & ITER_PIPE)) + return pipe_get_pages(i, pages, maxsize, maxpages, start); iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); @@ -582,6 +971,40 @@ static struct page **get_pages_array(size_t n) return p; } +static ssize_t pipe_get_pages_alloc(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + struct page **p; + size_t n; + int idx; + int npages; + + if (!maxsize) + return 0; + + if (!sanity(i)) + return -EFAULT; + + data_start(i, &idx, start); + /* some of this one + all after this one */ + npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1; + n = npages * PAGE_SIZE - *start; + if (maxsize > n) + maxsize = n; + else + npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE); + p = get_pages_array(npages); + if (!p) + return -ENOMEM; + n = __pipe_get_pages(i, maxsize, p, idx, start); + if (n > 0) + *pages = p; + else + kvfree(p); + return n; +} + ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) @@ -591,9 +1014,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (maxsize > i->count) maxsize = i->count; - if (!maxsize) - return 0; - + if (unlikely(i->type & ITER_PIPE)) + return pipe_get_pages_alloc(i, pages, maxsize, start); iterate_all_kinds(i, maxsize, v, ({ unsigned long addr = (unsigned long)v.iov_base; size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); @@ -635,9 +1057,13 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return 0; + } iterate_and_advance(i, bytes, v, ({ int err = 0; - next = csum_and_copy_from_user(v.iov_base, + next = csum_and_copy_from_user(v.iov_base, (to += v.iov_len) - v.iov_len, v.iov_len, 0, &err); if (!err) { @@ -666,6 +1092,51 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, } EXPORT_SYMBOL(csum_and_copy_from_iter); +bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, + struct iov_iter *i) +{ + char *to = addr; + __wsum sum, next; + size_t off = 0; + sum = *csum; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return false; + } + if (unlikely(i->count < bytes)) + return false; + iterate_all_kinds(i, bytes, v, ({ + int err = 0; + next = csum_and_copy_from_user(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0, &err); + if (err) + return false; + sum = csum_block_add(sum, next, off); + off += v.iov_len; + 0; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck(p + v.bv_offset, + (to += v.bv_len) - v.bv_len, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; + }),({ + next = csum_partial_copy_nocheck(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0); + sum = csum_block_add(sum, next, off); + off += v.iov_len; + }) + ) + *csum = sum; + iov_iter_advance(i, bytes); + return true; +} +EXPORT_SYMBOL(csum_and_copy_from_iter_full); + size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) { @@ -673,10 +1144,14 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, __wsum sum, next; size_t off = 0; sum = *csum; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); /* for now */ + return 0; + } iterate_and_advance(i, bytes, v, ({ int err = 0; next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, - v.iov_base, + v.iov_base, v.iov_len, 0, &err); if (!err) { sum = csum_block_add(sum, next, off); @@ -712,7 +1187,20 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) if (!size) return 0; - iterate_all_kinds(i, size, v, ({ + if (unlikely(i->type & ITER_PIPE)) { + struct pipe_inode_info *pipe = i->pipe; + size_t off; + int idx; + + if (!sanity(i)) + return 0; + + data_start(i, &idx, &off); + /* some of this one + all after this one */ + npages = ((pipe->curbuf - idx - 1) & (pipe->buffers - 1)) + 1; + if (npages >= maxpages) + return maxpages; + } else iterate_all_kinds(i, size, v, ({ unsigned long p = (unsigned long)v.iov_base; npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) - p / PAGE_SIZE; @@ -737,6 +1225,10 @@ EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; + if (unlikely(new->type & ITER_PIPE)) { + WARN_ON(1); + return NULL; + } if (new->type & ITER_BVEC) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), @@ -749,6 +1241,28 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) } EXPORT_SYMBOL(dup_iter); +/** + * import_iovec() - Copy an array of &struct iovec from userspace + * into the kernel, check that it is valid, and initialize a new + * &struct iov_iter iterator to access it. + * + * @type: One of %READ or %WRITE. + * @uvector: Pointer to the userspace array. + * @nr_segs: Number of elements in userspace array. + * @fast_segs: Number of elements in @iov. + * @iov: (input and output parameter) Pointer to pointer to (usually small + * on-stack) kernel array. + * @i: Pointer to iterator that will be initialized on success. + * + * If the array pointed to by *@iov is large enough to hold all @nr_segs, + * then this function places %NULL in *@iov on return. Otherwise, a new + * array will be allocated and the result placed in *@iov. This means that + * the caller may call kfree() on *@iov regardless of whether the small + * on-stack array was used or not (and regardless of whether this function + * returns an error or not). + * + * Return: 0 on success or negative error code on error. + */ int import_iovec(int type, const struct iovec __user * uvector, unsigned nr_segs, unsigned fast_segs, struct iovec **iov, struct iov_iter *i) diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 2be55692aa43..1d6565e81030 100644 --- a/lib/irq_poll.c +++ b/lib/irq_poll.c @@ -74,7 +74,7 @@ void irq_poll_complete(struct irq_poll *iop) } EXPORT_SYMBOL(irq_poll_complete); -static void irq_poll_softirq(struct softirq_action *h) +static void __latent_entropy irq_poll_softirq(struct softirq_action *h) { struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); int rearm = 0, budget = irq_poll_budget; diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index f6c2c1e7779c..9a2b811966eb 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -56,7 +56,7 @@ static const char *kobject_actions[] = { * kobject_action_type - translate action string to numeric type * * @buf: buffer containing the action string, newline is ignored - * @len: length of buffer + * @count: length of buffer * @type: pointer to the location to store the action type * * Returns 0 if the action string was recognized. @@ -154,8 +154,8 @@ static void cleanup_uevent_env(struct subprocess_info *info) /** * kobject_uevent_env - send an uevent with environmental data * - * @action: action that is happening * @kobj: struct kobject that the action is happening to + * @action: action that is happening * @envp_ext: pointer to environmental data * * Returns 0 if kobject_uevent_env() is completed with success or the @@ -363,8 +363,8 @@ EXPORT_SYMBOL_GPL(kobject_uevent_env); /** * kobject_uevent - notify userspace by sending an uevent * - * @action: action that is happening * @kobj: struct kobject that the action is happening to + * @action: action that is happening * * Returns 0 if kobject_uevent() is completed with success or the * corresponding error when it fails. diff --git a/lib/kstrtox.c b/lib/kstrtox.c index d8a5cf66c316..bf85e05ce858 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -17,7 +17,7 @@ #include <linux/math64.h> #include <linux/export.h> #include <linux/types.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "kstrtox.h" const char *_parse_integer_fixup_radix(const char *s, unsigned int *base) @@ -48,11 +48,9 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long { unsigned long long res; unsigned int rv; - int overflow; res = 0; rv = 0; - overflow = 0; while (*s) { unsigned int val; @@ -71,15 +69,13 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long */ if (unlikely(res & (~0ull << 60))) { if (res > div_u64(ULLONG_MAX - val, base)) - overflow = 1; + rv |= KSTRTOX_OVERFLOW; } res = res * base + val; rv++; s++; } *p = res; - if (overflow) - rv |= KSTRTOX_OVERFLOW; return rv; } diff --git a/lib/list_debug.c b/lib/list_debug.c index 3859bf63561c..7f7bfa55eb6d 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -2,8 +2,7 @@ * Copyright 2006, Red Hat, Inc., Dave Jones * Released under the General Public License (GPL). * - * This file contains the linked list implementations for - * DEBUG_LIST. + * This file contains the linked list validation for DEBUG_LIST. */ #include <linux/export.h> @@ -13,88 +12,48 @@ #include <linux/rculist.h> /* - * Insert a new entry between two known consecutive entries. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! + * Check that the data structures for the list manipulations are reasonably + * valid. Failures here indicate memory corruption (and possibly an exploit + * attempt). */ -void __list_add(struct list_head *new, - struct list_head *prev, - struct list_head *next) +bool __list_add_valid(struct list_head *new, struct list_head *prev, + struct list_head *next) { - WARN(next->prev != prev, - "list_add corruption. next->prev should be " - "prev (%p), but was %p. (next=%p).\n", + CHECK_DATA_CORRUPTION(next->prev != prev, + "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", prev, next->prev, next); - WARN(prev->next != next, - "list_add corruption. prev->next should be " - "next (%p), but was %p. (prev=%p).\n", + CHECK_DATA_CORRUPTION(prev->next != next, + "list_add corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", next, prev->next, prev); - WARN(new == prev || new == next, - "list_add double add: new=%p, prev=%p, next=%p.\n", - new, prev, next); - next->prev = new; - new->next = next; - new->prev = prev; - WRITE_ONCE(prev->next, new); + CHECK_DATA_CORRUPTION(new == prev || new == next, + "list_add double add: new=%p, prev=%p, next=%p.\n", + new, prev, next); + + return true; } -EXPORT_SYMBOL(__list_add); +EXPORT_SYMBOL(__list_add_valid); -void __list_del_entry(struct list_head *entry) +bool __list_del_entry_valid(struct list_head *entry) { struct list_head *prev, *next; prev = entry->prev; next = entry->next; - if (WARN(next == LIST_POISON1, + CHECK_DATA_CORRUPTION(next == LIST_POISON1, "list_del corruption, %p->next is LIST_POISON1 (%p)\n", - entry, LIST_POISON1) || - WARN(prev == LIST_POISON2, + entry, LIST_POISON1); + CHECK_DATA_CORRUPTION(prev == LIST_POISON2, "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", - entry, LIST_POISON2) || - WARN(prev->next != entry, - "list_del corruption. prev->next should be %p, " - "but was %p\n", entry, prev->next) || - WARN(next->prev != entry, - "list_del corruption. next->prev should be %p, " - "but was %p\n", entry, next->prev)) - return; - - __list_del(prev, next); -} -EXPORT_SYMBOL(__list_del_entry); + entry, LIST_POISON2); + CHECK_DATA_CORRUPTION(prev->next != entry, + "list_del corruption. prev->next should be %p, but was %p\n", + entry, prev->next); + CHECK_DATA_CORRUPTION(next->prev != entry, + "list_del corruption. next->prev should be %p, but was %p\n", + entry, next->prev); + return true; -/** - * list_del - deletes entry from list. - * @entry: the element to delete from the list. - * Note: list_empty on entry does not return true after this, the entry is - * in an undefined state. - */ -void list_del(struct list_head *entry) -{ - __list_del_entry(entry); - entry->next = LIST_POISON1; - entry->prev = LIST_POISON2; -} -EXPORT_SYMBOL(list_del); - -/* - * RCU variants. - */ -void __list_add_rcu(struct list_head *new, - struct list_head *prev, struct list_head *next) -{ - WARN(next->prev != prev, - "list_add_rcu corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", - prev, next->prev, next); - WARN(prev->next != next, - "list_add_rcu corruption. prev->next should be next (%p), but was %p. (prev=%p).\n", - next, prev->next, prev); - new->next = next; - new->prev = prev; - rcu_assign_pointer(list_next_rcu(prev), new); - next->prev = new; } -EXPORT_SYMBOL(__list_add_rcu); +EXPORT_SYMBOL(__list_del_entry_valid); diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 872a15a2a637..f3a217ea0388 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -980,23 +980,23 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) #ifndef CONFIG_PROVE_LOCKING if (expected == FAILURE && debug_locks) { expected_testcase_failures++; - printk("failed|"); + pr_cont("failed|"); } else #endif if (debug_locks != expected) { unexpected_testcase_failures++; - printk("FAILED|"); + pr_cont("FAILED|"); dump_stack(); } else { testcase_successes++; - printk(" ok |"); + pr_cont(" ok |"); } testcase_total++; if (debug_locks_verbose) - printk(" lockclass mask: %x, debug_locks: %d, expected: %d\n", + pr_cont(" lockclass mask: %x, debug_locks: %d, expected: %d\n", lockclass_mask, debug_locks, expected); /* * Some tests (e.g. double-unlock) might corrupt the preemption @@ -1021,26 +1021,26 @@ static inline void print_testname(const char *testname) #define DO_TESTCASE_1(desc, name, nr) \ print_testname(desc"/"#nr); \ dotest(name##_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ - printk("\n"); + pr_cont("\n"); #define DO_TESTCASE_1B(desc, name, nr) \ print_testname(desc"/"#nr); \ dotest(name##_##nr, FAILURE, LOCKTYPE_RWLOCK); \ - printk("\n"); + pr_cont("\n"); #define DO_TESTCASE_3(desc, name, nr) \ print_testname(desc"/"#nr); \ dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN); \ dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ - printk("\n"); + pr_cont("\n"); #define DO_TESTCASE_3RW(desc, name, nr) \ print_testname(desc"/"#nr); \ dotest(name##_spin_##nr, FAILURE, LOCKTYPE_SPIN|LOCKTYPE_RWLOCK);\ dotest(name##_wlock_##nr, FAILURE, LOCKTYPE_RWLOCK); \ dotest(name##_rlock_##nr, SUCCESS, LOCKTYPE_RWLOCK); \ - printk("\n"); + pr_cont("\n"); #define DO_TESTCASE_6(desc, name) \ print_testname(desc); \ @@ -1050,7 +1050,7 @@ static inline void print_testname(const char *testname) dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ - printk("\n"); + pr_cont("\n"); #define DO_TESTCASE_6_SUCCESS(desc, name) \ print_testname(desc); \ @@ -1060,7 +1060,7 @@ static inline void print_testname(const char *testname) dotest(name##_mutex, SUCCESS, LOCKTYPE_MUTEX); \ dotest(name##_wsem, SUCCESS, LOCKTYPE_RWSEM); \ dotest(name##_rsem, SUCCESS, LOCKTYPE_RWSEM); \ - printk("\n"); + pr_cont("\n"); /* * 'read' variant: rlocks must not trigger. @@ -1073,7 +1073,7 @@ static inline void print_testname(const char *testname) dotest(name##_mutex, FAILURE, LOCKTYPE_MUTEX); \ dotest(name##_wsem, FAILURE, LOCKTYPE_RWSEM); \ dotest(name##_rsem, FAILURE, LOCKTYPE_RWSEM); \ - printk("\n"); + pr_cont("\n"); #define DO_TESTCASE_2I(desc, name, nr) \ DO_TESTCASE_1("hard-"desc, name##_hard, nr); \ @@ -1726,25 +1726,25 @@ static void ww_tests(void) dotest(ww_test_fail_acquire, SUCCESS, LOCKTYPE_WW); dotest(ww_test_normal, SUCCESS, LOCKTYPE_WW); dotest(ww_test_unneeded_slow, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("ww contexts mixing"); dotest(ww_test_two_contexts, FAILURE, LOCKTYPE_WW); dotest(ww_test_diff_class, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("finishing ww context"); dotest(ww_test_context_done_twice, FAILURE, LOCKTYPE_WW); dotest(ww_test_context_unlock_twice, FAILURE, LOCKTYPE_WW); dotest(ww_test_context_fini_early, FAILURE, LOCKTYPE_WW); dotest(ww_test_context_lock_after_done, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("locking mismatches"); dotest(ww_test_object_unlock_twice, FAILURE, LOCKTYPE_WW); dotest(ww_test_object_lock_unbalanced, FAILURE, LOCKTYPE_WW); dotest(ww_test_object_lock_stale_context, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("EDEADLK handling"); dotest(ww_test_edeadlk_normal, SUCCESS, LOCKTYPE_WW); @@ -1757,11 +1757,11 @@ static void ww_tests(void) dotest(ww_test_edeadlk_acquire_more_edeadlk_slow, FAILURE, LOCKTYPE_WW); dotest(ww_test_edeadlk_acquire_wrong, FAILURE, LOCKTYPE_WW); dotest(ww_test_edeadlk_acquire_wrong_slow, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("spinlock nest unlocked"); dotest(ww_test_spin_nest_unlocked, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); printk(" -----------------------------------------------------\n"); printk(" |block | try |context|\n"); @@ -1771,25 +1771,25 @@ static void ww_tests(void) dotest(ww_test_context_block, FAILURE, LOCKTYPE_WW); dotest(ww_test_context_try, SUCCESS, LOCKTYPE_WW); dotest(ww_test_context_context, SUCCESS, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("try"); dotest(ww_test_try_block, FAILURE, LOCKTYPE_WW); dotest(ww_test_try_try, SUCCESS, LOCKTYPE_WW); dotest(ww_test_try_context, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("block"); dotest(ww_test_block_block, FAILURE, LOCKTYPE_WW); dotest(ww_test_block_try, SUCCESS, LOCKTYPE_WW); dotest(ww_test_block_context, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); print_testname("spinlock"); dotest(ww_test_spin_block, FAILURE, LOCKTYPE_WW); dotest(ww_test_spin_try, SUCCESS, LOCKTYPE_WW); dotest(ww_test_spin_context, FAILURE, LOCKTYPE_WW); - printk("\n"); + pr_cont("\n"); } void locking_selftest(void) @@ -1829,32 +1829,32 @@ void locking_selftest(void) printk(" --------------------------------------------------------------------------\n"); print_testname("recursive read-lock"); - printk(" |"); + pr_cont(" |"); dotest(rlock_AA1, SUCCESS, LOCKTYPE_RWLOCK); - printk(" |"); + pr_cont(" |"); dotest(rsem_AA1, FAILURE, LOCKTYPE_RWSEM); - printk("\n"); + pr_cont("\n"); print_testname("recursive read-lock #2"); - printk(" |"); + pr_cont(" |"); dotest(rlock_AA1B, SUCCESS, LOCKTYPE_RWLOCK); - printk(" |"); + pr_cont(" |"); dotest(rsem_AA1B, FAILURE, LOCKTYPE_RWSEM); - printk("\n"); + pr_cont("\n"); print_testname("mixed read-write-lock"); - printk(" |"); + pr_cont(" |"); dotest(rlock_AA2, FAILURE, LOCKTYPE_RWLOCK); - printk(" |"); + pr_cont(" |"); dotest(rsem_AA2, FAILURE, LOCKTYPE_RWSEM); - printk("\n"); + pr_cont("\n"); print_testname("mixed write-read-lock"); - printk(" |"); + pr_cont(" |"); dotest(rlock_AA3, FAILURE, LOCKTYPE_RWLOCK); - printk(" |"); + pr_cont(" |"); dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM); - printk("\n"); + pr_cont("\n"); printk(" --------------------------------------------------------------------------\n"); diff --git a/lib/lockref.c b/lib/lockref.c index 5a92189ad711..c4bfcb8836cd 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -20,7 +20,7 @@ if (likely(old.lock_count == prev.lock_count)) { \ SUCCESS; \ } \ - cpu_relax_lowlatency(); \ + cpu_relax(); \ } \ } while (0) diff --git a/lib/mpi/mpi-pow.c b/lib/mpi/mpi-pow.c index 5464c8744ea9..e24388a863a7 100644 --- a/lib/mpi/mpi-pow.c +++ b/lib/mpi/mpi-pow.c @@ -64,8 +64,13 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod) if (!esize) { /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0 * depending on if MOD equals 1. */ - rp[0] = 1; res->nlimbs = (msize == 1 && mod->d[0] == 1) ? 0 : 1; + if (res->nlimbs) { + if (mpi_resize(res, 1) < 0) + goto enomem; + rp = res->d; + rp[0] = 1; + } res->sign = 0; goto leave; } diff --git a/lib/nlattr.c b/lib/nlattr.c index fce1e9afc6d9..b42b8577fc23 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -14,7 +14,7 @@ #include <linux/types.h> #include <net/netlink.h> -static const u16 nla_attr_minlen[NLA_TYPE_MAX+1] = { +static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 26caf51cc238..5f7999eacad5 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -16,21 +16,23 @@ #include <linux/delay.h> #include <linux/kprobes.h> #include <linux/nmi.h> +#include <linux/cpu.h> -#ifdef arch_trigger_all_cpu_backtrace +#ifdef arch_trigger_cpumask_backtrace /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -/* "in progress" flag of arch_trigger_all_cpu_backtrace */ +/* "in progress" flag of arch_trigger_cpumask_backtrace */ static unsigned long backtrace_flag; /* - * When raise() is called it will be is passed a pointer to the + * When raise() is called it will be passed a pointer to the * backtrace_mask. Architectures that call nmi_cpu_backtrace() * directly from their raise() functions may rely on the mask * they are passed being updated as a side effect of this call. */ -void nmi_trigger_all_cpu_backtrace(bool include_self, +void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, + bool exclude_self, void (*raise)(cpumask_t *mask)) { int i, this_cpu = get_cpu(); @@ -44,13 +46,22 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, return; } - cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); - if (!include_self) + cpumask_copy(to_cpumask(backtrace_mask), mask); + if (exclude_self) cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); + /* + * Don't try to send an NMI to this cpu; it may work on some + * architectures, but on others it may not, and we'll get + * information at least as useful just by doing a dump_stack() here. + * Note that nmi_cpu_backtrace(NULL) will clear the cpu bit. + */ + if (cpumask_test_cpu(this_cpu, to_cpumask(backtrace_mask))) + nmi_cpu_backtrace(NULL); + if (!cpumask_empty(to_cpumask(backtrace_mask))) { - pr_info("Sending NMI to %s CPUs:\n", - (include_self ? "all" : "other")); + pr_info("Sending NMI from CPU %d to CPUs %*pbl:\n", + this_cpu, nr_cpumask_bits, to_cpumask(backtrace_mask)); raise(to_cpumask(backtrace_mask)); } @@ -66,7 +77,7 @@ void nmi_trigger_all_cpu_backtrace(bool include_self, * Force flush any remote buffers that might be stuck in IRQ context * and therefore could not run their irq_work. */ - printk_nmi_flush(); + printk_safe_flush(); clear_bit_unlock(0, &backtrace_flag); put_cpu(); @@ -77,11 +88,16 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - pr_warn("NMI backtrace for cpu %d\n", cpu); - if (regs) - show_regs(regs); - else - dump_stack(); + if (regs && cpu_in_idle(instruction_pointer(regs))) { + pr_warn("NMI backtrace for cpu %d skipped: idling at pc %#lx\n", + cpu, instruction_pointer(regs)); + } else { + pr_warn("NMI backtrace for cpu %d\n", cpu); + if (regs) + show_regs(regs); + else + dump_stack(); + } cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } diff --git a/lib/parman.c b/lib/parman.c new file mode 100644 index 000000000000..c6e42a8db824 --- /dev/null +++ b/lib/parman.c @@ -0,0 +1,376 @@ +/* + * lib/parman.c - Manager for linear priority array areas + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Jiri Pirko <jiri@mellanox.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/list.h> +#include <linux/err.h> +#include <linux/parman.h> + +struct parman_algo { + int (*item_add)(struct parman *parman, struct parman_prio *prio, + struct parman_item *item); + void (*item_remove)(struct parman *parman, struct parman_prio *prio, + struct parman_item *item); +}; + +struct parman { + const struct parman_ops *ops; + void *priv; + const struct parman_algo *algo; + unsigned long count; + unsigned long limit_count; + struct list_head prio_list; +}; + +static int parman_enlarge(struct parman *parman) +{ + unsigned long new_count = parman->limit_count + + parman->ops->resize_step; + int err; + + err = parman->ops->resize(parman->priv, new_count); + if (err) + return err; + parman->limit_count = new_count; + return 0; +} + +static int parman_shrink(struct parman *parman) +{ + unsigned long new_count = parman->limit_count - + parman->ops->resize_step; + int err; + + if (new_count < parman->ops->base_count) + return 0; + err = parman->ops->resize(parman->priv, new_count); + if (err) + return err; + parman->limit_count = new_count; + return 0; +} + +static bool parman_prio_used(struct parman_prio *prio) + +{ + return !list_empty(&prio->item_list); +} + +static struct parman_item *parman_prio_first_item(struct parman_prio *prio) +{ + return list_first_entry(&prio->item_list, + typeof(struct parman_item), list); +} + +static unsigned long parman_prio_first_index(struct parman_prio *prio) +{ + return parman_prio_first_item(prio)->index; +} + +static struct parman_item *parman_prio_last_item(struct parman_prio *prio) +{ + return list_last_entry(&prio->item_list, + typeof(struct parman_item), list); +} + +static unsigned long parman_prio_last_index(struct parman_prio *prio) +{ + return parman_prio_last_item(prio)->index; +} + +static unsigned long parman_lsort_new_index_find(struct parman *parman, + struct parman_prio *prio) +{ + list_for_each_entry_from_reverse(prio, &parman->prio_list, list) { + if (!parman_prio_used(prio)) + continue; + return parman_prio_last_index(prio) + 1; + } + return 0; +} + +static void __parman_prio_move(struct parman *parman, struct parman_prio *prio, + struct parman_item *item, unsigned long to_index, + unsigned long count) +{ + parman->ops->move(parman->priv, item->index, to_index, count); +} + +static void parman_prio_shift_down(struct parman *parman, + struct parman_prio *prio) +{ + struct parman_item *item; + unsigned long to_index; + + if (!parman_prio_used(prio)) + return; + item = parman_prio_first_item(prio); + to_index = parman_prio_last_index(prio) + 1; + __parman_prio_move(parman, prio, item, to_index, 1); + list_move_tail(&item->list, &prio->item_list); + item->index = to_index; +} + +static void parman_prio_shift_up(struct parman *parman, + struct parman_prio *prio) +{ + struct parman_item *item; + unsigned long to_index; + + if (!parman_prio_used(prio)) + return; + item = parman_prio_last_item(prio); + to_index = parman_prio_first_index(prio) - 1; + __parman_prio_move(parman, prio, item, to_index, 1); + list_move(&item->list, &prio->item_list); + item->index = to_index; +} + +static void parman_prio_item_remove(struct parman *parman, + struct parman_prio *prio, + struct parman_item *item) +{ + struct parman_item *last_item; + unsigned long to_index; + + last_item = parman_prio_last_item(prio); + if (last_item == item) { + list_del(&item->list); + return; + } + to_index = item->index; + __parman_prio_move(parman, prio, last_item, to_index, 1); + list_del(&last_item->list); + list_replace(&item->list, &last_item->list); + last_item->index = to_index; +} + +static int parman_lsort_item_add(struct parman *parman, + struct parman_prio *prio, + struct parman_item *item) +{ + struct parman_prio *prio2; + unsigned long new_index; + int err; + + if (parman->count + 1 > parman->limit_count) { + err = parman_enlarge(parman); + if (err) + return err; + } + + new_index = parman_lsort_new_index_find(parman, prio); + list_for_each_entry_reverse(prio2, &parman->prio_list, list) { + if (prio2 == prio) + break; + parman_prio_shift_down(parman, prio2); + } + item->index = new_index; + list_add_tail(&item->list, &prio->item_list); + parman->count++; + return 0; +} + +static void parman_lsort_item_remove(struct parman *parman, + struct parman_prio *prio, + struct parman_item *item) +{ + parman_prio_item_remove(parman, prio, item); + list_for_each_entry_continue(prio, &parman->prio_list, list) + parman_prio_shift_up(parman, prio); + parman->count--; + if (parman->limit_count - parman->count >= parman->ops->resize_step) + parman_shrink(parman); +} + +static const struct parman_algo parman_lsort = { + .item_add = parman_lsort_item_add, + .item_remove = parman_lsort_item_remove, +}; + +static const struct parman_algo *parman_algos[] = { + &parman_lsort, +}; + +/** + * parman_create - creates a new parman instance + * @ops: caller-specific callbacks + * @priv: pointer to a private data passed to the ops + * + * Note: all locking must be provided by the caller. + * + * Each parman instance manages an array area with chunks of entries + * with the same priority. Consider following example: + * + * item 1 with prio 10 + * item 2 with prio 10 + * item 3 with prio 10 + * item 4 with prio 20 + * item 5 with prio 20 + * item 6 with prio 30 + * item 7 with prio 30 + * item 8 with prio 30 + * + * In this example, there are 3 priority chunks. The order of the priorities + * matters, however the order of items within a single priority chunk does not + * matter. So the same array could be ordered as follows: + * + * item 2 with prio 10 + * item 3 with prio 10 + * item 1 with prio 10 + * item 5 with prio 20 + * item 4 with prio 20 + * item 7 with prio 30 + * item 8 with prio 30 + * item 6 with prio 30 + * + * The goal of parman is to maintain the priority ordering. The caller + * provides @ops with callbacks parman uses to move the items + * and resize the array area. + * + * Returns a pointer to newly created parman instance in case of success, + * otherwise it returns NULL. + */ +struct parman *parman_create(const struct parman_ops *ops, void *priv) +{ + struct parman *parman; + + parman = kzalloc(sizeof(*parman), GFP_KERNEL); + if (!parman) + return NULL; + INIT_LIST_HEAD(&parman->prio_list); + parman->ops = ops; + parman->priv = priv; + parman->limit_count = ops->base_count; + parman->algo = parman_algos[ops->algo]; + return parman; +} +EXPORT_SYMBOL(parman_create); + +/** + * parman_destroy - destroys existing parman instance + * @parman: parman instance + * + * Note: all locking must be provided by the caller. + */ +void parman_destroy(struct parman *parman) +{ + WARN_ON(!list_empty(&parman->prio_list)); + kfree(parman); +} +EXPORT_SYMBOL(parman_destroy); + +/** + * parman_prio_init - initializes a parman priority chunk + * @parman: parman instance + * @prio: parman prio structure to be initialized + * @prority: desired priority of the chunk + * + * Note: all locking must be provided by the caller. + * + * Before caller could add an item with certain priority, he has to + * initialize a priority chunk for it using this function. + */ +void parman_prio_init(struct parman *parman, struct parman_prio *prio, + unsigned long priority) +{ + struct parman_prio *prio2; + struct list_head *pos; + + INIT_LIST_HEAD(&prio->item_list); + prio->priority = priority; + + /* Position inside the list according to priority */ + list_for_each(pos, &parman->prio_list) { + prio2 = list_entry(pos, typeof(*prio2), list); + if (prio2->priority > prio->priority) + break; + } + list_add_tail(&prio->list, pos); +} +EXPORT_SYMBOL(parman_prio_init); + +/** + * parman_prio_fini - finalizes use of parman priority chunk + * @prio: parman prio structure + * + * Note: all locking must be provided by the caller. + */ +void parman_prio_fini(struct parman_prio *prio) +{ + WARN_ON(parman_prio_used(prio)); + list_del(&prio->list); +} +EXPORT_SYMBOL(parman_prio_fini); + +/** + * parman_item_add - adds a parman item under defined priority + * @parman: parman instance + * @prio: parman prio instance to add the item to + * @item: parman item instance + * + * Note: all locking must be provided by the caller. + * + * Adds item to a array managed by parman instance under the specified priority. + * + * Returns 0 in case of success, negative number to indicate an error. + */ +int parman_item_add(struct parman *parman, struct parman_prio *prio, + struct parman_item *item) +{ + return parman->algo->item_add(parman, prio, item); +} +EXPORT_SYMBOL(parman_item_add); + +/** + * parman_item_del - deletes parman item + * @parman: parman instance + * @prio: parman prio instance to delete the item from + * @item: parman item instance + * + * Note: all locking must be provided by the caller. + */ +void parman_item_remove(struct parman *parman, struct parman_prio *prio, + struct parman_item *item) +{ + parman->algo->item_remove(parman, prio, item); +} +EXPORT_SYMBOL(parman_item_remove); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>"); +MODULE_DESCRIPTION("Priority-based array manager"); diff --git a/lib/parser.c b/lib/parser.c index b6d11631231b..3278958b472a 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -152,6 +152,36 @@ static int match_number(substring_t *s, int *result, int base) } /** + * match_u64int: scan a number in the given base from a substring_t + * @s: substring to be scanned + * @result: resulting u64 on success + * @base: base to use when converting string + * + * Description: Given a &substring_t and a base, attempts to parse the substring + * as a number in that base. On success, sets @result to the integer represented + * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. + */ +static int match_u64int(substring_t *s, u64 *result, int base) +{ + char *buf; + int ret; + u64 val; + size_t len = s->to - s->from; + + buf = kmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, s->from, len); + buf[len] = '\0'; + + ret = kstrtoull(buf, base, &val); + if (!ret) + *result = val; + kfree(buf); + return ret; +} + +/** * match_int: - scan a decimal representation of an integer from a substring_t * @s: substring_t to be scanned * @result: resulting integer on success @@ -167,6 +197,23 @@ int match_int(substring_t *s, int *result) EXPORT_SYMBOL(match_int); /** + * match_u64: - scan a decimal representation of a u64 from + * a substring_t + * @s: substring_t to be scanned + * @result: resulting unsigned long long on success + * + * Description: Attempts to parse the &substring_t @s as a long decimal + * integer. On success, sets @result to the integer represented by the + * string and returns 0. + * Returns -ENOMEM, -EINVAL, or -ERANGE on failure. + */ +int match_u64(substring_t *s, u64 *result) +{ + return match_u64int(s, result, 0); +} +EXPORT_SYMBOL(match_u64); + +/** * match_octal: - scan an octal representation of an integer from a substring_t * @s: substring_t to be scanned * @result: resulting integer on success diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 27fe74948882..9ac959ef4cae 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -33,6 +33,7 @@ #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) +static DEFINE_SPINLOCK(percpu_ref_switch_lock); static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) @@ -82,6 +83,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, atomic_long_set(&ref->count, start_count); ref->release = release; + ref->confirm_switch = NULL; return 0; } EXPORT_SYMBOL_GPL(percpu_ref_init); @@ -101,6 +103,8 @@ void percpu_ref_exit(struct percpu_ref *ref) unsigned long __percpu *percpu_count = percpu_count_ptr(ref); if (percpu_count) { + /* non-NULL confirm_switch indicates switching in progress */ + WARN_ON_ONCE(ref->confirm_switch); free_percpu(percpu_count); ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; } @@ -161,66 +165,23 @@ static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch) { - if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) { - /* switching from percpu to atomic */ - ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; - - /* - * Non-NULL ->confirm_switch is used to indicate that - * switching is in progress. Use noop one if unspecified. - */ - WARN_ON_ONCE(ref->confirm_switch); - ref->confirm_switch = - confirm_switch ?: percpu_ref_noop_confirm_switch; - - percpu_ref_get(ref); /* put after confirmation */ - call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); - } else if (confirm_switch) { - /* - * Somebody already set ATOMIC. Switching may still be in - * progress. @confirm_switch must be invoked after the - * switching is complete and a full sched RCU grace period - * has passed. Wait synchronously for the previous - * switching and schedule @confirm_switch invocation. - */ - wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); - ref->confirm_switch = confirm_switch; - - percpu_ref_get(ref); /* put after confirmation */ - call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu); + if (ref->percpu_count_ptr & __PERCPU_REF_ATOMIC) { + if (confirm_switch) + confirm_switch(ref); + return; } -} -/** - * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode - * @ref: percpu_ref to switch to atomic mode - * @confirm_switch: optional confirmation callback - * - * There's no reason to use this function for the usual reference counting. - * Use percpu_ref_kill[_and_confirm](). - * - * Schedule switching of @ref to atomic mode. All its percpu counts will - * be collected to the main atomic counter. On completion, when all CPUs - * are guaraneed to be in atomic mode, @confirm_switch, which may not - * block, is invoked. This function may be invoked concurrently with all - * the get/put operations and can safely be mixed with kill and reinit - * operations. Note that @ref will stay in atomic mode across kill/reinit - * cycles until percpu_ref_switch_to_percpu() is called. - * - * This function normally doesn't block and can be called from any context - * but it may block if @confirm_kill is specified and @ref is already in - * the process of switching to atomic mode. In such cases, @confirm_switch - * will be invoked after the switching is complete. - * - * Due to the way percpu_ref is implemented, @confirm_switch will be called - * after at least one full sched RCU grace period has passed but this is an - * implementation detail and must not be depended upon. - */ -void percpu_ref_switch_to_atomic(struct percpu_ref *ref, - percpu_ref_func_t *confirm_switch) -{ - ref->force_atomic = true; - __percpu_ref_switch_to_atomic(ref, confirm_switch); + /* switching from percpu to atomic */ + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + + /* + * Non-NULL ->confirm_switch is used to indicate that switching is + * in progress. Use noop one if unspecified. + */ + ref->confirm_switch = confirm_switch ?: percpu_ref_noop_confirm_switch; + + percpu_ref_get(ref); /* put after confirmation */ + call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); } static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) @@ -233,8 +194,6 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) return; - wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); - atomic_long_add(PERCPU_COUNT_BIAS, &ref->count); /* @@ -250,6 +209,58 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); } +static void __percpu_ref_switch_mode(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) +{ + lockdep_assert_held(&percpu_ref_switch_lock); + + /* + * If the previous ATOMIC switching hasn't finished yet, wait for + * its completion. If the caller ensures that ATOMIC switching + * isn't in progress, this function can be called from any context. + */ + wait_event_lock_irq(percpu_ref_switch_waitq, !ref->confirm_switch, + percpu_ref_switch_lock); + + if (ref->force_atomic || (ref->percpu_count_ptr & __PERCPU_REF_DEAD)) + __percpu_ref_switch_to_atomic(ref, confirm_switch); + else + __percpu_ref_switch_to_percpu(ref); +} + +/** + * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode + * @ref: percpu_ref to switch to atomic mode + * @confirm_switch: optional confirmation callback + * + * There's no reason to use this function for the usual reference counting. + * Use percpu_ref_kill[_and_confirm](). + * + * Schedule switching of @ref to atomic mode. All its percpu counts will + * be collected to the main atomic counter. On completion, when all CPUs + * are guaraneed to be in atomic mode, @confirm_switch, which may not + * block, is invoked. This function may be invoked concurrently with all + * the get/put operations and can safely be mixed with kill and reinit + * operations. Note that @ref will stay in atomic mode across kill/reinit + * cycles until percpu_ref_switch_to_percpu() is called. + * + * This function may block if @ref is in the process of switching to atomic + * mode. If the caller ensures that @ref is not in the process of + * switching to atomic mode, this function can be called from any context. + */ +void percpu_ref_switch_to_atomic(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) +{ + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + + ref->force_atomic = true; + __percpu_ref_switch_mode(ref, confirm_switch); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); +} + /** * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode * @ref: percpu_ref to switch to percpu mode @@ -264,17 +275,20 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) * dying or dead, the actual switching takes place on the following * percpu_ref_reinit(). * - * This function normally doesn't block and can be called from any context - * but it may block if @ref is in the process of switching to atomic mode - * by percpu_ref_switch_atomic(). + * This function may block if @ref is in the process of switching to atomic + * mode. If the caller ensures that @ref is not in the process of + * switching to atomic mode, this function can be called from any context. */ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + ref->force_atomic = false; + __percpu_ref_switch_mode(ref, NULL); - /* a dying or dead ref can't be switched to percpu mode w/o reinit */ - if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) - __percpu_ref_switch_to_percpu(ref); + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } /** @@ -290,21 +304,23 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) * * This function normally doesn't block and can be called from any context * but it may block if @confirm_kill is specified and @ref is in the - * process of switching to atomic mode by percpu_ref_switch_atomic(). - * - * Due to the way percpu_ref is implemented, @confirm_switch will be called - * after at least one full sched RCU grace period has passed but this is an - * implementation detail and must not be depended upon. + * process of switching to atomic mode by percpu_ref_switch_to_atomic(). */ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, "%s called more than once on %pf!", __func__, ref->release); ref->percpu_count_ptr |= __PERCPU_REF_DEAD; - __percpu_ref_switch_to_atomic(ref, confirm_kill); + __percpu_ref_switch_mode(ref, confirm_kill); percpu_ref_put(ref); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); @@ -321,11 +337,16 @@ EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); */ void percpu_ref_reinit(struct percpu_ref *ref) { + unsigned long flags; + + spin_lock_irqsave(&percpu_ref_switch_lock, flags); + WARN_ON_ONCE(!percpu_ref_is_zero(ref)); ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; percpu_ref_get(ref); - if (!ref->force_atomic) - __percpu_ref_switch_to_percpu(ref); + __percpu_ref_switch_mode(ref, NULL); + + spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } EXPORT_SYMBOL_GPL(percpu_ref_reinit); diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 72d36113ccaa..c8cebb137076 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -158,25 +158,21 @@ EXPORT_SYMBOL(percpu_counter_destroy); int percpu_counter_batch __read_mostly = 32; EXPORT_SYMBOL(percpu_counter_batch); -static void compute_batch_value(void) +static int compute_batch_value(unsigned int cpu) { int nr = num_online_cpus(); percpu_counter_batch = max(32, nr*2); + return 0; } -static int percpu_counter_hotcpu_callback(struct notifier_block *nb, - unsigned long action, void *hcpu) +static int percpu_counter_cpu_dead(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU - unsigned int cpu; struct percpu_counter *fbc; - compute_batch_value(); - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return NOTIFY_OK; + compute_batch_value(cpu); - cpu = (unsigned long)hcpu; spin_lock_irq(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; @@ -190,7 +186,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb, } spin_unlock_irq(&percpu_counters_lock); #endif - return NOTIFY_OK; + return 0; } /* @@ -222,8 +218,15 @@ EXPORT_SYMBOL(__percpu_counter_compare); static int __init percpu_counter_startup(void) { - compute_batch_value(); - hotcpu_notifier(percpu_counter_hotcpu_callback, 0); + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online", + compute_batch_value, NULL); + WARN_ON(ret < 0); + ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD, + "lib/percpu_cnt:dead", NULL, + percpu_counter_cpu_dead); + WARN_ON(ret < 0); return 0; } module_init(percpu_counter_startup); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 91f0727e3cad..84812a9fb16f 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -22,6 +22,7 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include <linux/cpu.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> @@ -30,7 +31,6 @@ #include <linux/percpu.h> #include <linux/slab.h> #include <linux/kmemleak.h> -#include <linux/notifier.h> #include <linux/cpu.h> #include <linux/string.h> #include <linux/bitops.h> @@ -69,6 +69,11 @@ struct radix_tree_preload { }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +static inline struct radix_tree_node *entry_to_node(void *ptr) +{ + return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); +} + static inline void *node_to_entry(void *ptr) { return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); @@ -191,13 +196,12 @@ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) * Returns next bit offset, or size if nothing found. */ static __always_inline unsigned long -radix_tree_find_next_bit(const unsigned long *addr, - unsigned long size, unsigned long offset) +radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, + unsigned long offset) { - if (!__builtin_constant_p(size)) - return find_next_bit(addr, size, offset); + const unsigned long *addr = node->tags[tag]; - if (offset < size) { + if (offset < RADIX_TREE_MAP_SIZE) { unsigned long tmp; addr += offset / BITS_PER_LONG; @@ -205,14 +209,32 @@ radix_tree_find_next_bit(const unsigned long *addr, if (tmp) return __ffs(tmp) + offset; offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); - while (offset < size) { + while (offset < RADIX_TREE_MAP_SIZE) { tmp = *++addr; if (tmp) return __ffs(tmp) + offset; offset += BITS_PER_LONG; } } - return size; + return RADIX_TREE_MAP_SIZE; +} + +static unsigned int iter_offset(const struct radix_tree_iter *iter) +{ + return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK; +} + +/* + * The maximum index which can be stored in a radix tree + */ +static inline unsigned long shift_maxindex(unsigned int shift) +{ + return (RADIX_TREE_MAP_SIZE << shift) - 1; +} + +static inline unsigned long node_maxindex(struct radix_tree_node *node) +{ + return shift_maxindex(node->shift); } #ifndef __KERNEL__ @@ -220,10 +242,11 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n", - node, node->offset, + pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n", + node, node->offset, index, index | node_maxindex(node), + node->parent, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->shift, node->count, node->parent); + node->shift, node->count, node->exceptional); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << node->shift); @@ -231,14 +254,16 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) void *entry = node->slots[i]; if (!entry) continue; - if (is_sibling_entry(node, entry)) { - pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n", - entry, i, - *(void **)entry_to_node(entry), - first, last); + if (entry == RADIX_TREE_RETRY) { + pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n", + i, first, last, node); } else if (!radix_tree_is_internal_node(entry)) { - pr_debug("radix entry %p offset %ld indices %ld-%ld\n", - entry, i, first, last); + pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n", + entry, i, first, last, node); + } else if (is_sibling_entry(node, entry)) { + pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n", + entry, i, first, last, node, + *(void **)entry_to_node(entry)); } else { dump_node(entry_to_node(entry), first); } @@ -262,7 +287,10 @@ static void radix_tree_dump(struct radix_tree_root *root) * that the caller has pinned this thread of control to the current CPU. */ static struct radix_tree_node * -radix_tree_node_alloc(struct radix_tree_root *root) +radix_tree_node_alloc(struct radix_tree_root *root, + struct radix_tree_node *parent, + unsigned int shift, unsigned int offset, + unsigned int count, unsigned int exceptional) { struct radix_tree_node *ret = NULL; gfp_t gfp_mask = root_gfp_mask(root); @@ -307,6 +335,13 @@ radix_tree_node_alloc(struct radix_tree_root *root) ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); out: BUG_ON(radix_tree_is_internal_node(ret)); + if (ret) { + ret->parent = parent; + ret->shift = shift; + ret->offset = offset; + ret->count = count; + ret->exceptional = exceptional; + } return ret; } @@ -314,18 +349,15 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); - int i; /* - * must only free zeroed nodes into the slab. radix_tree_shrink - * can leave us with a non-NULL entry in the first slot, so clear - * that here to make sure. + * Must only free zeroed nodes into the slab. We can be left with + * non-NULL entries by radix_tree_free_nodes, so clear the entries + * and tags here. */ - for (i = 0; i < RADIX_TREE_MAX_TAGS; i++) - tag_clear(node, i, 0); - - node->slots[0] = NULL; - node->count = 0; + memset(node->slots, 0, sizeof(node->slots)); + memset(node->tags, 0, sizeof(node->tags)); + INIT_LIST_HEAD(&node->private_list); kmem_cache_free(radix_tree_node_cachep, node); } @@ -345,7 +377,7 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ -static int __radix_tree_preload(gfp_t gfp_mask, int nr) +static int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) { struct radix_tree_preload *rtp; struct radix_tree_node *node; @@ -411,6 +443,28 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) } EXPORT_SYMBOL(radix_tree_maybe_preload); +#ifdef CONFIG_RADIX_TREE_MULTIORDER +/* + * Preload with enough objects to ensure that we can split a single entry + * of order @old_order into many entries of size @new_order + */ +int radix_tree_split_preload(unsigned int old_order, unsigned int new_order, + gfp_t gfp_mask) +{ + unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT); + unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) - + (new_order / RADIX_TREE_MAP_SHIFT); + unsigned nr = 0; + + WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); + BUG_ON(new_order >= old_order); + + while (layers--) + nr = nr * RADIX_TREE_MAP_SIZE + 1; + return __radix_tree_preload(gfp_mask, top * nr); +} +#endif + /* * The same as function above, but preload number of nodes required to insert * (1 << order) continuous naturally-aligned elements. @@ -456,19 +510,6 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) return __radix_tree_preload(gfp_mask, nr_nodes); } -/* - * The maximum index which can be stored in a radix tree - */ -static inline unsigned long shift_maxindex(unsigned int shift) -{ - return (RADIX_TREE_MAP_SIZE << shift) - 1; -} - -static inline unsigned long node_maxindex(struct radix_tree_node *node) -{ - return shift_maxindex(node->shift); -} - static unsigned radix_tree_load_root(struct radix_tree_root *root, struct radix_tree_node **nodep, unsigned long *maxindex) { @@ -506,8 +547,8 @@ static int radix_tree_extend(struct radix_tree_root *root, goto out; do { - struct radix_tree_node *node = radix_tree_node_alloc(root); - + struct radix_tree_node *node = radix_tree_node_alloc(root, + NULL, shift, 0, 1, 0); if (!node) return -ENOMEM; @@ -518,12 +559,12 @@ static int radix_tree_extend(struct radix_tree_root *root, } BUG_ON(shift > BITS_PER_LONG); - node->shift = shift; - node->offset = 0; - node->count = 1; - node->parent = NULL; - if (radix_tree_is_internal_node(slot)) + if (radix_tree_is_internal_node(slot)) { entry_to_node(slot)->parent = node; + } else if (radix_tree_exceptional_entry(slot)) { + /* Moving an exceptional root->rnode to a node */ + node->exceptional = 1; + } node->slots[0] = slot; slot = node_to_entry(node); rcu_assign_pointer(root->rnode, slot); @@ -534,6 +575,106 @@ out: } /** + * radix_tree_shrink - shrink radix tree to minimum height + * @root radix tree root + */ +static inline void radix_tree_shrink(struct radix_tree_root *root, + radix_tree_update_node_t update_node, + void *private) +{ + for (;;) { + struct radix_tree_node *node = root->rnode; + struct radix_tree_node *child; + + if (!radix_tree_is_internal_node(node)) + break; + node = entry_to_node(node); + + /* + * The candidate node has more than one child, or its child + * is not at the leftmost slot, or the child is a multiorder + * entry, we cannot shrink. + */ + if (node->count != 1) + break; + child = node->slots[0]; + if (!child) + break; + if (!radix_tree_is_internal_node(child) && node->shift) + break; + + if (radix_tree_is_internal_node(child)) + entry_to_node(child)->parent = NULL; + + /* + * We don't need rcu_assign_pointer(), since we are simply + * moving the node from one part of the tree to another: if it + * was safe to dereference the old pointer to it + * (node->slots[0]), it will be safe to dereference the new + * one (root->rnode) as far as dependent read barriers go. + */ + root->rnode = child; + + /* + * We have a dilemma here. The node's slot[0] must not be + * NULLed in case there are concurrent lookups expecting to + * find the item. However if this was a bottom-level node, + * then it may be subject to the slot pointer being visible + * to callers dereferencing it. If item corresponding to + * slot[0] is subsequently deleted, these callers would expect + * their slot to become empty sooner or later. + * + * For example, lockless pagecache will look up a slot, deref + * the page pointer, and if the page has 0 refcount it means it + * was concurrently deleted from pagecache so try the deref + * again. Fortunately there is already a requirement for logic + * to retry the entire slot lookup -- the indirect pointer + * problem (replacing direct root node with an indirect pointer + * also results in a stale slot). So tag the slot as indirect + * to force callers to retry. + */ + node->count = 0; + if (!radix_tree_is_internal_node(child)) { + node->slots[0] = RADIX_TREE_RETRY; + if (update_node) + update_node(node, private); + } + + WARN_ON_ONCE(!list_empty(&node->private_list)); + radix_tree_node_free(node); + } +} + +static void delete_node(struct radix_tree_root *root, + struct radix_tree_node *node, + radix_tree_update_node_t update_node, void *private) +{ + do { + struct radix_tree_node *parent; + + if (node->count) { + if (node == entry_to_node(root->rnode)) + radix_tree_shrink(root, update_node, private); + return; + } + + parent = node->parent; + if (parent) { + parent->slots[node->offset] = NULL; + parent->count--; + } else { + root_tag_clear_all(root); + root->rnode = NULL; + } + + WARN_ON_ONCE(!list_empty(&node->private_list)); + radix_tree_node_free(node); + + node = parent; + } while (node); +} + +/** * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key @@ -563,26 +704,24 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, shift = radix_tree_load_root(root, &child, &maxindex); /* Make sure the tree is high enough. */ + if (order > 0 && max == ((1UL << order) - 1)) + max++; if (max > maxindex) { int error = radix_tree_extend(root, max, shift); if (error < 0) return error; shift = error; child = root->rnode; - if (order == shift) - shift += RADIX_TREE_MAP_SHIFT; } while (shift > order) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ - child = radix_tree_node_alloc(root); + child = radix_tree_node_alloc(root, node, shift, + offset, 0, 0); if (!child) return -ENOMEM; - child->shift = shift; - child->offset = offset; - child->parent = node; rcu_assign_pointer(*slot, node_to_entry(child)); if (node) node->count++; @@ -595,31 +734,126 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, slot = &node->slots[offset]; } + if (nodep) + *nodep = node; + if (slotp) + *slotp = slot; + return 0; +} + #ifdef CONFIG_RADIX_TREE_MULTIORDER - /* Insert pointers to the canonical entry */ - if (order > shift) { - unsigned i, n = 1 << (order - shift); +/* + * Free any nodes below this node. The tree is presumed to not need + * shrinking, and any user data in the tree is presumed to not need a + * destructor called on it. If we need to add a destructor, we can + * add that functionality later. Note that we may not clear tags or + * slots from the tree as an RCU walker may still have a pointer into + * this subtree. We could replace the entries with RADIX_TREE_RETRY, + * but we'll still have to clear those in rcu_free. + */ +static void radix_tree_free_nodes(struct radix_tree_node *node) +{ + unsigned offset = 0; + struct radix_tree_node *child = entry_to_node(node); + + for (;;) { + void *entry = child->slots[offset]; + if (radix_tree_is_internal_node(entry) && + !is_sibling_entry(child, entry)) { + child = entry_to_node(entry); + offset = 0; + continue; + } + offset++; + while (offset == RADIX_TREE_MAP_SIZE) { + struct radix_tree_node *old = child; + offset = child->offset + 1; + child = child->parent; + WARN_ON_ONCE(!list_empty(&old->private_list)); + radix_tree_node_free(old); + if (old == entry_to_node(node)) + return; + } + } +} + +static inline int insert_entries(struct radix_tree_node *node, void **slot, + void *item, unsigned order, bool replace) +{ + struct radix_tree_node *child; + unsigned i, n, tag, offset, tags = 0; + + if (node) { + if (order > node->shift) + n = 1 << (order - node->shift); + else + n = 1; + offset = get_slot_offset(node, slot); + } else { + n = 1; + offset = 0; + } + + if (n > 1) { offset = offset & ~(n - 1); slot = &node->slots[offset]; - child = node_to_entry(slot); - for (i = 0; i < n; i++) { - if (slot[i]) + } + child = node_to_entry(slot); + + for (i = 0; i < n; i++) { + if (slot[i]) { + if (replace) { + node->count--; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tag_get(node, tag, offset + i)) + tags |= 1 << tag; + } else return -EEXIST; } + } - for (i = 1; i < n; i++) { + for (i = 0; i < n; i++) { + struct radix_tree_node *old = slot[i]; + if (i) { rcu_assign_pointer(slot[i], child); - node->count++; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_clear(node, tag, offset + i); + } else { + rcu_assign_pointer(slot[i], item); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(node, tag, offset); } + if (radix_tree_is_internal_node(old) && + !is_sibling_entry(node, old) && + (old != RADIX_TREE_RETRY)) + radix_tree_free_nodes(old); + if (radix_tree_exceptional_entry(old)) + node->exceptional--; } -#endif - - if (nodep) - *nodep = node; - if (slotp) - *slotp = slot; - return 0; + if (node) { + node->count += n; + if (radix_tree_exceptional_entry(item)) + node->exceptional += n; + } + return n; } +#else +static inline int insert_entries(struct radix_tree_node *node, void **slot, + void *item, unsigned order, bool replace) +{ + if (*slot) + return -EEXIST; + rcu_assign_pointer(*slot, item); + if (node) { + node->count++; + if (radix_tree_exceptional_entry(item)) + node->exceptional++; + } + return 1; +} +#endif /** * __radix_tree_insert - insert into a radix tree @@ -642,13 +876,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, error = __radix_tree_create(root, index, order, &node, &slot); if (error) return error; - if (*slot != NULL) - return -EEXIST; - rcu_assign_pointer(*slot, item); + + error = insert_entries(node, slot, item, order, false); + if (error < 0) + return error; if (node) { unsigned offset = get_slot_offset(node, slot); - node->count++; BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); @@ -746,6 +980,287 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); +static inline int slot_count(struct radix_tree_node *node, + void **slot) +{ + int n = 1; +#ifdef CONFIG_RADIX_TREE_MULTIORDER + void *ptr = node_to_entry(slot); + unsigned offset = get_slot_offset(node, slot); + int i; + + for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[offset + i] != ptr) + break; + n++; + } +#endif + return n; +} + +static void replace_slot(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item, + bool warn_typeswitch) +{ + void *old = rcu_dereference_raw(*slot); + int count, exceptional; + + WARN_ON_ONCE(radix_tree_is_internal_node(item)); + + count = !!item - !!old; + exceptional = !!radix_tree_exceptional_entry(item) - + !!radix_tree_exceptional_entry(old); + + WARN_ON_ONCE(warn_typeswitch && (count || exceptional)); + + if (node) { + node->count += count; + if (exceptional) { + exceptional *= slot_count(node, slot); + node->exceptional += exceptional; + } + } + + rcu_assign_pointer(*slot, item); +} + +static inline void delete_sibling_entries(struct radix_tree_node *node, + void **slot) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + bool exceptional = radix_tree_exceptional_entry(*slot); + void *ptr = node_to_entry(slot); + unsigned offset = get_slot_offset(node, slot); + int i; + + for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[offset + i] != ptr) + break; + node->slots[offset + i] = NULL; + node->count--; + if (exceptional) + node->exceptional--; + } +#endif +} + +/** + * __radix_tree_replace - replace item in a slot + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * @update_node: callback for changing leaf nodes + * @private: private data to pass to @update_node + * + * For use with __radix_tree_lookup(). Caller must hold tree write locked + * across slot lookup and replacement. + */ +void __radix_tree_replace(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item, + radix_tree_update_node_t update_node, void *private) +{ + if (!item) + delete_sibling_entries(node, slot); + /* + * This function supports replacing exceptional entries and + * deleting entries, but that needs accounting against the + * node unless the slot is root->rnode. + */ + replace_slot(root, node, slot, item, + !node && slot != (void **)&root->rnode); + + if (!node) + return; + + if (update_node) + update_node(node, private); + + delete_node(root, node, update_node, private); +} + +/** + * radix_tree_replace_slot - replace item in a slot + * @root: radix tree root + * @slot: pointer to slot + * @item: new item to store in the slot. + * + * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(), + * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked + * across slot lookup and replacement. + * + * NOTE: This cannot be used to switch between non-entries (empty slots), + * regular entries, and exceptional entries, as that requires accounting + * inside the radix tree node. When switching from one type of entry or + * deleting, use __radix_tree_lookup() and __radix_tree_replace() or + * radix_tree_iter_replace(). + */ +void radix_tree_replace_slot(struct radix_tree_root *root, + void **slot, void *item) +{ + replace_slot(root, NULL, slot, item, true); +} + +/** + * radix_tree_iter_replace - replace item in a slot + * @root: radix tree root + * @slot: pointer to slot + * @item: new item to store in the slot. + * + * For use with radix_tree_split() and radix_tree_for_each_slot(). + * Caller must hold tree write locked across split and replacement. + */ +void radix_tree_iter_replace(struct radix_tree_root *root, + const struct radix_tree_iter *iter, void **slot, void *item) +{ + __radix_tree_replace(root, iter->node, slot, item, NULL, NULL); +} + +#ifdef CONFIG_RADIX_TREE_MULTIORDER +/** + * radix_tree_join - replace multiple entries with one multiorder entry + * @root: radix tree root + * @index: an index inside the new entry + * @order: order of the new entry + * @item: new entry + * + * Call this function to replace several entries with one larger entry. + * The existing entries are presumed to not need freeing as a result of + * this call. + * + * The replacement entry will have all the tags set on it that were set + * on any of the entries it is replacing. + */ +int radix_tree_join(struct radix_tree_root *root, unsigned long index, + unsigned order, void *item) +{ + struct radix_tree_node *node; + void **slot; + int error; + + BUG_ON(radix_tree_is_internal_node(item)); + + error = __radix_tree_create(root, index, order, &node, &slot); + if (!error) + error = insert_entries(node, slot, item, order, true); + if (error > 0) + error = 0; + + return error; +} + +/** + * radix_tree_split - Split an entry into smaller entries + * @root: radix tree root + * @index: An index within the large entry + * @order: Order of new entries + * + * Call this function as the first step in replacing a multiorder entry + * with several entries of lower order. After this function returns, + * loop over the relevant portion of the tree using radix_tree_for_each_slot() + * and call radix_tree_iter_replace() to set up each new entry. + * + * The tags from this entry are replicated to all the new entries. + * + * The radix tree should be locked against modification during the entire + * replacement operation. Lock-free lookups will see RADIX_TREE_RETRY which + * should prompt RCU walkers to restart the lookup from the root. + */ +int radix_tree_split(struct radix_tree_root *root, unsigned long index, + unsigned order) +{ + struct radix_tree_node *parent, *node, *child; + void **slot; + unsigned int offset, end; + unsigned n, tag, tags = 0; + + if (!__radix_tree_lookup(root, index, &parent, &slot)) + return -ENOENT; + if (!parent) + return -ENOENT; + + offset = get_slot_offset(parent, slot); + + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tag_get(parent, tag, offset)) + tags |= 1 << tag; + + for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) { + if (!is_sibling_entry(parent, parent->slots[end])) + break; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(parent, tag, end); + /* rcu_assign_pointer ensures tags are set before RETRY */ + rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY); + } + rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY); + parent->exceptional -= (end - offset); + + if (order == parent->shift) + return 0; + if (order > parent->shift) { + while (offset < end) + offset += insert_entries(parent, &parent->slots[offset], + RADIX_TREE_RETRY, order, true); + return 0; + } + + node = parent; + + for (;;) { + if (node->shift > order) { + child = radix_tree_node_alloc(root, node, + node->shift - RADIX_TREE_MAP_SHIFT, + offset, 0, 0); + if (!child) + goto nomem; + if (node != parent) { + node->count++; + node->slots[offset] = node_to_entry(child); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(node, tag, offset); + } + + node = child; + offset = 0; + continue; + } + + n = insert_entries(node, &node->slots[offset], + RADIX_TREE_RETRY, order, false); + BUG_ON(n > RADIX_TREE_MAP_SIZE); + + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(node, tag, offset); + offset += n; + + while (offset == RADIX_TREE_MAP_SIZE) { + if (node == parent) + break; + offset = node->offset; + child = node; + node = node->parent; + rcu_assign_pointer(node->slots[offset], + node_to_entry(child)); + offset++; + } + if ((node == parent) && (offset == end)) + return 0; + } + + nomem: + /* Shouldn't happen; did user forget to preload? */ + /* TODO: free all the allocated nodes */ + WARN_ON(1); + return -ENOMEM; +} +#endif + /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root @@ -807,6 +1322,34 @@ static void node_tag_clear(struct radix_tree_root *root, root_tag_clear(root, tag); } +static void node_tag_set(struct radix_tree_root *root, + struct radix_tree_node *node, + unsigned int tag, unsigned int offset) +{ + while (node) { + if (tag_get(node, tag, offset)) + return; + tag_set(node, tag, offset); + offset = node->offset; + node = node->parent; + } + + if (!root_tag_get(root, tag)) + root_tag_set(root, tag); +} + +/** + * radix_tree_iter_tag_set - set a tag on the current iterator entry + * @root: radix tree root + * @iter: iterator state + * @tag: tag to set + */ +void radix_tree_iter_tag_set(struct radix_tree_root *root, + const struct radix_tree_iter *iter, unsigned int tag) +{ + node_tag_set(root, iter->node, tag, iter_offset(iter)); +} + /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root @@ -902,6 +1445,121 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter, #endif } +/* Construct iter->tags bit-mask from node->tags[tag] array */ +static void set_iter_tags(struct radix_tree_iter *iter, + struct radix_tree_node *node, unsigned offset, + unsigned tag) +{ + unsigned tag_long = offset / BITS_PER_LONG; + unsigned tag_bit = offset % BITS_PER_LONG; + + iter->tags = node->tags[tag][tag_long] >> tag_bit; + + /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ + if (tag_long < RADIX_TREE_TAG_LONGS - 1) { + /* Pick tags from next element */ + if (tag_bit) + iter->tags |= node->tags[tag][tag_long + 1] << + (BITS_PER_LONG - tag_bit); + /* Clip chunk size, here only BITS_PER_LONG tags */ + iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG); + } +} + +#ifdef CONFIG_RADIX_TREE_MULTIORDER +static void **skip_siblings(struct radix_tree_node **nodep, + void **slot, struct radix_tree_iter *iter) +{ + void *sib = node_to_entry(slot - 1); + + while (iter->index < iter->next_index) { + *nodep = rcu_dereference_raw(*slot); + if (*nodep && *nodep != sib) + return slot; + slot++; + iter->index = __radix_tree_iter_add(iter, 1); + iter->tags >>= 1; + } + + *nodep = NULL; + return NULL; +} + +void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, + unsigned flags) +{ + unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; + struct radix_tree_node *node = rcu_dereference_raw(*slot); + + slot = skip_siblings(&node, slot, iter); + + while (radix_tree_is_internal_node(node)) { + unsigned offset; + unsigned long next_index; + + if (node == RADIX_TREE_RETRY) + return slot; + node = entry_to_node(node); + iter->node = node; + iter->shift = node->shift; + + if (flags & RADIX_TREE_ITER_TAGGED) { + offset = radix_tree_find_next_bit(node, tag, 0); + if (offset == RADIX_TREE_MAP_SIZE) + return NULL; + slot = &node->slots[offset]; + iter->index = __radix_tree_iter_add(iter, offset); + set_iter_tags(iter, node, offset, tag); + node = rcu_dereference_raw(*slot); + } else { + offset = 0; + slot = &node->slots[0]; + for (;;) { + node = rcu_dereference_raw(*slot); + if (node) + break; + slot++; + offset++; + if (offset == RADIX_TREE_MAP_SIZE) + return NULL; + } + iter->index = __radix_tree_iter_add(iter, offset); + } + if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0)) + goto none; + next_index = (iter->index | shift_maxindex(iter->shift)) + 1; + if (next_index < iter->next_index) + iter->next_index = next_index; + } + + return slot; + none: + iter->next_index = 0; + return NULL; +} +EXPORT_SYMBOL(__radix_tree_next_slot); +#else +static void **skip_siblings(struct radix_tree_node **nodep, + void **slot, struct radix_tree_iter *iter) +{ + return slot; +} +#endif + +void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter) +{ + struct radix_tree_node *node; + + slot++; + iter->index = __radix_tree_iter_add(iter, 1); + node = rcu_dereference_raw(*slot); + skip_siblings(&node, slot, iter); + iter->next_index = iter->index; + iter->tags = 0; + return NULL; +} +EXPORT_SYMBOL(radix_tree_iter_resume); + /** * radix_tree_next_chunk - find next chunk of slots for iteration * @@ -927,7 +1585,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. * * This condition also used by radix_tree_next_slot() to stop - * contiguous iterating, and forbid swithing to the next chunk. + * contiguous iterating, and forbid switching to the next chunk. */ index = iter->next_index; if (!index && iter->index) @@ -945,6 +1603,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, iter->index = index; iter->next_index = maxindex + 1; iter->tags = 1; + iter->node = NULL; __set_iter_shift(iter, 0); return (void **)&root->rnode; } @@ -960,9 +1619,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, return NULL; if (flags & RADIX_TREE_ITER_TAGGED) - offset = radix_tree_find_next_bit( - node->tags[tag], - RADIX_TREE_MAP_SIZE, + offset = radix_tree_find_next_bit(node, tag, offset + 1); else while (++offset < RADIX_TREE_MAP_SIZE) { @@ -982,154 +1639,26 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, child = rcu_dereference_raw(node->slots[offset]); } - if ((child == NULL) || (child == RADIX_TREE_RETRY)) + if (!child) goto restart; + if (child == RADIX_TREE_RETRY) + break; } while (radix_tree_is_internal_node(child)); /* Update the iterator state */ iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); iter->next_index = (index | node_maxindex(node)) + 1; + iter->node = node; __set_iter_shift(iter, node->shift); - /* Construct iter->tags bit-mask from node->tags[tag] array */ - if (flags & RADIX_TREE_ITER_TAGGED) { - unsigned tag_long, tag_bit; - - tag_long = offset / BITS_PER_LONG; - tag_bit = offset % BITS_PER_LONG; - iter->tags = node->tags[tag][tag_long] >> tag_bit; - /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ - if (tag_long < RADIX_TREE_TAG_LONGS - 1) { - /* Pick tags from next element */ - if (tag_bit) - iter->tags |= node->tags[tag][tag_long + 1] << - (BITS_PER_LONG - tag_bit); - /* Clip chunk size, here only BITS_PER_LONG tags */ - iter->next_index = index + BITS_PER_LONG; - } - } + if (flags & RADIX_TREE_ITER_TAGGED) + set_iter_tags(iter, node, offset, tag); return node->slots + offset; } EXPORT_SYMBOL(radix_tree_next_chunk); /** - * radix_tree_range_tag_if_tagged - for each item in given range set given - * tag if item has another tag set - * @root: radix tree root - * @first_indexp: pointer to a starting index of a range to scan - * @last_index: last index of a range to scan - * @nr_to_tag: maximum number items to tag - * @iftag: tag index to test - * @settag: tag index to set if tested tag is set - * - * This function scans range of radix tree from first_index to last_index - * (inclusive). For each item in the range if iftag is set, the function sets - * also settag. The function stops either after tagging nr_to_tag items or - * after reaching last_index. - * - * The tags must be set from the leaf level only and propagated back up the - * path to the root. We must do this so that we resolve the full path before - * setting any tags on intermediate nodes. If we set tags as we descend, then - * we can get to the leaf node and find that the index that has the iftag - * set is outside the range we are scanning. This reults in dangling tags and - * can lead to problems with later tag operations (e.g. livelocks on lookups). - * - * The function returns the number of leaves where the tag was set and sets - * *first_indexp to the first unscanned index. - * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must - * be prepared to handle that. - */ -unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, - unsigned long *first_indexp, unsigned long last_index, - unsigned long nr_to_tag, - unsigned int iftag, unsigned int settag) -{ - struct radix_tree_node *parent, *node, *child; - unsigned long maxindex; - unsigned long tagged = 0; - unsigned long index = *first_indexp; - - radix_tree_load_root(root, &child, &maxindex); - last_index = min(last_index, maxindex); - if (index > last_index) - return 0; - if (!nr_to_tag) - return 0; - if (!root_tag_get(root, iftag)) { - *first_indexp = last_index + 1; - return 0; - } - if (!radix_tree_is_internal_node(child)) { - *first_indexp = last_index + 1; - root_tag_set(root, settag); - return 1; - } - - node = entry_to_node(child); - - for (;;) { - unsigned offset = radix_tree_descend(node, &child, index); - if (!child) - goto next; - if (!tag_get(node, iftag, offset)) - goto next; - /* Sibling slots never have tags set on them */ - if (radix_tree_is_internal_node(child)) { - node = entry_to_node(child); - continue; - } - - /* tag the leaf */ - tagged++; - tag_set(node, settag, offset); - - /* walk back up the path tagging interior nodes */ - parent = node; - for (;;) { - offset = parent->offset; - parent = parent->parent; - if (!parent) - break; - /* stop if we find a node with the tag already set */ - if (tag_get(parent, settag, offset)) - break; - tag_set(parent, settag, offset); - } - next: - /* Go to next entry in node */ - index = ((index >> node->shift) + 1) << node->shift; - /* Overflow can happen when last_index is ~0UL... */ - if (index > last_index || !index) - break; - offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; - while (offset == 0) { - /* - * We've fully scanned this node. Go up. Because - * last_index is guaranteed to be in the tree, what - * we do below cannot wander astray. - */ - node = node->parent; - offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; - } - if (is_sibling_entry(node, node->slots[offset])) - goto next; - if (tagged >= nr_to_tag) - break; - } - /* - * We need not to tag the root tag if there is no tag which is set with - * settag within the range from *first_indexp to last_index. - */ - if (tagged > 0) - root_tag_set(root, settag); - *first_indexp = index; - - return tagged; -} -EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); - -/** * radix_tree_gang_lookup - perform multiple lookup on a radix tree * @root: radix tree root * @results: where the results of the lookup are placed @@ -1294,229 +1823,23 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, } EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); -#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP) -#include <linux/sched.h> /* for cond_resched() */ - -struct locate_info { - unsigned long found_index; - bool stop; -}; - -/* - * This linear search is at present only useful to shmem_unuse_inode(). - */ -static unsigned long __locate(struct radix_tree_node *slot, void *item, - unsigned long index, struct locate_info *info) -{ - unsigned long i; - - do { - unsigned int shift = slot->shift; - - for (i = (index >> shift) & RADIX_TREE_MAP_MASK; - i < RADIX_TREE_MAP_SIZE; - i++, index += (1UL << shift)) { - struct radix_tree_node *node = - rcu_dereference_raw(slot->slots[i]); - if (node == RADIX_TREE_RETRY) - goto out; - if (!radix_tree_is_internal_node(node)) { - if (node == item) { - info->found_index = index; - info->stop = true; - goto out; - } - continue; - } - node = entry_to_node(node); - if (is_sibling_entry(slot, node)) - continue; - slot = node; - break; - } - } while (i < RADIX_TREE_MAP_SIZE); - -out: - if ((index == 0) && (i == RADIX_TREE_MAP_SIZE)) - info->stop = true; - return index; -} - -/** - * radix_tree_locate_item - search through radix tree for item - * @root: radix tree root - * @item: item to be found - * - * Returns index where item was found, or -1 if not found. - * Caller must hold no lock (since this time-consuming function needs - * to be preemptible), and must check afterwards if item is still there. - */ -unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) -{ - struct radix_tree_node *node; - unsigned long max_index; - unsigned long cur_index = 0; - struct locate_info info = { - .found_index = -1, - .stop = false, - }; - - do { - rcu_read_lock(); - node = rcu_dereference_raw(root->rnode); - if (!radix_tree_is_internal_node(node)) { - rcu_read_unlock(); - if (node == item) - info.found_index = 0; - break; - } - - node = entry_to_node(node); - - max_index = node_maxindex(node); - if (cur_index > max_index) { - rcu_read_unlock(); - break; - } - - cur_index = __locate(node, item, cur_index, &info); - rcu_read_unlock(); - cond_resched(); - } while (!info.stop && cur_index <= max_index); - - return info.found_index; -} -#else -unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) -{ - return -1; -} -#endif /* CONFIG_SHMEM && CONFIG_SWAP */ - -/** - * radix_tree_shrink - shrink radix tree to minimum height - * @root radix tree root - */ -static inline bool radix_tree_shrink(struct radix_tree_root *root) -{ - bool shrunk = false; - - for (;;) { - struct radix_tree_node *node = root->rnode; - struct radix_tree_node *child; - - if (!radix_tree_is_internal_node(node)) - break; - node = entry_to_node(node); - - /* - * The candidate node has more than one child, or its child - * is not at the leftmost slot, or the child is a multiorder - * entry, we cannot shrink. - */ - if (node->count != 1) - break; - child = node->slots[0]; - if (!child) - break; - if (!radix_tree_is_internal_node(child) && node->shift) - break; - - if (radix_tree_is_internal_node(child)) - entry_to_node(child)->parent = NULL; - - /* - * We don't need rcu_assign_pointer(), since we are simply - * moving the node from one part of the tree to another: if it - * was safe to dereference the old pointer to it - * (node->slots[0]), it will be safe to dereference the new - * one (root->rnode) as far as dependent read barriers go. - */ - root->rnode = child; - - /* - * We have a dilemma here. The node's slot[0] must not be - * NULLed in case there are concurrent lookups expecting to - * find the item. However if this was a bottom-level node, - * then it may be subject to the slot pointer being visible - * to callers dereferencing it. If item corresponding to - * slot[0] is subsequently deleted, these callers would expect - * their slot to become empty sooner or later. - * - * For example, lockless pagecache will look up a slot, deref - * the page pointer, and if the page has 0 refcount it means it - * was concurrently deleted from pagecache so try the deref - * again. Fortunately there is already a requirement for logic - * to retry the entire slot lookup -- the indirect pointer - * problem (replacing direct root node with an indirect pointer - * also results in a stale slot). So tag the slot as indirect - * to force callers to retry. - */ - if (!radix_tree_is_internal_node(child)) - node->slots[0] = RADIX_TREE_RETRY; - - radix_tree_node_free(node); - shrunk = true; - } - - return shrunk; -} - /** * __radix_tree_delete_node - try to free node after clearing a slot * @root: radix tree root * @node: node containing @index + * @update_node: callback for changing leaf nodes + * @private: private data to pass to @update_node * * After clearing the slot at @index in @node from radix tree * rooted at @root, call this function to attempt freeing the * node and shrinking the tree. - * - * Returns %true if @node was freed, %false otherwise. */ -bool __radix_tree_delete_node(struct radix_tree_root *root, - struct radix_tree_node *node) -{ - bool deleted = false; - - do { - struct radix_tree_node *parent; - - if (node->count) { - if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root); - return deleted; - } - - parent = node->parent; - if (parent) { - parent->slots[node->offset] = NULL; - parent->count--; - } else { - root_tag_clear_all(root); - root->rnode = NULL; - } - - radix_tree_node_free(node); - deleted = true; - - node = parent; - } while (node); - - return deleted; -} - -static inline void delete_sibling_entries(struct radix_tree_node *node, - void *ptr, unsigned offset) +void __radix_tree_delete_node(struct radix_tree_root *root, + struct radix_tree_node *node, + radix_tree_update_node_t update_node, + void *private) { -#ifdef CONFIG_RADIX_TREE_MULTIORDER - int i; - for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { - if (node->slots[offset + i] != ptr) - break; - node->slots[offset + i] = NULL; - node->count--; - } -#endif + delete_node(root, node, update_node, private); } /** @@ -1558,11 +1881,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) node_tag_clear(root, node, tag, offset); - delete_sibling_entries(node, node_to_entry(slot), offset); - node->slots[offset] = NULL; - node->count--; - - __radix_tree_delete_node(root, node); + __radix_tree_replace(root, node, slot, NULL, NULL, NULL); return entry; } @@ -1583,15 +1902,10 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_delete); -struct radix_tree_node *radix_tree_replace_clear_tags( - struct radix_tree_root *root, - unsigned long index, void *entry) +void radix_tree_clear_tags(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot) { - struct radix_tree_node *node; - void **slot; - - __radix_tree_lookup(root, index, &node, &slot); - if (node) { unsigned int tag, offset = get_slot_offset(node, slot); for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) @@ -1600,9 +1914,6 @@ struct radix_tree_node *radix_tree_replace_clear_tags( /* Clear root node tags */ root->gfp_mask &= __GFP_BITS_MASK; } - - radix_tree_replace_slot(slot, entry); - return node; } /** @@ -1650,32 +1961,31 @@ static __init void radix_tree_init_maxnodes(void) } } -static int radix_tree_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int radix_tree_cpu_dead(unsigned int cpu) { - int cpu = (long)hcpu; struct radix_tree_preload *rtp; struct radix_tree_node *node; /* Free per-cpu pool of preloaded nodes */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - rtp = &per_cpu(radix_tree_preloads, cpu); - while (rtp->nr) { - node = rtp->nodes; - rtp->nodes = node->private_data; - kmem_cache_free(radix_tree_node_cachep, node); - rtp->nr--; - } + rtp = &per_cpu(radix_tree_preloads, cpu); + while (rtp->nr) { + node = rtp->nodes; + rtp->nodes = node->private_data; + kmem_cache_free(radix_tree_node_cachep, node); + rtp->nr--; } - return NOTIFY_OK; + return 0; } void __init radix_tree_init(void) { + int ret; radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); radix_tree_init_maxnodes(); - hotcpu_notifier(radix_tree_callback, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead", + NULL, radix_tree_cpu_dead); + WARN_ON(ret < 0); } diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 29f503ebfd60..3057011f5599 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_RAID6_PQ) += raid6_pq.o raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ int8.o int16.o int32.o -raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o +raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o raid6_pq-$(CONFIG_TILEGX) += tilegx8.o diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 592ff49df47d..7857049fd7d3 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -49,6 +49,10 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_avx2x1, &raid6_avx2x2, #endif +#ifdef CONFIG_AS_AVX512 + &raid6_avx512x1, + &raid6_avx512x2, +#endif #endif #if defined(__x86_64__) && !defined(__arch_um__) &raid6_sse2x1, @@ -59,6 +63,11 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_avx2x2, &raid6_avx2x4, #endif +#ifdef CONFIG_AS_AVX512 + &raid6_avx512x1, + &raid6_avx512x2, + &raid6_avx512x4, +#endif #endif #ifdef CONFIG_ALTIVEC &raid6_altivec1, @@ -92,6 +101,9 @@ void (*raid6_datap_recov)(int, size_t, int, void **); EXPORT_SYMBOL_GPL(raid6_datap_recov); const struct raid6_recov_calls *const raid6_recov_algos[] = { +#ifdef CONFIG_AS_AVX512 + &raid6_recov_avx512, +#endif #ifdef CONFIG_AS_AVX2 &raid6_recov_avx2, #endif diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c index 76734004358d..20bca3d44f67 100644 --- a/lib/raid6/avx2.c +++ b/lib/raid6/avx2.c @@ -87,9 +87,57 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } +static void raid6_avx21_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 32) { + asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); + asm volatile("vpxor %ymm4,%ymm2,%ymm2"); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + } + asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + const struct raid6_calls raid6_avx2x1 = { raid6_avx21_gen_syndrome, - NULL, /* XOR not yet implemented */ + raid6_avx21_xor_syndrome, raid6_have_avx2, "avx2x1", 1 /* Has cache hints */ @@ -149,9 +197,77 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } +static void raid6_avx22_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 64) { + asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); + asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32])); + asm volatile("vpxor %ymm4,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm3,%ymm3"); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" + :: "m" (dptr[z][d+32])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + } + asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); + asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32])); + asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d])); + asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + const struct raid6_calls raid6_avx2x2 = { raid6_avx22_gen_syndrome, - NULL, /* XOR not yet implemented */ + raid6_avx22_xor_syndrome, raid6_have_avx2, "avx2x2", 1 /* Has cache hints */ @@ -242,9 +358,119 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } +static void raid6_avx24_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 128) { + asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32])); + asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64])); + asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); + asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32])); + asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64])); + asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96])); + asm volatile("vpxor %ymm4,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm3,%ymm3"); + asm volatile("vpxor %ymm12,%ymm10,%ymm10"); + asm volatile("vpxor %ymm14,%ymm11,%ymm11"); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); + asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64])); + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpxor %ymm13,%ymm13,%ymm13"); + asm volatile("vpxor %ymm15,%ymm15,%ymm15"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13"); + asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); + asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpand %ymm0,%ymm13,%ymm13"); + asm volatile("vpand %ymm0,%ymm15,%ymm15"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" + :: "m" (dptr[z][d+32])); + asm volatile("vmovdqa %0,%%ymm13" + :: "m" (dptr[z][d+64])); + asm volatile("vmovdqa %0,%%ymm15" + :: "m" (dptr[z][d+96])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm13,%ymm10,%ymm10"); + asm volatile("vpxor %ymm15,%ymm11,%ymm11"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + } + asm volatile("prefetchnta %0" :: "m" (q[d])); + asm volatile("prefetchnta %0" :: "m" (q[d+64])); + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpxor %ymm13,%ymm13,%ymm13"); + asm volatile("vpxor %ymm15,%ymm15,%ymm15"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13"); + asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); + asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpand %ymm0,%ymm13,%ymm13"); + asm volatile("vpand %ymm0,%ymm15,%ymm15"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + } + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); + asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64])); + asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96])); + asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); + asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32])); + asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64])); + asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96])); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); + asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64])); + asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + const struct raid6_calls raid6_avx2x4 = { raid6_avx24_gen_syndrome, - NULL, /* XOR not yet implemented */ + raid6_avx24_xor_syndrome, raid6_have_avx2, "avx2x4", 1 /* Has cache hints */ diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c new file mode 100644 index 000000000000..f524a7972006 --- /dev/null +++ b/lib/raid6/avx512.c @@ -0,0 +1,569 @@ +/* -*- linux-c -*- -------------------------------------------------------- + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Gayatri Kammela <gayatri.kammela@intel.com> + * Author: Megha Dey <megha.dey@linux.intel.com> + * + * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved + * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- + */ + +/* + * AVX512 implementation of RAID-6 syndrome functions + * + */ + +#ifdef CONFIG_AS_AVX512 + +#include <linux/raid/pq.h> +#include "x86.h" + +static const struct raid6_avx512_constants { + u64 x1d[8]; +} raid6_avx512_constants __aligned(512) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, +}; + +static int raid6_have_avx512(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +} + +static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0; d < bytes; d += 64) { + asm volatile("prefetchnta %0\n\t" + "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ + "prefetchnta %1\n\t" + "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ + "vmovdqa64 %1,%%zmm6" + : + : "m" (dptr[z0][d]), "m" (dptr[z0-1][d])); + for (z = z0-2; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" + "vmovdqa64 %0,%%zmm6" + : + : "m" (dptr[z][d])); + } + asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" + "vmovntdq %%zmm2,%0\n\t" + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" + "vmovntdq %%zmm4,%1\n\t" + "vpxorq %%zmm4,%%zmm4,%%zmm4" + : + : "m" (p[d]), "m" (q[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5121_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + : : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 64) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm2\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2" + : + : "m" (dptr[z0][d]), "m" (p[d])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4" + : + : "m" (dptr[z][d])); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4" + : + : ); + } + asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" + /* Don't use movntdq for r/w memory area < cache line */ + "vmovdqa64 %%zmm4,%0\n\t" + "vmovdqa64 %%zmm2,%1" + : + : "m" (q[d]), "m" (p[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx512x1 = { + raid6_avx5121_gen_syndrome, + raid6_avx5121_xor_syndrome, + raid6_have_avx512, + "avx512x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 AVX512 implementation + */ +static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + /* We uniformly assume a single prefetch covers at least 64 bytes */ + for (d = 0; d < bytes; d += 128) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ + "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */ + "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ + "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */ + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64])); + for (z = z0-1; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64])); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vmovntdq %%zmm4,%2\n\t" + "vmovntdq %%zmm6,%3" + : + : "m" (p[d]), "m" (p[d+64]), "m" (q[d]), + "m" (q[d+64])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5122_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + : : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 128) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm6\n\t" + "vmovdqa64 %2,%%zmm2\n\t" + "vmovdqa64 %3,%%zmm3\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm3,%%zmm3" + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), + "m" (p[d]), "m" (p[d+64])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64])); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : ); + } + asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" + "vpxorq %1,%%zmm6,%%zmm6\n\t" + /* Don't use movntdq for r/w + * memory area < cache line + */ + "vmovdqa64 %%zmm4,%0\n\t" + "vmovdqa64 %%zmm6,%1\n\t" + "vmovdqa64 %%zmm2,%2\n\t" + "vmovdqa64 %%zmm3,%3" + : + : "m" (q[d]), "m" (q[d+64]), "m" (p[d]), + "m" (p[d+64])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx512x2 = { + raid6_avx5122_gen_syndrome, + raid6_avx5122_xor_syndrome, + raid6_have_avx512, + "avx512x2", + 1 /* Has cache hints */ +}; + +#ifdef CONFIG_X86_64 + +/* + * Unrolled-by-4 AVX2 implementation + */ +static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */ + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */ + "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */ + "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */ + "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */ + "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */ + "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */ + "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */ + "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0; d < bytes; d += 256) { + for (z = z0; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "prefetchnta %2\n\t" + "prefetchnta %3\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vmovdqa64 %2,%%zmm13\n\t" + "vmovdqa64 %3,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm15,%%zmm11,%%zmm11\n" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64]), + "m" (dptr[z][d+128]), "m" (dptr[z][d+192])); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" + "vmovntdq %%zmm10,%2\n\t" + "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" + "vmovntdq %%zmm11,%3\n\t" + "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" + "vmovntdq %%zmm4,%4\n\t" + "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" + "vmovntdq %%zmm6,%5\n\t" + "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" + "vmovntdq %%zmm12,%6\n\t" + "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" + "vmovntdq %%zmm14,%7\n\t" + "vpxorq %%zmm14,%%zmm14,%%zmm14" + : + : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), + "m" (q[d+128]), "m" (q[d+192])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5124_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + :: "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 256) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm6\n\t" + "vmovdqa64 %2,%%zmm12\n\t" + "vmovdqa64 %3,%%zmm14\n\t" + "vmovdqa64 %4,%%zmm2\n\t" + "vmovdqa64 %5,%%zmm3\n\t" + "vmovdqa64 %6,%%zmm10\n\t" + "vmovdqa64 %7,%%zmm11\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm14,%%zmm11,%%zmm11" + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), + "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]), + "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" + "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" + "prefetchnta %0\n\t" + "prefetchnta %2\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vmovdqa64 %2,%%zmm13\n\t" + "vmovdqa64 %3,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64]), + "m" (dptr[z][d+128]), + "m" (dptr[z][d+192])); + } + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + : + : "m" (q[d]), "m" (q[d+128])); + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" + "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : ); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vmovntdq %%zmm10,%2\n\t" + "vmovntdq %%zmm11,%3\n\t" + "vpxorq %4,%%zmm4,%%zmm4\n\t" + "vpxorq %5,%%zmm6,%%zmm6\n\t" + "vpxorq %6,%%zmm12,%%zmm12\n\t" + "vpxorq %7,%%zmm14,%%zmm14\n\t" + "vmovntdq %%zmm4,%4\n\t" + "vmovntdq %%zmm6,%5\n\t" + "vmovntdq %%zmm12,%6\n\t" + "vmovntdq %%zmm14,%7" + : + : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), + "m" (q[d+128]), "m" (q[d+192])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} +const struct raid6_calls raid6_avx512x4 = { + raid6_avx5124_gen_syndrome, + raid6_avx5124_xor_syndrome, + raid6_have_avx512, + "avx512x4", + 1 /* Has cache hints */ +}; +#endif + +#endif /* CONFIG_AS_AVX512 */ diff --git a/lib/raid6/recov_avx512.c b/lib/raid6/recov_avx512.c new file mode 100644 index 000000000000..625aafa33b61 --- /dev/null +++ b/lib/raid6/recov_avx512.c @@ -0,0 +1,388 @@ +/* + * Copyright (C) 2016 Intel Corporation + * + * Author: Gayatri Kammela <gayatri.kammela@intel.com> + * Author: Megha Dey <megha.dey@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + */ + +#ifdef CONFIG_AS_AVX512 + +#include <linux/raid/pq.h> +#include "x86.h" + +static int raid6_has_avx512(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +} + +static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* zmm0 = x0f[16] */ + asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa64 %0, %%zmm1\n\t" + "vmovdqa64 %1, %%zmm9\n\t" + "vmovdqa64 %2, %%zmm0\n\t" + "vmovdqa64 %3, %%zmm8\n\t" + "vpxorq %4, %%zmm1, %%zmm1\n\t" + "vpxorq %5, %%zmm9, %%zmm9\n\t" + "vpxorq %6, %%zmm0, %%zmm0\n\t" + "vpxorq %7, %%zmm8, %%zmm8" + : + : "m" (q[0]), "m" (q[64]), "m" (p[0]), + "m" (p[64]), "m" (dq[0]), "m" (dq[64]), + "m" (dp[0]), "m" (dp[64])); + + /* + * 1 = dq[0] ^ q[0] + * 9 = dq[64] ^ q[64] + * 0 = dp[0] ^ p[0] + * 8 = dp[64] ^ p[64] + */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm5" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t" + "vpsraw $4, %%zmm9, %%zmm12\n\t" + "vpandq %%zmm7, %%zmm1, %%zmm1\n\t" + "vpandq %%zmm7, %%zmm9, %%zmm9\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm12, %%zmm12\n\t" + "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t" + "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t" + "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t" + "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t" + "vpxorq %%zmm4, %%zmm5, %%zmm5" + : + : ); + + /* + * 5 = qx[0] + * 15 = qx[64] + */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm1\n\t" + "vpsraw $4, %%zmm0, %%zmm2\n\t" + "vpsraw $4, %%zmm8, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm0, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm8, %%zmm14\n\t" + "vpandq %%zmm7, %%zmm2, %%zmm2\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t" + "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t" + "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm12, %%zmm13, %%zmm13" + : + : "m" (pbmul[0]), "m" (pbmul[16])); + + /* + * 1 = pbmul[px[0]] + * 13 = pbmul[px[64]] + */ + asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm15, %%zmm13, %%zmm13" + : + : ); + + /* + * 1 = db = DQ + * 13 = db[64] = DQ[64] + */ + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm13,%1\n\t" + "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t" + "vpxorq %%zmm13, %%zmm8, %%zmm8" + : + : "m" (dq[0]), "m" (dq[64])); + + asm volatile("vmovdqa64 %%zmm0, %0\n\t" + "vmovdqa64 %%zmm8, %1" + : + : "m" (dp[0]), "m" (dp[64])); + + bytes -= 128; + p += 128; + q += 128; + dp += 128; + dq += 128; +#else + asm volatile("vmovdqa64 %0, %%zmm1\n\t" + "vmovdqa64 %1, %%zmm0\n\t" + "vpxorq %2, %%zmm1, %%zmm1\n\t" + "vpxorq %3, %%zmm0, %%zmm0" + : + : "m" (*q), "m" (*p), "m"(*dq), "m" (*dp)); + + /* 1 = dq ^ q; 0 = dp ^ p */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm5" + : + : "m" (qmul[0]), "m" (qmul[16])); + + /* + * 1 = dq ^ q + * 3 = dq ^ p >> 4 + */ + asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm1, %%zmm1\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t" + "vpxorq %%zmm4, %%zmm5, %%zmm5" + : + : ); + + /* 5 = qx */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm1" + : + : "m" (pbmul[0]), "m" (pbmul[16])); + + asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t" + "vpandq %%zmm7, %%zmm0, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm2, %%zmm2\n\t" + "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm4, %%zmm1, %%zmm1" + : + : ); + + /* 1 = pbmul[px] */ + asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t" + /* 1 = db = DQ */ + "vmovdqa64 %%zmm1, %0\n\t" + : + : "m" (dq[0])); + + asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t" + "vmovdqa64 %%zmm0, %0" + : + : "m" (dp[0])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; +#endif + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa64 %0, %%zmm3\n\t" + "vmovdqa64 %1, %%zmm8\n\t" + "vpxorq %2, %%zmm3, %%zmm3\n\t" + "vpxorq %3, %%zmm8, %%zmm8" + : + : "m" (dq[0]), "m" (dq[64]), "m" (q[0]), + "m" (q[64])); + + /* + * 3 = q[0] ^ dq[0] + * 8 = q[64] ^ dq[64] + */ + asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t" + "vmovapd %%zmm0, %%zmm13\n\t" + "vbroadcasti64x2 %1, %%zmm1\n\t" + "vmovapd %%zmm1, %%zmm14" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t" + "vpsraw $4, %%zmm8, %%zmm12\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm8, %%zmm8\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm12, %%zmm12\n\t" + "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t" + "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t" + "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t" + "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm13, %%zmm14, %%zmm14" + : + : ); + + /* + * 1 = qmul[q[0] ^ dq[0]] + * 14 = qmul[q[64] ^ dq[64]] + */ + asm volatile("vmovdqa64 %0, %%zmm2\n\t" + "vmovdqa64 %1, %%zmm12\n\t" + "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t" + "vpxorq %%zmm14, %%zmm12, %%zmm12" + : + : "m" (p[0]), "m" (p[64])); + + /* + * 2 = p[0] ^ qmul[q[0] ^ dq[0]] + * 12 = p[64] ^ qmul[q[64] ^ dq[64]] + */ + + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm14, %1\n\t" + "vmovdqa64 %%zmm2, %2\n\t" + "vmovdqa64 %%zmm12,%3" + : + : "m" (dq[0]), "m" (dq[64]), "m" (p[0]), + "m" (p[64])); + + bytes -= 128; + p += 128; + q += 128; + dq += 128; +#else + asm volatile("vmovdqa64 %0, %%zmm3\n\t" + "vpxorq %1, %%zmm3, %%zmm3" + : + : "m" (dq[0]), "m" (q[0])); + + /* 3 = q ^ dq */ + + asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t" + "vbroadcasti64x2 %1, %%zmm1" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm0, %%zmm1, %%zmm1" + : + : ); + + /* 1 = qmul[q ^ dq] */ + + asm volatile("vmovdqa64 %0, %%zmm2\n\t" + "vpxorq %%zmm1, %%zmm2, %%zmm2" + : + : "m" (p[0])); + + /* 2 = p ^ qmul[q ^ dq] */ + + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm2, %1" + : + : "m" (dq[0]), "m" (p[0])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_avx512 = { + .data2 = raid6_2data_recov_avx512, + .datap = raid6_datap_recov_avx512, + .valid = raid6_has_avx512, +#ifdef CONFIG_X86_64 + .name = "avx512x2", +#else + .name = "avx512x1", +#endif + .priority = 3, +}; + +#else +#warning "your version of binutils lacks AVX512 support" +#endif diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 29090f3db677..2c7b60edea04 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -32,10 +32,13 @@ ifeq ($(ARCH),arm64) endif ifeq ($(IS_X86),yes) - OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o + OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ gcc -c -x assembler - >&/dev/null && \ rm ./-.o && echo -DCONFIG_AS_AVX2=1) + CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \ + gcc -c -x assembler - >&/dev/null && \ + rm ./-.o && echo -DCONFIG_AS_AVX512=1) else ifeq ($(HAS_NEON),yes) OBJS += neon.o neon1.o neon2.o neon4.o neon8.o CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c index 3bebbabdb510..b07f4d8e6b03 100644 --- a/lib/raid6/test/test.c +++ b/lib/raid6/test/test.c @@ -21,12 +21,13 @@ #define NDISKS 16 /* Including P and Q */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); struct raid6_calls raid6_call; char *dataptrs[NDISKS]; -char data[NDISKS][PAGE_SIZE]; -char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; +char data[NDISKS][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); +char recovi[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); +char recovj[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); static void makedata(int start, int stop) { diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h index 8fe9d9662abb..834d268a4b05 100644 --- a/lib/raid6/x86.h +++ b/lib/raid6/x86.h @@ -46,6 +46,16 @@ static inline void kernel_fpu_end(void) #define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ #define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ (9*32+17) /* AVX-512 DQ (Double/Quad granular) + * Instructions + */ +#define X86_FEATURE_AVX512BW (9*32+30) /* AVX-512 BW (Byte/Word granular) + * Instructions + */ +#define X86_FEATURE_AVX512VL (9*32+31) /* AVX-512 VL (128/256 Vector Length) + * Extensions + */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ /* Should work well enough on modern CPUs for testing */ diff --git a/lib/random32.c b/lib/random32.c index 69ed593aab07..fa594b1140e6 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -47,7 +47,7 @@ static inline void prandom_state_selftest(void) } #endif -static DEFINE_PER_CPU(struct rnd_state, net_rand_state); +static DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy; /** * prandom_u32_state - seeded pseudo-random number generator. @@ -81,7 +81,7 @@ u32 prandom_u32(void) u32 res; res = prandom_u32_state(state); - put_cpu_var(state); + put_cpu_var(net_rand_state); return res; } @@ -128,7 +128,7 @@ void prandom_bytes(void *buf, size_t bytes) struct rnd_state *state = &get_cpu_var(net_rand_state); prandom_bytes_state(state, buf, bytes); - put_cpu_var(state); + put_cpu_var(net_rand_state); } EXPORT_SYMBOL(prandom_bytes); diff --git a/lib/rbtree.c b/lib/rbtree.c index eb8a19fee110..1f8b112a7c35 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -296,11 +296,26 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * * (p) (p) * / \ / \ - * N S --> N Sl + * N S --> N sl * / \ \ - * sl Sr s + * sl Sr S * \ * Sr + * + * Note: p might be red, and then both + * p and sl are red after rotation(which + * breaks property 4). This is fixed in + * Case 4 (in __rb_rotate_set_parents() + * which set sl the color of p + * and set p RB_BLACK) + * + * (p) (sl) + * / \ / \ + * N sl --> P S + * \ / \ + * S N Sr + * \ + * Sr */ tmp1 = tmp2->rb_right; WRITE_ONCE(sibling->rb_left, tmp1); @@ -365,7 +380,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, } break; } - /* Case 3 - right rotate at sibling */ + /* Case 3 - left rotate at sibling */ tmp1 = tmp2->rb_left; WRITE_ONCE(sibling->rb_right, tmp1); WRITE_ONCE(tmp2->rb_left, sibling); @@ -377,7 +392,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, tmp1 = sibling; sibling = tmp2; } - /* Case 4 - left rotate at parent + color flips */ + /* Case 4 - right rotate at parent + color flips */ tmp2 = sibling->rb_right; WRITE_ONCE(parent->rb_left, tmp2); WRITE_ONCE(sibling->rb_right, parent); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 56054e541a0f..172454e6b979 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -32,6 +32,11 @@ #define HASH_MIN_SIZE 4U #define BUCKET_LOCKS_PER_CPU 32UL +union nested_table { + union nested_table __rcu *table; + struct rhash_head __rcu *bucket; +}; + static u32 head_hashfn(struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he) @@ -76,6 +81,9 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, /* Never allocate more than 0.5 locks per bucket */ size = min_t(unsigned int, size, tbl->size >> 1); + if (tbl->nest) + size = min(size, 1U << tbl->nest); + if (sizeof(spinlock_t) != 0) { tbl->locks = NULL; #ifdef CONFIG_NUMA @@ -99,8 +107,45 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, return 0; } +static void nested_table_free(union nested_table *ntbl, unsigned int size) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + const unsigned int len = 1 << shift; + unsigned int i; + + ntbl = rcu_dereference_raw(ntbl->table); + if (!ntbl) + return; + + if (size > len) { + size >>= shift; + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + } + + kfree(ntbl); +} + +static void nested_bucket_table_free(const struct bucket_table *tbl) +{ + unsigned int size = tbl->size >> tbl->nest; + unsigned int len = 1 << tbl->nest; + union nested_table *ntbl; + unsigned int i; + + ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); + + for (i = 0; i < len; i++) + nested_table_free(ntbl + i, size); + + kfree(ntbl); +} + static void bucket_table_free(const struct bucket_table *tbl) { + if (tbl->nest) + nested_bucket_table_free(tbl); + if (tbl) kvfree(tbl->locks); @@ -112,6 +157,59 @@ static void bucket_table_free_rcu(struct rcu_head *head) bucket_table_free(container_of(head, struct bucket_table, rcu)); } +static union nested_table *nested_table_alloc(struct rhashtable *ht, + union nested_table __rcu **prev, + unsigned int shifted, + unsigned int nhash) +{ + union nested_table *ntbl; + int i; + + ntbl = rcu_dereference(*prev); + if (ntbl) + return ntbl; + + ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); + + if (ntbl && shifted) { + for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0].bucket); i++) + INIT_RHT_NULLS_HEAD(ntbl[i].bucket, ht, + (i << shifted) | nhash); + } + + rcu_assign_pointer(*prev, ntbl); + + return ntbl; +} + +static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets, + gfp_t gfp) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + struct bucket_table *tbl; + size_t size; + + if (nbuckets < (1 << (shift + 1))) + return NULL; + + size = sizeof(*tbl) + sizeof(tbl->buckets[0]); + + tbl = kzalloc(size, gfp); + if (!tbl) + return NULL; + + if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets, + 0, 0)) { + kfree(tbl); + return NULL; + } + + tbl->nest = (ilog2(nbuckets) - 1) % shift + 1; + + return tbl; +} + static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t nbuckets, gfp_t gfp) @@ -126,10 +224,17 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); if (tbl == NULL && gfp == GFP_KERNEL) tbl = vzalloc(size); + + size = nbuckets; + + if (tbl == NULL && gfp != GFP_KERNEL) { + tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); + nbuckets = 0; + } if (tbl == NULL) return NULL; - tbl->size = nbuckets; + tbl->size = size; if (alloc_bucket_locks(ht, tbl, gfp) < 0) { bucket_table_free(tbl); @@ -164,12 +269,17 @@ static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash) struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); struct bucket_table *new_tbl = rhashtable_last_table(ht, rht_dereference_rcu(old_tbl->future_tbl, ht)); - struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash]; - int err = -ENOENT; + struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash); + int err = -EAGAIN; struct rhash_head *head, *next, *entry; spinlock_t *new_bucket_lock; unsigned int new_hash; + if (new_tbl->nest) + goto out; + + err = -ENOENT; + rht_for_each(entry, old_tbl, old_hash) { err = 0; next = rht_dereference_bucket(entry->next, old_tbl, old_hash); @@ -202,19 +312,26 @@ out: return err; } -static void rhashtable_rehash_chain(struct rhashtable *ht, +static int rhashtable_rehash_chain(struct rhashtable *ht, unsigned int old_hash) { struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); spinlock_t *old_bucket_lock; + int err; old_bucket_lock = rht_bucket_lock(old_tbl, old_hash); spin_lock_bh(old_bucket_lock); - while (!rhashtable_rehash_one(ht, old_hash)) + while (!(err = rhashtable_rehash_one(ht, old_hash))) ; - old_tbl->rehash++; + + if (err == -ENOENT) { + old_tbl->rehash++; + err = 0; + } spin_unlock_bh(old_bucket_lock); + + return err; } static int rhashtable_rehash_attach(struct rhashtable *ht, @@ -246,13 +363,17 @@ static int rhashtable_rehash_table(struct rhashtable *ht) struct bucket_table *new_tbl; struct rhashtable_walker *walker; unsigned int old_hash; + int err; new_tbl = rht_dereference(old_tbl->future_tbl, ht); if (!new_tbl) return 0; - for (old_hash = 0; old_hash < old_tbl->size; old_hash++) - rhashtable_rehash_chain(ht, old_hash); + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { + err = rhashtable_rehash_chain(ht, old_hash); + if (err) + return err; + } /* Publish the new table pointer. */ rcu_assign_pointer(ht->tbl, new_tbl); @@ -271,31 +392,16 @@ static int rhashtable_rehash_table(struct rhashtable *ht) return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0; } -/** - * rhashtable_expand - Expand hash table while allowing concurrent lookups - * @ht: the hash table to expand - * - * A secondary bucket array is allocated and the hash entries are migrated. - * - * This function may only be called in a context where it is safe to call - * synchronize_rcu(), e.g. not within a rcu_read_lock() section. - * - * The caller must ensure that no concurrent resizing occurs by holding - * ht->mutex. - * - * It is valid to have concurrent insertions and deletions protected by per - * bucket locks or concurrent RCU protected lookups and traversals. - */ -static int rhashtable_expand(struct rhashtable *ht) +static int rhashtable_rehash_alloc(struct rhashtable *ht, + struct bucket_table *old_tbl, + unsigned int size) { - struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *new_tbl; int err; ASSERT_RHT_MUTEX(ht); - old_tbl = rhashtable_last_table(ht, old_tbl); - - new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL); + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); if (new_tbl == NULL) return -ENOMEM; @@ -324,12 +430,9 @@ static int rhashtable_expand(struct rhashtable *ht) */ static int rhashtable_shrink(struct rhashtable *ht) { - struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht); unsigned int nelems = atomic_read(&ht->nelems); unsigned int size = 0; - int err; - - ASSERT_RHT_MUTEX(ht); if (nelems) size = roundup_pow_of_two(nelems * 3 / 2); @@ -342,15 +445,7 @@ static int rhashtable_shrink(struct rhashtable *ht) if (rht_dereference(old_tbl->future_tbl, ht)) return -EEXIST; - new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); - if (new_tbl == NULL) - return -ENOMEM; - - err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); - if (err) - bucket_table_free(new_tbl); - - return err; + return rhashtable_rehash_alloc(ht, old_tbl, size); } static void rht_deferred_worker(struct work_struct *work) @@ -366,11 +461,14 @@ static void rht_deferred_worker(struct work_struct *work) tbl = rhashtable_last_table(ht, tbl); if (rht_grow_above_75(ht, tbl)) - rhashtable_expand(ht); + err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2); else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl)) - rhashtable_shrink(ht); + err = rhashtable_shrink(ht); + else if (tbl->nest) + err = rhashtable_rehash_alloc(ht, tbl, tbl->size); - err = rhashtable_rehash_table(ht); + if (!err) + err = rhashtable_rehash_table(ht); mutex_unlock(&ht->mutex); @@ -378,22 +476,8 @@ static void rht_deferred_worker(struct work_struct *work) schedule_work(&ht->run_work); } -static bool rhashtable_check_elasticity(struct rhashtable *ht, - struct bucket_table *tbl, - unsigned int hash) -{ - unsigned int elasticity = ht->elasticity; - struct rhash_head *head; - - rht_for_each(head, tbl, hash) - if (!--elasticity) - return true; - - return false; -} - -int rhashtable_insert_rehash(struct rhashtable *ht, - struct bucket_table *tbl) +static int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; @@ -439,61 +523,177 @@ fail: return err; } -EXPORT_SYMBOL_GPL(rhashtable_insert_rehash); -struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, - const void *key, - struct rhash_head *obj, - struct bucket_table *tbl) +static void *rhashtable_lookup_one(struct rhashtable *ht, + struct bucket_table *tbl, unsigned int hash, + const void *key, struct rhash_head *obj) { + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = key, + }; + struct rhash_head __rcu **pprev; struct rhash_head *head; - unsigned int hash; - int err; + int elasticity; - tbl = rhashtable_last_table(ht, tbl); - hash = head_hashfn(ht, tbl, obj); - spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING); + elasticity = ht->elasticity; + pprev = rht_bucket_var(tbl, hash); + rht_for_each_continue(head, *pprev, tbl, hash) { + struct rhlist_head *list; + struct rhlist_head *plist; - err = -EEXIST; - if (key && rhashtable_lookup_fast(ht, key, ht->p)) - goto exit; + elasticity--; + if (!key || + (ht->p.obj_cmpfn ? + ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) + continue; + + if (!ht->rhlist) + return rht_obj(ht, head); + + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); + + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + rcu_assign_pointer(*pprev, obj); + + return NULL; + } + + if (elasticity <= 0) + return ERR_PTR(-EAGAIN); + + return ERR_PTR(-ENOENT); +} + +static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash, + struct rhash_head *obj, + void *data) +{ + struct rhash_head __rcu **pprev; + struct bucket_table *new_tbl; + struct rhash_head *head; + + if (!IS_ERR_OR_NULL(data)) + return ERR_PTR(-EEXIST); + + if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); + + new_tbl = rcu_dereference(tbl->future_tbl); + if (new_tbl) + return new_tbl; + + if (PTR_ERR(data) != -ENOENT) + return ERR_CAST(data); - err = -E2BIG; if (unlikely(rht_grow_above_max(ht, tbl))) - goto exit; + return ERR_PTR(-E2BIG); - err = -EAGAIN; - if (rhashtable_check_elasticity(ht, tbl, hash) || - rht_grow_above_100(ht, tbl)) - goto exit; + if (unlikely(rht_grow_above_100(ht, tbl))) + return ERR_PTR(-EAGAIN); - err = 0; + pprev = rht_bucket_insert(ht, tbl, hash); + if (!pprev) + return ERR_PTR(-ENOMEM); - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + head = rht_dereference_bucket(*pprev, tbl, hash); RCU_INIT_POINTER(obj->next, head); + if (ht->rhlist) { + struct rhlist_head *list; + + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } - rcu_assign_pointer(tbl->buckets[hash], obj); + rcu_assign_pointer(*pprev, obj); atomic_inc(&ht->nelems); + if (rht_grow_above_75(ht, tbl)) + schedule_work(&ht->run_work); -exit: - spin_unlock(rht_bucket_lock(tbl, hash)); + return NULL; +} - if (err == 0) - return NULL; - else if (err == -EAGAIN) - return tbl; - else - return ERR_PTR(err); +static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + struct bucket_table *new_tbl; + struct bucket_table *tbl; + unsigned int hash; + spinlock_t *lock; + void *data; + + tbl = rcu_dereference(ht->tbl); + + /* All insertions must grab the oldest table containing + * the hashed bucket that is yet to be rehashed. + */ + for (;;) { + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + lock = rht_bucket_lock(tbl, hash); + spin_lock_bh(lock); + + if (tbl->rehash <= hash) + break; + + spin_unlock_bh(lock); + tbl = rcu_dereference(tbl->future_tbl); + } + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + while (!IS_ERR_OR_NULL(new_tbl)) { + tbl = new_tbl; + hash = rht_head_hashfn(ht, tbl, obj, ht->p); + spin_lock_nested(rht_bucket_lock(tbl, hash), + SINGLE_DEPTH_NESTING); + + data = rhashtable_lookup_one(ht, tbl, hash, key, obj); + new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data); + if (PTR_ERR(new_tbl) != -EEXIST) + data = ERR_CAST(new_tbl); + + spin_unlock(rht_bucket_lock(tbl, hash)); + } + + spin_unlock_bh(lock); + + if (PTR_ERR(data) == -EAGAIN) + data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?: + -EAGAIN); + + return data; +} + +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj) +{ + void *data; + + do { + rcu_read_lock(); + data = rhashtable_try_insert(ht, key, obj); + rcu_read_unlock(); + } while (PTR_ERR(data) == -EAGAIN); + + return data; } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); /** - * rhashtable_walk_init - Initialise an iterator + * rhashtable_walk_enter - Initialise an iterator * @ht: Table to walk over * @iter: Hash table Iterator - * @gfp: GFP flags for allocations * * This function prepares a hash table walk. * @@ -508,30 +708,22 @@ EXPORT_SYMBOL_GPL(rhashtable_insert_slow); * This function may sleep so you must not call it from interrupt * context or with spin locks held. * - * You must call rhashtable_walk_exit if this function returns - * successfully. + * You must call rhashtable_walk_exit after this function returns. */ -int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter, - gfp_t gfp) +void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter) { iter->ht = ht; iter->p = NULL; iter->slot = 0; iter->skip = 0; - iter->walker = kmalloc(sizeof(*iter->walker), gfp); - if (!iter->walker) - return -ENOMEM; - spin_lock(&ht->lock); - iter->walker->tbl = + iter->walker.tbl = rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock)); - list_add(&iter->walker->list, &iter->walker->tbl->walkers); + list_add(&iter->walker.list, &iter->walker.tbl->walkers); spin_unlock(&ht->lock); - - return 0; } -EXPORT_SYMBOL_GPL(rhashtable_walk_init); +EXPORT_SYMBOL_GPL(rhashtable_walk_enter); /** * rhashtable_walk_exit - Free an iterator @@ -542,10 +734,9 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_init); void rhashtable_walk_exit(struct rhashtable_iter *iter) { spin_lock(&iter->ht->lock); - if (iter->walker->tbl) - list_del(&iter->walker->list); + if (iter->walker.tbl) + list_del(&iter->walker.list); spin_unlock(&iter->ht->lock); - kfree(iter->walker); } EXPORT_SYMBOL_GPL(rhashtable_walk_exit); @@ -571,12 +762,12 @@ int rhashtable_walk_start(struct rhashtable_iter *iter) rcu_read_lock(); spin_lock(&ht->lock); - if (iter->walker->tbl) - list_del(&iter->walker->list); + if (iter->walker.tbl) + list_del(&iter->walker.list); spin_unlock(&ht->lock); - if (!iter->walker->tbl) { - iter->walker->tbl = rht_dereference_rcu(ht->tbl, ht); + if (!iter->walker.tbl) { + iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht); return -EAGAIN; } @@ -598,12 +789,17 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_start); */ void *rhashtable_walk_next(struct rhashtable_iter *iter) { - struct bucket_table *tbl = iter->walker->tbl; + struct bucket_table *tbl = iter->walker.tbl; + struct rhlist_head *list = iter->list; struct rhashtable *ht = iter->ht; struct rhash_head *p = iter->p; + bool rhlist = ht->rhlist; if (p) { - p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot); + if (!rhlist || !(list = rcu_dereference(list->next))) { + p = rcu_dereference(p->next); + list = container_of(p, struct rhlist_head, rhead); + } goto next; } @@ -611,6 +807,18 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter) int skip = iter->skip; rht_for_each_rcu(p, tbl, iter->slot) { + if (rhlist) { + list = container_of(p, struct rhlist_head, + rhead); + do { + if (!skip) + goto next; + skip--; + list = rcu_dereference(list->next); + } while (list); + + continue; + } if (!skip) break; skip--; @@ -620,7 +828,8 @@ next: if (!rht_is_a_nulls(p)) { iter->skip++; iter->p = p; - return rht_obj(ht, p); + iter->list = list; + return rht_obj(ht, rhlist ? &list->rhead : p); } iter->skip = 0; @@ -631,8 +840,8 @@ next: /* Ensure we see any new tables. */ smp_rmb(); - iter->walker->tbl = rht_dereference_rcu(tbl->future_tbl, ht); - if (iter->walker->tbl) { + iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht); + if (iter->walker.tbl) { iter->slot = 0; iter->skip = 0; return ERR_PTR(-EAGAIN); @@ -652,7 +861,7 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU) { struct rhashtable *ht; - struct bucket_table *tbl = iter->walker->tbl; + struct bucket_table *tbl = iter->walker.tbl; if (!tbl) goto out; @@ -661,9 +870,9 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter) spin_lock(&ht->lock); if (tbl->rehash < tbl->size) - list_add(&iter->walker->list, &tbl->walkers); + list_add(&iter->walker.list, &tbl->walkers); else - iter->walker->tbl = NULL; + iter->walker.tbl = NULL; spin_unlock(&ht->lock); iter->p = NULL; @@ -809,6 +1018,48 @@ int rhashtable_init(struct rhashtable *ht, EXPORT_SYMBOL_GPL(rhashtable_init); /** + * rhltable_init - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * + * See documentation for rhashtable_init. + */ +int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) +{ + int err; + + /* No rhlist NULLs marking for now. */ + if (params->nulls_base) + return -EINVAL; + + err = rhashtable_init(&hlt->ht, params); + hlt->ht.rhlist = true; + return err; +} +EXPORT_SYMBOL_GPL(rhltable_init); + +static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, + void (*free_fn)(void *ptr, void *arg), + void *arg) +{ + struct rhlist_head *list; + + if (!ht->rhlist) { + free_fn(rht_obj(ht, obj), arg); + return; + } + + list = container_of(obj, struct rhlist_head, rhead); + do { + obj = &list->rhead; + list = rht_dereference(list->next, ht); + free_fn(rht_obj(ht, obj), arg); + } while (list); +} + +/** * rhashtable_free_and_destroy - free elements and destroy hash table * @ht: the hash table to destroy * @free_fn: callback to release resources of element @@ -827,7 +1078,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg) { - const struct bucket_table *tbl; + struct bucket_table *tbl; unsigned int i; cancel_work_sync(&ht->run_work); @@ -838,14 +1089,14 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, for (i = 0; i < tbl->size; i++) { struct rhash_head *pos, *next; - for (pos = rht_dereference(tbl->buckets[i], ht), + for (pos = rht_dereference(*rht_bucket(tbl, i), ht), next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL; !rht_is_a_nulls(pos); pos = next, next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL) - free_fn(rht_obj(ht, pos), arg); + rhashtable_free_one(ht, pos, free_fn, arg); } } @@ -859,3 +1110,70 @@ void rhashtable_destroy(struct rhashtable *ht) return rhashtable_free_and_destroy(ht, NULL, NULL); } EXPORT_SYMBOL_GPL(rhashtable_destroy); + +struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl, + unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + static struct rhash_head __rcu *rhnull = + (struct rhash_head __rcu *)NULLS_MARKER(0); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + unsigned int subhash = hash; + union nested_table *ntbl; + + ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); + ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash); + subhash >>= tbl->nest; + + while (ntbl && size > (1 << shift)) { + index = subhash & ((1 << shift) - 1); + ntbl = rht_dereference_bucket(ntbl[index].table, tbl, hash); + size >>= shift; + subhash >>= shift; + } + + if (!ntbl) + return &rhnull; + + return &ntbl[subhash].bucket; + +} +EXPORT_SYMBOL_GPL(rht_bucket_nested); + +struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash) +{ + const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *)); + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + union nested_table *ntbl; + unsigned int shifted; + unsigned int nhash; + + ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]); + hash >>= tbl->nest; + nhash = index; + shifted = tbl->nest; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift) ? shifted : 0, nhash); + + while (ntbl && size > (1 << shift)) { + index = hash & ((1 << shift) - 1); + size >>= shift; + hash >>= shift; + nhash |= index << shifted; + shifted += shift; + ntbl = nested_table_alloc(ht, &ntbl[index].table, + size <= (1 << shift) ? shifted : 0, + nhash); + } + + if (!ntbl) + return NULL; + + return &ntbl[hash].bucket; + +} +EXPORT_SYMBOL_GPL(rht_bucket_nested_insert); diff --git a/lib/sbitmap.c b/lib/sbitmap.c new file mode 100644 index 000000000000..55e11c4b2f3b --- /dev/null +++ b/lib/sbitmap.c @@ -0,0 +1,470 @@ +/* + * Copyright (C) 2016 Facebook + * Copyright (C) 2013-2014 Jens Axboe + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#include <linux/random.h> +#include <linux/sbitmap.h> +#include <linux/seq_file.h> + +int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, + gfp_t flags, int node) +{ + unsigned int bits_per_word; + unsigned int i; + + if (shift < 0) { + shift = ilog2(BITS_PER_LONG); + /* + * If the bitmap is small, shrink the number of bits per word so + * we spread over a few cachelines, at least. If less than 4 + * bits, just forget about it, it's not going to work optimally + * anyway. + */ + if (depth >= 4) { + while ((4U << shift) > depth) + shift--; + } + } + bits_per_word = 1U << shift; + if (bits_per_word > BITS_PER_LONG) + return -EINVAL; + + sb->shift = shift; + sb->depth = depth; + sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); + + if (depth == 0) { + sb->map = NULL; + return 0; + } + + sb->map = kzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node); + if (!sb->map) + return -ENOMEM; + + for (i = 0; i < sb->map_nr; i++) { + sb->map[i].depth = min(depth, bits_per_word); + depth -= sb->map[i].depth; + } + return 0; +} +EXPORT_SYMBOL_GPL(sbitmap_init_node); + +void sbitmap_resize(struct sbitmap *sb, unsigned int depth) +{ + unsigned int bits_per_word = 1U << sb->shift; + unsigned int i; + + sb->depth = depth; + sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); + + for (i = 0; i < sb->map_nr; i++) { + sb->map[i].depth = min(depth, bits_per_word); + depth -= sb->map[i].depth; + } +} +EXPORT_SYMBOL_GPL(sbitmap_resize); + +static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint, + bool wrap) +{ + unsigned int orig_hint = hint; + int nr; + + while (1) { + nr = find_next_zero_bit(&word->word, word->depth, hint); + if (unlikely(nr >= word->depth)) { + /* + * We started with an offset, and we didn't reset the + * offset to 0 in a failure case, so start from 0 to + * exhaust the map. + */ + if (orig_hint && hint && wrap) { + hint = orig_hint = 0; + continue; + } + return -1; + } + + if (!test_and_set_bit(nr, &word->word)) + break; + + hint = nr + 1; + if (hint >= word->depth - 1) + hint = 0; + } + + return nr; +} + +int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin) +{ + unsigned int i, index; + int nr = -1; + + index = SB_NR_TO_INDEX(sb, alloc_hint); + + for (i = 0; i < sb->map_nr; i++) { + nr = __sbitmap_get_word(&sb->map[index], + SB_NR_TO_BIT(sb, alloc_hint), + !round_robin); + if (nr != -1) { + nr += index << sb->shift; + break; + } + + /* Jump to next index. */ + index++; + alloc_hint = index << sb->shift; + + if (index >= sb->map_nr) { + index = 0; + alloc_hint = 0; + } + } + + return nr; +} +EXPORT_SYMBOL_GPL(sbitmap_get); + +bool sbitmap_any_bit_set(const struct sbitmap *sb) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + if (sb->map[i].word) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(sbitmap_any_bit_set); + +bool sbitmap_any_bit_clear(const struct sbitmap *sb) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + const struct sbitmap_word *word = &sb->map[i]; + unsigned long ret; + + ret = find_first_zero_bit(&word->word, word->depth); + if (ret < word->depth) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear); + +unsigned int sbitmap_weight(const struct sbitmap *sb) +{ + unsigned int i, weight = 0; + + for (i = 0; i < sb->map_nr; i++) { + const struct sbitmap_word *word = &sb->map[i]; + + weight += bitmap_weight(&word->word, word->depth); + } + return weight; +} +EXPORT_SYMBOL_GPL(sbitmap_weight); + +void sbitmap_show(struct sbitmap *sb, struct seq_file *m) +{ + seq_printf(m, "depth=%u\n", sb->depth); + seq_printf(m, "busy=%u\n", sbitmap_weight(sb)); + seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift); + seq_printf(m, "map_nr=%u\n", sb->map_nr); +} +EXPORT_SYMBOL_GPL(sbitmap_show); + +static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte) +{ + if ((offset & 0xf) == 0) { + if (offset != 0) + seq_putc(m, '\n'); + seq_printf(m, "%08x:", offset); + } + if ((offset & 0x1) == 0) + seq_putc(m, ' '); + seq_printf(m, "%02x", byte); +} + +void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m) +{ + u8 byte = 0; + unsigned int byte_bits = 0; + unsigned int offset = 0; + int i; + + for (i = 0; i < sb->map_nr; i++) { + unsigned long word = READ_ONCE(sb->map[i].word); + unsigned int word_bits = READ_ONCE(sb->map[i].depth); + + while (word_bits > 0) { + unsigned int bits = min(8 - byte_bits, word_bits); + + byte |= (word & (BIT(bits) - 1)) << byte_bits; + byte_bits += bits; + if (byte_bits == 8) { + emit_byte(m, offset, byte); + byte = 0; + byte_bits = 0; + offset++; + } + word >>= bits; + word_bits -= bits; + } + } + if (byte_bits) { + emit_byte(m, offset, byte); + offset++; + } + if (offset) + seq_putc(m, '\n'); +} +EXPORT_SYMBOL_GPL(sbitmap_bitmap_show); + +static unsigned int sbq_calc_wake_batch(unsigned int depth) +{ + unsigned int wake_batch; + + /* + * For each batch, we wake up one queue. We need to make sure that our + * batch size is small enough that the full depth of the bitmap is + * enough to wake up all of the queues. + */ + wake_batch = SBQ_WAKE_BATCH; + if (wake_batch > depth / SBQ_WAIT_QUEUES) + wake_batch = max(1U, depth / SBQ_WAIT_QUEUES); + + return wake_batch; +} + +int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, + int shift, bool round_robin, gfp_t flags, int node) +{ + int ret; + int i; + + ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node); + if (ret) + return ret; + + sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags); + if (!sbq->alloc_hint) { + sbitmap_free(&sbq->sb); + return -ENOMEM; + } + + if (depth && !round_robin) { + for_each_possible_cpu(i) + *per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth; + } + + sbq->wake_batch = sbq_calc_wake_batch(depth); + atomic_set(&sbq->wake_index, 0); + + sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); + if (!sbq->ws) { + free_percpu(sbq->alloc_hint); + sbitmap_free(&sbq->sb); + return -ENOMEM; + } + + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + init_waitqueue_head(&sbq->ws[i].wait); + atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch); + } + + sbq->round_robin = round_robin; + return 0; +} +EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); + +void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) +{ + unsigned int wake_batch = sbq_calc_wake_batch(depth); + int i; + + if (sbq->wake_batch != wake_batch) { + WRITE_ONCE(sbq->wake_batch, wake_batch); + /* + * Pairs with the memory barrier in sbq_wake_up() to ensure that + * the batch size is updated before the wait counts. + */ + smp_mb__before_atomic(); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) + atomic_set(&sbq->ws[i].wait_cnt, 1); + } + sbitmap_resize(&sbq->sb, depth); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_resize); + +int __sbitmap_queue_get(struct sbitmap_queue *sbq) +{ + unsigned int hint, depth; + int nr; + + hint = this_cpu_read(*sbq->alloc_hint); + depth = READ_ONCE(sbq->sb.depth); + if (unlikely(hint >= depth)) { + hint = depth ? prandom_u32() % depth : 0; + this_cpu_write(*sbq->alloc_hint, hint); + } + nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin); + + if (nr == -1) { + /* If the map is full, a hint won't do us much good. */ + this_cpu_write(*sbq->alloc_hint, 0); + } else if (nr == hint || unlikely(sbq->round_robin)) { + /* Only update the hint if we used it. */ + hint = nr + 1; + if (hint >= depth - 1) + hint = 0; + this_cpu_write(*sbq->alloc_hint, hint); + } + + return nr; +} +EXPORT_SYMBOL_GPL(__sbitmap_queue_get); + +static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) +{ + int i, wake_index; + + wake_index = atomic_read(&sbq->wake_index); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[wake_index]; + + if (waitqueue_active(&ws->wait)) { + int o = atomic_read(&sbq->wake_index); + + if (wake_index != o) + atomic_cmpxchg(&sbq->wake_index, o, wake_index); + return ws; + } + + wake_index = sbq_index_inc(wake_index); + } + + return NULL; +} + +static void sbq_wake_up(struct sbitmap_queue *sbq) +{ + struct sbq_wait_state *ws; + unsigned int wake_batch; + int wait_cnt; + + /* + * Pairs with the memory barrier in set_current_state() to ensure the + * proper ordering of clear_bit()/waitqueue_active() in the waker and + * test_and_set_bit()/prepare_to_wait()/finish_wait() in the waiter. See + * the comment on waitqueue_active(). This is __after_atomic because we + * just did clear_bit() in the caller. + */ + smp_mb__after_atomic(); + + ws = sbq_wake_ptr(sbq); + if (!ws) + return; + + wait_cnt = atomic_dec_return(&ws->wait_cnt); + if (wait_cnt <= 0) { + wake_batch = READ_ONCE(sbq->wake_batch); + /* + * Pairs with the memory barrier in sbitmap_queue_resize() to + * ensure that we see the batch size update before the wait + * count is reset. + */ + smp_mb__before_atomic(); + /* + * If there are concurrent callers to sbq_wake_up(), the last + * one to decrement the wait count below zero will bump it back + * up. If there is a concurrent resize, the count reset will + * either cause the cmpxchg to fail or overwrite after the + * cmpxchg. + */ + atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wait_cnt + wake_batch); + sbq_index_atomic_inc(&sbq->wake_index); + wake_up(&ws->wait); + } +} + +void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, + unsigned int cpu) +{ + sbitmap_clear_bit(&sbq->sb, nr); + sbq_wake_up(sbq); + if (likely(!sbq->round_robin && nr < sbq->sb.depth)) + *per_cpu_ptr(sbq->alloc_hint, cpu) = nr; +} +EXPORT_SYMBOL_GPL(sbitmap_queue_clear); + +void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) +{ + int i, wake_index; + + /* + * Pairs with the memory barrier in set_current_state() like in + * sbq_wake_up(). + */ + smp_mb(); + wake_index = atomic_read(&sbq->wake_index); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[wake_index]; + + if (waitqueue_active(&ws->wait)) + wake_up(&ws->wait); + + wake_index = sbq_index_inc(wake_index); + } +} +EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all); + +void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m) +{ + bool first; + int i; + + sbitmap_show(&sbq->sb, m); + + seq_puts(m, "alloc_hint={"); + first = true; + for_each_possible_cpu(i) { + if (!first) + seq_puts(m, ", "); + first = false; + seq_printf(m, "%u", *per_cpu_ptr(sbq->alloc_hint, i)); + } + seq_puts(m, "}\n"); + + seq_printf(m, "wake_batch=%u\n", sbq->wake_batch); + seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index)); + + seq_puts(m, "ws={\n"); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[i]; + + seq_printf(m, "\t{.wait_cnt=%d, .wait=%s},\n", + atomic_read(&ws->wait_cnt), + waitqueue_active(&ws->wait) ? "active" : "inactive"); + } + seq_puts(m, "}\n"); + + seq_printf(m, "round_robin=%d\n", sbq->round_robin); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_show); diff --git a/lib/show_mem.c b/lib/show_mem.c index 1feed6a2b12a..0beaa1d899aa 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -9,13 +9,13 @@ #include <linux/quicklist.h> #include <linux/cma.h> -void show_mem(unsigned int filter) +void show_mem(unsigned int filter, nodemask_t *nodemask) { pg_data_t *pgdat; unsigned long total = 0, reserved = 0, highmem = 0; printk("Mem-Info:\n"); - show_free_areas(filter); + show_free_areas(filter, nodemask); for_each_online_pgdat(pgdat) { unsigned long flags; diff --git a/lib/siphash.c b/lib/siphash.c new file mode 100644 index 000000000000..3ae58b4edad6 --- /dev/null +++ b/lib/siphash.c @@ -0,0 +1,551 @@ +/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * + * This file is provided under a dual BSD/GPLv2 license. + * + * SipHash: a fast short-input PRF + * https://131002.net/siphash/ + * + * This implementation is specifically for SipHash2-4 for a secure PRF + * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for + * hashtables. + */ + +#include <linux/siphash.h> +#include <asm/unaligned.h> + +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 +#include <linux/dcache.h> +#include <asm/word-at-a-time.h> +#endif + +#define SIPROUND \ + do { \ + v0 += v1; v1 = rol64(v1, 13); v1 ^= v0; v0 = rol64(v0, 32); \ + v2 += v3; v3 = rol64(v3, 16); v3 ^= v2; \ + v0 += v3; v3 = rol64(v3, 21); v3 ^= v0; \ + v2 += v1; v1 = rol64(v1, 17); v1 ^= v2; v2 = rol64(v2, 32); \ + } while (0) + +#define PREAMBLE(len) \ + u64 v0 = 0x736f6d6570736575ULL; \ + u64 v1 = 0x646f72616e646f6dULL; \ + u64 v2 = 0x6c7967656e657261ULL; \ + u64 v3 = 0x7465646279746573ULL; \ + u64 b = ((u64)(len)) << 56; \ + v3 ^= key->key[1]; \ + v2 ^= key->key[0]; \ + v1 ^= key->key[1]; \ + v0 ^= key->key[0]; + +#define POSTAMBLE \ + v3 ^= b; \ + SIPROUND; \ + SIPROUND; \ + v0 ^= b; \ + v2 ^= 0xff; \ + SIPROUND; \ + SIPROUND; \ + SIPROUND; \ + SIPROUND; \ + return (v0 ^ v1) ^ (v2 ^ v3); + +u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u64)); + const u8 left = len & (sizeof(u64) - 1); + u64 m; + PREAMBLE(len) + for (; data != end; data += sizeof(u64)) { + m = le64_to_cpup(data); + v3 ^= m; + SIPROUND; + SIPROUND; + v0 ^= m; + } +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 + if (left) + b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & + bytemask_from_count(left))); +#else + switch (left) { + case 7: b |= ((u64)end[6]) << 48; + case 6: b |= ((u64)end[5]) << 40; + case 5: b |= ((u64)end[4]) << 32; + case 4: b |= le32_to_cpup(data); break; + case 3: b |= ((u64)end[2]) << 16; + case 2: b |= le16_to_cpup(data); break; + case 1: b |= end[0]; + } +#endif + POSTAMBLE +} +EXPORT_SYMBOL(__siphash_aligned); + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u64)); + const u8 left = len & (sizeof(u64) - 1); + u64 m; + PREAMBLE(len) + for (; data != end; data += sizeof(u64)) { + m = get_unaligned_le64(data); + v3 ^= m; + SIPROUND; + SIPROUND; + v0 ^= m; + } +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 + if (left) + b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & + bytemask_from_count(left))); +#else + switch (left) { + case 7: b |= ((u64)end[6]) << 48; + case 6: b |= ((u64)end[5]) << 40; + case 5: b |= ((u64)end[4]) << 32; + case 4: b |= get_unaligned_le32(end); break; + case 3: b |= ((u64)end[2]) << 16; + case 2: b |= get_unaligned_le16(end); break; + case 1: b |= end[0]; + } +#endif + POSTAMBLE +} +EXPORT_SYMBOL(__siphash_unaligned); +#endif + +/** + * siphash_1u64 - compute 64-bit siphash PRF value of a u64 + * @first: first u64 + * @key: the siphash key + */ +u64 siphash_1u64(const u64 first, const siphash_key_t *key) +{ + PREAMBLE(8) + v3 ^= first; + SIPROUND; + SIPROUND; + v0 ^= first; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_1u64); + +/** + * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64 + * @first: first u64 + * @second: second u64 + * @key: the siphash key + */ +u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key) +{ + PREAMBLE(16) + v3 ^= first; + SIPROUND; + SIPROUND; + v0 ^= first; + v3 ^= second; + SIPROUND; + SIPROUND; + v0 ^= second; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_2u64); + +/** + * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64 + * @first: first u64 + * @second: second u64 + * @third: third u64 + * @key: the siphash key + */ +u64 siphash_3u64(const u64 first, const u64 second, const u64 third, + const siphash_key_t *key) +{ + PREAMBLE(24) + v3 ^= first; + SIPROUND; + SIPROUND; + v0 ^= first; + v3 ^= second; + SIPROUND; + SIPROUND; + v0 ^= second; + v3 ^= third; + SIPROUND; + SIPROUND; + v0 ^= third; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_3u64); + +/** + * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64 + * @first: first u64 + * @second: second u64 + * @third: third u64 + * @forth: forth u64 + * @key: the siphash key + */ +u64 siphash_4u64(const u64 first, const u64 second, const u64 third, + const u64 forth, const siphash_key_t *key) +{ + PREAMBLE(32) + v3 ^= first; + SIPROUND; + SIPROUND; + v0 ^= first; + v3 ^= second; + SIPROUND; + SIPROUND; + v0 ^= second; + v3 ^= third; + SIPROUND; + SIPROUND; + v0 ^= third; + v3 ^= forth; + SIPROUND; + SIPROUND; + v0 ^= forth; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_4u64); + +u64 siphash_1u32(const u32 first, const siphash_key_t *key) +{ + PREAMBLE(4) + b |= first; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_1u32); + +u64 siphash_3u32(const u32 first, const u32 second, const u32 third, + const siphash_key_t *key) +{ + u64 combined = (u64)second << 32 | first; + PREAMBLE(12) + v3 ^= combined; + SIPROUND; + SIPROUND; + v0 ^= combined; + b |= third; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_3u32); + +#if BITS_PER_LONG == 64 +/* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for + * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3. + */ + +#define HSIPROUND SIPROUND +#define HPREAMBLE(len) PREAMBLE(len) +#define HPOSTAMBLE \ + v3 ^= b; \ + HSIPROUND; \ + v0 ^= b; \ + v2 ^= 0xff; \ + HSIPROUND; \ + HSIPROUND; \ + HSIPROUND; \ + return (v0 ^ v1) ^ (v2 ^ v3); + +u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u64)); + const u8 left = len & (sizeof(u64) - 1); + u64 m; + HPREAMBLE(len) + for (; data != end; data += sizeof(u64)) { + m = le64_to_cpup(data); + v3 ^= m; + HSIPROUND; + v0 ^= m; + } +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 + if (left) + b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & + bytemask_from_count(left))); +#else + switch (left) { + case 7: b |= ((u64)end[6]) << 48; + case 6: b |= ((u64)end[5]) << 40; + case 5: b |= ((u64)end[4]) << 32; + case 4: b |= le32_to_cpup(data); break; + case 3: b |= ((u64)end[2]) << 16; + case 2: b |= le16_to_cpup(data); break; + case 1: b |= end[0]; + } +#endif + HPOSTAMBLE +} +EXPORT_SYMBOL(__hsiphash_aligned); + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +u32 __hsiphash_unaligned(const void *data, size_t len, + const hsiphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u64)); + const u8 left = len & (sizeof(u64) - 1); + u64 m; + HPREAMBLE(len) + for (; data != end; data += sizeof(u64)) { + m = get_unaligned_le64(data); + v3 ^= m; + HSIPROUND; + v0 ^= m; + } +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 + if (left) + b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & + bytemask_from_count(left))); +#else + switch (left) { + case 7: b |= ((u64)end[6]) << 48; + case 6: b |= ((u64)end[5]) << 40; + case 5: b |= ((u64)end[4]) << 32; + case 4: b |= get_unaligned_le32(end); break; + case 3: b |= ((u64)end[2]) << 16; + case 2: b |= get_unaligned_le16(end); break; + case 1: b |= end[0]; + } +#endif + HPOSTAMBLE +} +EXPORT_SYMBOL(__hsiphash_unaligned); +#endif + +/** + * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 + * @first: first u32 + * @key: the hsiphash key + */ +u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) +{ + HPREAMBLE(4) + b |= first; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_1u32); + +/** + * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 + * @first: first u32 + * @second: second u32 + * @key: the hsiphash key + */ +u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) +{ + u64 combined = (u64)second << 32 | first; + HPREAMBLE(8) + v3 ^= combined; + HSIPROUND; + v0 ^= combined; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_2u32); + +/** + * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 + * @first: first u32 + * @second: second u32 + * @third: third u32 + * @key: the hsiphash key + */ +u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, + const hsiphash_key_t *key) +{ + u64 combined = (u64)second << 32 | first; + HPREAMBLE(12) + v3 ^= combined; + HSIPROUND; + v0 ^= combined; + b |= third; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_3u32); + +/** + * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 + * @first: first u32 + * @second: second u32 + * @third: third u32 + * @forth: forth u32 + * @key: the hsiphash key + */ +u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, + const u32 forth, const hsiphash_key_t *key) +{ + u64 combined = (u64)second << 32 | first; + HPREAMBLE(16) + v3 ^= combined; + HSIPROUND; + v0 ^= combined; + combined = (u64)forth << 32 | third; + v3 ^= combined; + HSIPROUND; + v0 ^= combined; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_4u32); +#else +#define HSIPROUND \ + do { \ + v0 += v1; v1 = rol32(v1, 5); v1 ^= v0; v0 = rol32(v0, 16); \ + v2 += v3; v3 = rol32(v3, 8); v3 ^= v2; \ + v0 += v3; v3 = rol32(v3, 7); v3 ^= v0; \ + v2 += v1; v1 = rol32(v1, 13); v1 ^= v2; v2 = rol32(v2, 16); \ + } while (0) + +#define HPREAMBLE(len) \ + u32 v0 = 0; \ + u32 v1 = 0; \ + u32 v2 = 0x6c796765U; \ + u32 v3 = 0x74656462U; \ + u32 b = ((u32)(len)) << 24; \ + v3 ^= key->key[1]; \ + v2 ^= key->key[0]; \ + v1 ^= key->key[1]; \ + v0 ^= key->key[0]; + +#define HPOSTAMBLE \ + v3 ^= b; \ + HSIPROUND; \ + v0 ^= b; \ + v2 ^= 0xff; \ + HSIPROUND; \ + HSIPROUND; \ + HSIPROUND; \ + return v1 ^ v3; + +u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u32)); + const u8 left = len & (sizeof(u32) - 1); + u32 m; + HPREAMBLE(len) + for (; data != end; data += sizeof(u32)) { + m = le32_to_cpup(data); + v3 ^= m; + HSIPROUND; + v0 ^= m; + } + switch (left) { + case 3: b |= ((u32)end[2]) << 16; + case 2: b |= le16_to_cpup(data); break; + case 1: b |= end[0]; + } + HPOSTAMBLE +} +EXPORT_SYMBOL(__hsiphash_aligned); + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +u32 __hsiphash_unaligned(const void *data, size_t len, + const hsiphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u32)); + const u8 left = len & (sizeof(u32) - 1); + u32 m; + HPREAMBLE(len) + for (; data != end; data += sizeof(u32)) { + m = get_unaligned_le32(data); + v3 ^= m; + HSIPROUND; + v0 ^= m; + } + switch (left) { + case 3: b |= ((u32)end[2]) << 16; + case 2: b |= get_unaligned_le16(end); break; + case 1: b |= end[0]; + } + HPOSTAMBLE +} +EXPORT_SYMBOL(__hsiphash_unaligned); +#endif + +/** + * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 + * @first: first u32 + * @key: the hsiphash key + */ +u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) +{ + HPREAMBLE(4) + v3 ^= first; + HSIPROUND; + v0 ^= first; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_1u32); + +/** + * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 + * @first: first u32 + * @second: second u32 + * @key: the hsiphash key + */ +u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) +{ + HPREAMBLE(8) + v3 ^= first; + HSIPROUND; + v0 ^= first; + v3 ^= second; + HSIPROUND; + v0 ^= second; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_2u32); + +/** + * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 + * @first: first u32 + * @second: second u32 + * @third: third u32 + * @key: the hsiphash key + */ +u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, + const hsiphash_key_t *key) +{ + HPREAMBLE(12) + v3 ^= first; + HSIPROUND; + v0 ^= first; + v3 ^= second; + HSIPROUND; + v0 ^= second; + v3 ^= third; + HSIPROUND; + v0 ^= third; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_3u32); + +/** + * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 + * @first: first u32 + * @second: second u32 + * @third: third u32 + * @forth: forth u32 + * @key: the hsiphash key + */ +u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, + const u32 forth, const hsiphash_key_t *key) +{ + HPREAMBLE(16) + v3 ^= first; + HSIPROUND; + v0 ^= first; + v3 ^= second; + HSIPROUND; + v0 ^= second; + v3 ^= third; + HSIPROUND; + v0 ^= third; + v3 ^= forth; + HSIPROUND; + v0 ^= forth; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_4u32); +#endif diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 60f77f1d470a..f87d138e9672 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -50,7 +50,7 @@ STACK_ALLOC_ALIGN) #define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \ STACK_ALLOC_NULL_PROTECTION_BITS - STACK_ALLOC_OFFSET_BITS) -#define STACK_ALLOC_SLABS_CAP 1024 +#define STACK_ALLOC_SLABS_CAP 8192 #define STACK_ALLOC_MAX_SLABS \ (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \ (1LL << (STACK_ALLOC_INDEX_BITS)) : STACK_ALLOC_SLABS_CAP) @@ -192,6 +192,7 @@ void depot_fetch_stack(depot_stack_handle_t handle, struct stack_trace *trace) trace->entries = stack->entries; trace->skip = 0; } +EXPORT_SYMBOL_GPL(depot_fetch_stack); /** * depot_save_stack - save stack in a stack depot. @@ -283,3 +284,4 @@ exit: fast_exit: return retval; } +EXPORT_SYMBOL_GPL(depot_save_stack); diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 9c5fe8110413..7e35fc450c5b 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -1,6 +1,7 @@ #include <linux/compiler.h> #include <linux/export.h> #include <linux/kasan-checks.h> +#include <linux/thread_info.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/errno.h> @@ -111,6 +112,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) long retval; kasan_check_write(dst, count); + check_object_size(dst, count, false); user_access_begin(); retval = do_strncpy_from_user(dst, src, count, max); user_access_end(); diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 22e13a0e19d7..a8d74a733a38 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -53,7 +53,7 @@ */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) -int swiotlb_force; +enum swiotlb_force swiotlb_force; /* * Used to do a quick range check in swiotlb_tbl_unmap_single and @@ -83,6 +83,12 @@ static unsigned int *io_tlb_list; static unsigned int io_tlb_index; /* + * Max segment that we can provide which (if pages are contingous) will + * not be bounced (unless SWIOTLB_FORCE is set). + */ +unsigned int max_segment; + +/* * We need to save away the original address corresponding to a mapped entry * for the sync operations. */ @@ -106,8 +112,12 @@ setup_io_tlb_npages(char *str) } if (*str == ',') ++str; - if (!strcmp(str, "force")) - swiotlb_force = 1; + if (!strcmp(str, "force")) { + swiotlb_force = SWIOTLB_FORCE; + } else if (!strcmp(str, "noforce")) { + swiotlb_force = SWIOTLB_NO_FORCE; + io_tlb_nslabs = 1; + } return 0; } @@ -120,6 +130,20 @@ unsigned long swiotlb_nr_tbl(void) } EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); +unsigned int swiotlb_max_segment(void) +{ + return max_segment; +} +EXPORT_SYMBOL_GPL(swiotlb_max_segment); + +void swiotlb_set_max_segment(unsigned int val) +{ + if (swiotlb_force == SWIOTLB_FORCE) + max_segment = 1; + else + max_segment = rounddown(val, PAGE_SIZE); +} + /* default to 64MB */ #define IO_TLB_DEFAULT_SIZE (64UL<<20) unsigned long swiotlb_size_or_default(void) @@ -201,6 +225,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) if (verbose) swiotlb_print_info(); + swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); return 0; } @@ -279,6 +304,7 @@ swiotlb_late_init_with_default_size(size_t default_size) rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); if (rc) free_pages((unsigned long)vstart, order); + return rc; } @@ -333,6 +359,8 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) late_alloc = 1; + swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); + return 0; cleanup4: @@ -347,6 +375,7 @@ cleanup2: io_tlb_end = 0; io_tlb_start = 0; io_tlb_nslabs = 0; + max_segment = 0; return -ENOMEM; } @@ -375,6 +404,7 @@ void __init swiotlb_free(void) PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); } io_tlb_nslabs = 0; + max_segment = 0; } int is_swiotlb_buffer(phys_addr_t paddr) @@ -425,7 +455,8 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, phys_addr_t orig_addr, size_t size, - enum dma_data_direction dir) + enum dma_data_direction dir, + unsigned long attrs) { unsigned long flags; phys_addr_t tlb_addr; @@ -452,11 +483,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); /* - * For mappings greater than a page, we limit the stride (and - * hence alignment) to a page size. + * For mappings greater than or equal to a page, we limit the stride + * (and hence alignment) to a page size. */ nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - if (size > PAGE_SIZE) + if (size >= PAGE_SIZE) stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); else stride = 1; @@ -526,7 +557,8 @@ found: */ for (i = 0; i < nslots; i++) io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); - if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); return tlb_addr; @@ -539,18 +571,27 @@ EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single); static phys_addr_t map_single(struct device *hwdev, phys_addr_t phys, size_t size, - enum dma_data_direction dir) + enum dma_data_direction dir, unsigned long attrs) { - dma_addr_t start_dma_addr = phys_to_dma(hwdev, io_tlb_start); + dma_addr_t start_dma_addr; - return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, dir); + if (swiotlb_force == SWIOTLB_NO_FORCE) { + dev_warn_ratelimited(hwdev, "Cannot do DMA to address %pa\n", + &phys); + return SWIOTLB_MAP_ERROR; + } + + start_dma_addr = phys_to_dma(hwdev, io_tlb_start); + return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, + dir, attrs); } /* * dma_addr is the kernel virtual address of the bounce buffer to unmap. */ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir) + size_t size, enum dma_data_direction dir, + unsigned long attrs) { unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; @@ -561,6 +602,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * First, sync the memory before unmapping the entry */ if (orig_addr != INVALID_PHYS_ADDR && + !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); @@ -654,7 +696,8 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, * GFP_DMA memory; fall back on map_single(), which * will grab memory from the lowest available address range. */ - phys_addr_t paddr = map_single(hwdev, 0, size, DMA_FROM_DEVICE); + phys_addr_t paddr = map_single(hwdev, 0, size, + DMA_FROM_DEVICE, 0); if (paddr == SWIOTLB_MAP_ERROR) goto err_warn; @@ -667,9 +710,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, (unsigned long long)dma_mask, (unsigned long long)dev_addr); - /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ + /* + * DMA_TO_DEVICE to avoid memcpy in unmap_single. + * The DMA_ATTR_SKIP_CPU_SYNC is optional. + */ swiotlb_tbl_unmap_single(hwdev, paddr, - size, DMA_TO_DEVICE); + size, DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); goto err_warn; } } @@ -698,8 +745,12 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, if (!is_swiotlb_buffer(paddr)) free_pages((unsigned long)vaddr, get_order(size)); else - /* DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single */ - swiotlb_tbl_unmap_single(hwdev, paddr, size, DMA_TO_DEVICE); + /* + * DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single. + * DMA_ATTR_SKIP_CPU_SYNC is optional. + */ + swiotlb_tbl_unmap_single(hwdev, paddr, size, DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); } EXPORT_SYMBOL(swiotlb_free_coherent); @@ -707,6 +758,9 @@ static void swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, int do_panic) { + if (swiotlb_force == SWIOTLB_NO_FORCE) + return; + /* * Ran out of IOMMU space for this operation. This is very bad. * Unfortunately the drivers cannot handle this operation properly. @@ -714,8 +768,8 @@ swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, * When the mapping is small enough return a static buffer to limit * the damage, or panic when the transfer is too big. */ - printk(KERN_ERR "DMA: Out of SW-IOMMU space for %zu bytes at " - "device %s\n", size, dev ? dev_name(dev) : "?"); + dev_err_ratelimited(dev, "DMA: Out of SW-IOMMU space for %zu bytes\n", + size); if (size <= io_tlb_overflow || !do_panic) return; @@ -749,13 +803,13 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (dma_capable(dev, dev_addr, size) && !swiotlb_force) + if (dma_capable(dev, dev_addr, size) && swiotlb_force != SWIOTLB_FORCE) return dev_addr; trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); /* Oh well, have to allocate and map a bounce buffer. */ - map = map_single(dev, phys, size, dir); + map = map_single(dev, phys, size, dir, attrs); if (map == SWIOTLB_MAP_ERROR) { swiotlb_full(dev, size, dir, 1); return phys_to_dma(dev, io_tlb_overflow_buffer); @@ -764,12 +818,13 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, dev_addr = phys_to_dma(dev, map); /* Ensure that the address returned is DMA'ble */ - if (!dma_capable(dev, dev_addr, size)) { - swiotlb_tbl_unmap_single(dev, map, size, dir); - return phys_to_dma(dev, io_tlb_overflow_buffer); - } + if (dma_capable(dev, dev_addr, size)) + return dev_addr; - return dev_addr; + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); + + return phys_to_dma(dev, io_tlb_overflow_buffer); } EXPORT_SYMBOL_GPL(swiotlb_map_page); @@ -782,14 +837,15 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); * whatever the device wrote there. */ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) + size_t size, enum dma_data_direction dir, + unsigned long attrs) { phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(paddr)) { - swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); + swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); return; } @@ -809,7 +865,7 @@ void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - unmap_single(hwdev, dev_addr, size, dir); + unmap_single(hwdev, dev_addr, size, dir, attrs); } EXPORT_SYMBOL_GPL(swiotlb_unmap_page); @@ -888,14 +944,15 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, phys_addr_t paddr = sg_phys(sg); dma_addr_t dev_addr = phys_to_dma(hwdev, paddr); - if (swiotlb_force || + if (swiotlb_force == SWIOTLB_FORCE || !dma_capable(hwdev, dev_addr, sg->length)) { phys_addr_t map = map_single(hwdev, sg_phys(sg), - sg->length, dir); + sg->length, dir, attrs); if (map == SWIOTLB_MAP_ERROR) { /* Don't panic here, we expect map_sg users to do proper error handling. */ swiotlb_full(hwdev, sg->length, dir, 0); + attrs |= DMA_ATTR_SKIP_CPU_SYNC; swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, attrs); sg_dma_len(sgl) = 0; @@ -910,14 +967,6 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, } EXPORT_SYMBOL(swiotlb_map_sg_attrs); -int -swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ - return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, 0); -} -EXPORT_SYMBOL(swiotlb_map_sg); - /* * Unmap a set of streaming mode DMA translations. Again, cpu read rules * concerning calls here are the same as for swiotlb_unmap_page() above. @@ -933,19 +982,11 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) - unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir); - + unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, + attrs); } EXPORT_SYMBOL(swiotlb_unmap_sg_attrs); -void -swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ - return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, 0); -} -EXPORT_SYMBOL(swiotlb_unmap_sg); - /* * Make physical memory consistent for a set of streaming mode DMA translations * after a transfer. diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 93f45011a59d..0362da0b66c3 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -4831,7 +4831,7 @@ static struct bpf_test tests[] = { { }, INTERNAL, { 0x34 }, - { { 1, 0xbef } }, + { { ETH_HLEN, 0xbef } }, .fill_helper = bpf_fill_ld_abs_vlan_push_pop, }, /* @@ -5485,6 +5485,7 @@ static struct sk_buff *populate_skb(char *buf, int size) skb->hash = SKB_HASH; skb->queue_mapping = SKB_QUEUE_MAP; skb->vlan_tci = SKB_VLAN_TCI; + skb->vlan_proto = htons(ETH_P_IP); skb->dev = &dev; skb->dev->ifindex = SKB_DEV_IFINDEX; skb->dev->type = SKB_DEV_TYPE; diff --git a/lib/test_firmware.c b/lib/test_firmware.c index a3e8ec3fb1c5..09371b0a9baf 100644 --- a/lib/test_firmware.c +++ b/lib/test_firmware.c @@ -42,12 +42,6 @@ static const struct file_operations test_fw_fops = { .read = test_fw_misc_read, }; -static struct miscdevice test_fw_misc_device = { - .minor = MISC_DYNAMIC_MINOR, - .name = "test_firmware", - .fops = &test_fw_fops, -}; - static ssize_t trigger_request_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -132,39 +126,81 @@ out: } static DEVICE_ATTR_WO(trigger_async_request); -static int __init test_firmware_init(void) +static ssize_t trigger_custom_fallback_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { int rc; + char *name; - rc = misc_register(&test_fw_misc_device); + name = kstrndup(buf, count, GFP_KERNEL); + if (!name) + return -ENOSPC; + + pr_info("loading '%s' using custom fallback mechanism\n", name); + + mutex_lock(&test_fw_mutex); + release_firmware(test_firmware); + test_firmware = NULL; + rc = request_firmware_nowait(THIS_MODULE, FW_ACTION_NOHOTPLUG, name, + dev, GFP_KERNEL, NULL, + trigger_async_request_cb); if (rc) { - pr_err("could not register misc device: %d\n", rc); - return rc; + pr_info("async load of '%s' failed: %d\n", name, rc); + kfree(name); + goto out; } - rc = device_create_file(test_fw_misc_device.this_device, - &dev_attr_trigger_request); - if (rc) { - pr_err("could not create sysfs interface: %d\n", rc); - goto dereg; + /* Free 'name' ASAP, to test for race conditions */ + kfree(name); + + wait_for_completion(&async_fw_done); + + if (test_firmware) { + pr_info("loaded: %zu\n", test_firmware->size); + rc = count; + } else { + pr_err("failed to async load firmware\n"); + rc = -ENODEV; } - rc = device_create_file(test_fw_misc_device.this_device, - &dev_attr_trigger_async_request); +out: + mutex_unlock(&test_fw_mutex); + + return rc; +} +static DEVICE_ATTR_WO(trigger_custom_fallback); + +#define TEST_FW_DEV_ATTR(name) &dev_attr_##name.attr + +static struct attribute *test_dev_attrs[] = { + TEST_FW_DEV_ATTR(trigger_request), + TEST_FW_DEV_ATTR(trigger_async_request), + TEST_FW_DEV_ATTR(trigger_custom_fallback), + NULL, +}; + +ATTRIBUTE_GROUPS(test_dev); + +static struct miscdevice test_fw_misc_device = { + .minor = MISC_DYNAMIC_MINOR, + .name = "test_firmware", + .fops = &test_fw_fops, + .groups = test_dev_groups, +}; + +static int __init test_firmware_init(void) +{ + int rc; + + rc = misc_register(&test_fw_misc_device); if (rc) { - pr_err("could not create async sysfs interface: %d\n", rc); - goto remove_file; + pr_err("could not register misc device: %d\n", rc); + return rc; } pr_warn("interface ready\n"); return 0; - -remove_file: - device_remove_file(test_fw_misc_device.this_device, - &dev_attr_trigger_async_request); -dereg: - misc_deregister(&test_fw_misc_device); - return rc; } module_init(test_firmware_init); @@ -172,10 +208,6 @@ module_init(test_firmware_init); static void __exit test_firmware_exit(void) { release_firmware(test_firmware); - device_remove_file(test_fw_misc_device.this_device, - &dev_attr_trigger_async_request); - device_remove_file(test_fw_misc_device.this_device, - &dev_attr_trigger_request); misc_deregister(&test_fw_misc_device); pr_warn("removed interface\n"); } diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 5e51872b3fc1..fbdf87920093 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -20,6 +20,11 @@ #include <linux/uaccess.h> #include <linux/module.h> +/* + * Note: test functions are marked noinline so that their names appear in + * reports. + */ + static noinline void __init kmalloc_oob_right(void) { char *ptr; @@ -411,6 +416,29 @@ static noinline void __init copy_user_test(void) kfree(kmem); } +static noinline void __init use_after_scope_test(void) +{ + volatile char *volatile p; + + pr_info("use-after-scope on int\n"); + { + int local = 0; + + p = (char *)&local; + } + p[0] = 1; + p[3] = 1; + + pr_info("use-after-scope on array\n"); + { + char local[1024] = {0}; + + p = local; + } + p[0] = 1; + p[1023] = 1; +} + static int __init kmalloc_tests_init(void) { kmalloc_oob_right(); @@ -436,6 +464,7 @@ static int __init kmalloc_tests_init(void) kasan_global_oob(); ksize_unpoisons_memory(); copy_user_test(); + use_after_scope_test(); return -EAGAIN; } diff --git a/lib/test_parman.c b/lib/test_parman.c new file mode 100644 index 000000000000..fe9f3a785804 --- /dev/null +++ b/lib/test_parman.c @@ -0,0 +1,395 @@ +/* + * lib/test_parman.c - Test module for parman + * Copyright (c) 2017 Mellanox Technologies. All rights reserved. + * Copyright (c) 2017 Jiri Pirko <jiri@mellanox.com> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/bitops.h> +#include <linux/err.h> +#include <linux/random.h> +#include <linux/parman.h> + +#define TEST_PARMAN_PRIO_SHIFT 7 /* defines number of prios for testing */ +#define TEST_PARMAN_PRIO_COUNT BIT(TEST_PARMAN_PRIO_SHIFT) +#define TEST_PARMAN_PRIO_MASK (TEST_PARMAN_PRIO_COUNT - 1) + +#define TEST_PARMAN_ITEM_SHIFT 13 /* defines a total number + * of items for testing + */ +#define TEST_PARMAN_ITEM_COUNT BIT(TEST_PARMAN_ITEM_SHIFT) +#define TEST_PARMAN_ITEM_MASK (TEST_PARMAN_ITEM_COUNT - 1) + +#define TEST_PARMAN_BASE_SHIFT 8 +#define TEST_PARMAN_BASE_COUNT BIT(TEST_PARMAN_BASE_SHIFT) +#define TEST_PARMAN_RESIZE_STEP_SHIFT 7 +#define TEST_PARMAN_RESIZE_STEP_COUNT BIT(TEST_PARMAN_RESIZE_STEP_SHIFT) + +#define TEST_PARMAN_BULK_MAX_SHIFT (2 + TEST_PARMAN_RESIZE_STEP_SHIFT) +#define TEST_PARMAN_BULK_MAX_COUNT BIT(TEST_PARMAN_BULK_MAX_SHIFT) +#define TEST_PARMAN_BULK_MAX_MASK (TEST_PARMAN_BULK_MAX_COUNT - 1) + +#define TEST_PARMAN_RUN_BUDGET (TEST_PARMAN_ITEM_COUNT * 256) + +struct test_parman_prio { + struct parman_prio parman_prio; + unsigned long priority; +}; + +struct test_parman_item { + struct parman_item parman_item; + struct test_parman_prio *prio; + bool used; +}; + +struct test_parman { + struct parman *parman; + struct test_parman_item **prio_array; + unsigned long prio_array_limit; + struct test_parman_prio prios[TEST_PARMAN_PRIO_COUNT]; + struct test_parman_item items[TEST_PARMAN_ITEM_COUNT]; + struct rnd_state rnd; + unsigned long run_budget; + unsigned long bulk_budget; + bool bulk_noop; + unsigned int used_items; +}; + +#define ITEM_PTRS_SIZE(count) (sizeof(struct test_parman_item *) * (count)) + +static int test_parman_resize(void *priv, unsigned long new_count) +{ + struct test_parman *test_parman = priv; + struct test_parman_item **prio_array; + unsigned long old_count; + + prio_array = krealloc(test_parman->prio_array, + ITEM_PTRS_SIZE(new_count), GFP_KERNEL); + if (new_count == 0) + return 0; + if (!prio_array) + return -ENOMEM; + old_count = test_parman->prio_array_limit; + if (new_count > old_count) + memset(&prio_array[old_count], 0, + ITEM_PTRS_SIZE(new_count - old_count)); + test_parman->prio_array = prio_array; + test_parman->prio_array_limit = new_count; + return 0; +} + +static void test_parman_move(void *priv, unsigned long from_index, + unsigned long to_index, unsigned long count) +{ + struct test_parman *test_parman = priv; + struct test_parman_item **prio_array = test_parman->prio_array; + + memmove(&prio_array[to_index], &prio_array[from_index], + ITEM_PTRS_SIZE(count)); + memset(&prio_array[from_index], 0, ITEM_PTRS_SIZE(count)); +} + +static const struct parman_ops test_parman_lsort_ops = { + .base_count = TEST_PARMAN_BASE_COUNT, + .resize_step = TEST_PARMAN_RESIZE_STEP_COUNT, + .resize = test_parman_resize, + .move = test_parman_move, + .algo = PARMAN_ALGO_TYPE_LSORT, +}; + +static void test_parman_rnd_init(struct test_parman *test_parman) +{ + prandom_seed_state(&test_parman->rnd, 3141592653589793238ULL); +} + +static u32 test_parman_rnd_get(struct test_parman *test_parman) +{ + return prandom_u32_state(&test_parman->rnd); +} + +static unsigned long test_parman_priority_gen(struct test_parman *test_parman) +{ + unsigned long priority; + int i; + +again: + priority = test_parman_rnd_get(test_parman); + if (priority == 0) + goto again; + + for (i = 0; i < TEST_PARMAN_PRIO_COUNT; i++) { + struct test_parman_prio *prio = &test_parman->prios[i]; + + if (prio->priority == 0) + break; + if (prio->priority == priority) + goto again; + } + return priority; +} + +static void test_parman_prios_init(struct test_parman *test_parman) +{ + int i; + + for (i = 0; i < TEST_PARMAN_PRIO_COUNT; i++) { + struct test_parman_prio *prio = &test_parman->prios[i]; + + /* Assign random uniqueue priority to each prio structure */ + prio->priority = test_parman_priority_gen(test_parman); + parman_prio_init(test_parman->parman, &prio->parman_prio, + prio->priority); + } +} + +static void test_parman_prios_fini(struct test_parman *test_parman) +{ + int i; + + for (i = 0; i < TEST_PARMAN_PRIO_COUNT; i++) { + struct test_parman_prio *prio = &test_parman->prios[i]; + + parman_prio_fini(&prio->parman_prio); + } +} + +static void test_parman_items_init(struct test_parman *test_parman) +{ + int i; + + for (i = 0; i < TEST_PARMAN_ITEM_COUNT; i++) { + struct test_parman_item *item = &test_parman->items[i]; + unsigned int prio_index = test_parman_rnd_get(test_parman) & + TEST_PARMAN_PRIO_MASK; + + /* Assign random prio to each item structure */ + item->prio = &test_parman->prios[prio_index]; + } +} + +static void test_parman_items_fini(struct test_parman *test_parman) +{ + int i; + + for (i = 0; i < TEST_PARMAN_ITEM_COUNT; i++) { + struct test_parman_item *item = &test_parman->items[i]; + + if (!item->used) + continue; + parman_item_remove(test_parman->parman, + &item->prio->parman_prio, + &item->parman_item); + } +} + +static struct test_parman *test_parman_create(const struct parman_ops *ops) +{ + struct test_parman *test_parman; + int err; + + test_parman = kzalloc(sizeof(*test_parman), GFP_KERNEL); + if (!test_parman) + return ERR_PTR(-ENOMEM); + err = test_parman_resize(test_parman, TEST_PARMAN_BASE_COUNT); + if (err) + goto err_resize; + test_parman->parman = parman_create(ops, test_parman); + if (!test_parman->parman) { + err = -ENOMEM; + goto err_parman_create; + } + test_parman_rnd_init(test_parman); + test_parman_prios_init(test_parman); + test_parman_items_init(test_parman); + test_parman->run_budget = TEST_PARMAN_RUN_BUDGET; + return test_parman; + +err_parman_create: + test_parman_resize(test_parman, 0); +err_resize: + kfree(test_parman); + return ERR_PTR(err); +} + +static void test_parman_destroy(struct test_parman *test_parman) +{ + test_parman_items_fini(test_parman); + test_parman_prios_fini(test_parman); + parman_destroy(test_parman->parman); + test_parman_resize(test_parman, 0); + kfree(test_parman); +} + +static bool test_parman_run_check_budgets(struct test_parman *test_parman) +{ + if (test_parman->run_budget-- == 0) + return false; + if (test_parman->bulk_budget-- != 0) + return true; + + test_parman->bulk_budget = test_parman_rnd_get(test_parman) & + TEST_PARMAN_BULK_MAX_MASK; + test_parman->bulk_noop = test_parman_rnd_get(test_parman) & 1; + return true; +} + +static int test_parman_run(struct test_parman *test_parman) +{ + unsigned int i = test_parman_rnd_get(test_parman); + int err; + + while (test_parman_run_check_budgets(test_parman)) { + unsigned int item_index = i++ & TEST_PARMAN_ITEM_MASK; + struct test_parman_item *item = &test_parman->items[item_index]; + + if (test_parman->bulk_noop) + continue; + + if (!item->used) { + err = parman_item_add(test_parman->parman, + &item->prio->parman_prio, + &item->parman_item); + if (err) + return err; + test_parman->prio_array[item->parman_item.index] = item; + test_parman->used_items++; + } else { + test_parman->prio_array[item->parman_item.index] = NULL; + parman_item_remove(test_parman->parman, + &item->prio->parman_prio, + &item->parman_item); + test_parman->used_items--; + } + item->used = !item->used; + } + return 0; +} + +static int test_parman_check_array(struct test_parman *test_parman, + bool gaps_allowed) +{ + unsigned int last_unused_items = 0; + unsigned long last_priority = 0; + unsigned int used_items = 0; + int i; + + if (test_parman->prio_array_limit < TEST_PARMAN_BASE_COUNT) { + pr_err("Array limit is lower than the base count (%lu < %lu)\n", + test_parman->prio_array_limit, TEST_PARMAN_BASE_COUNT); + return -EINVAL; + } + + for (i = 0; i < test_parman->prio_array_limit; i++) { + struct test_parman_item *item = test_parman->prio_array[i]; + + if (!item) { + last_unused_items++; + continue; + } + if (last_unused_items && !gaps_allowed) { + pr_err("Gap found in array even though they are forbidden\n"); + return -EINVAL; + } + + last_unused_items = 0; + used_items++; + + if (item->prio->priority < last_priority) { + pr_err("Item belongs under higher priority then the last one (current: %lu, previous: %lu)\n", + item->prio->priority, last_priority); + return -EINVAL; + } + last_priority = item->prio->priority; + + if (item->parman_item.index != i) { + pr_err("Item has different index in compare to where it actualy is (%lu != %d)\n", + item->parman_item.index, i); + return -EINVAL; + } + } + + if (used_items != test_parman->used_items) { + pr_err("Number of used items in array does not match (%u != %u)\n", + used_items, test_parman->used_items); + return -EINVAL; + } + + if (last_unused_items >= TEST_PARMAN_RESIZE_STEP_COUNT) { + pr_err("Number of unused item at the end of array is bigger than resize step (%u >= %lu)\n", + last_unused_items, TEST_PARMAN_RESIZE_STEP_COUNT); + return -EINVAL; + } + + pr_info("Priority array check successful\n"); + + return 0; +} + +static int test_parman_lsort(void) +{ + struct test_parman *test_parman; + int err; + + test_parman = test_parman_create(&test_parman_lsort_ops); + if (IS_ERR(test_parman)) + return PTR_ERR(test_parman); + + err = test_parman_run(test_parman); + if (err) + goto out; + + err = test_parman_check_array(test_parman, false); + if (err) + goto out; +out: + test_parman_destroy(test_parman); + return err; +} + +static int __init test_parman_init(void) +{ + return test_parman_lsort(); +} + +static void __exit test_parman_exit(void) +{ +} + +module_init(test_parman_init); +module_exit(test_parman_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>"); +MODULE_DESCRIPTION("Test module for parman"); diff --git a/lib/test_siphash.c b/lib/test_siphash.c new file mode 100644 index 000000000000..a6d854d933bf --- /dev/null +++ b/lib/test_siphash.c @@ -0,0 +1,223 @@ +/* Test cases for siphash.c + * + * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * + * This file is provided under a dual BSD/GPLv2 license. + * + * SipHash: a fast short-input PRF + * https://131002.net/siphash/ + * + * This implementation is specifically for SipHash2-4 for a secure PRF + * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for + * hashtables. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/siphash.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/module.h> + +/* Test vectors taken from reference source available at: + * https://github.com/veorq/SipHash + */ + +static const siphash_key_t test_key_siphash = + {{ 0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL }}; + +static const u64 test_vectors_siphash[64] = { + 0x726fdb47dd0e0e31ULL, 0x74f839c593dc67fdULL, 0x0d6c8009d9a94f5aULL, + 0x85676696d7fb7e2dULL, 0xcf2794e0277187b7ULL, 0x18765564cd99a68dULL, + 0xcbc9466e58fee3ceULL, 0xab0200f58b01d137ULL, 0x93f5f5799a932462ULL, + 0x9e0082df0ba9e4b0ULL, 0x7a5dbbc594ddb9f3ULL, 0xf4b32f46226bada7ULL, + 0x751e8fbc860ee5fbULL, 0x14ea5627c0843d90ULL, 0xf723ca908e7af2eeULL, + 0xa129ca6149be45e5ULL, 0x3f2acc7f57c29bdbULL, 0x699ae9f52cbe4794ULL, + 0x4bc1b3f0968dd39cULL, 0xbb6dc91da77961bdULL, 0xbed65cf21aa2ee98ULL, + 0xd0f2cbb02e3b67c7ULL, 0x93536795e3a33e88ULL, 0xa80c038ccd5ccec8ULL, + 0xb8ad50c6f649af94ULL, 0xbce192de8a85b8eaULL, 0x17d835b85bbb15f3ULL, + 0x2f2e6163076bcfadULL, 0xde4daaaca71dc9a5ULL, 0xa6a2506687956571ULL, + 0xad87a3535c49ef28ULL, 0x32d892fad841c342ULL, 0x7127512f72f27cceULL, + 0xa7f32346f95978e3ULL, 0x12e0b01abb051238ULL, 0x15e034d40fa197aeULL, + 0x314dffbe0815a3b4ULL, 0x027990f029623981ULL, 0xcadcd4e59ef40c4dULL, + 0x9abfd8766a33735cULL, 0x0e3ea96b5304a7d0ULL, 0xad0c42d6fc585992ULL, + 0x187306c89bc215a9ULL, 0xd4a60abcf3792b95ULL, 0xf935451de4f21df2ULL, + 0xa9538f0419755787ULL, 0xdb9acddff56ca510ULL, 0xd06c98cd5c0975ebULL, + 0xe612a3cb9ecba951ULL, 0xc766e62cfcadaf96ULL, 0xee64435a9752fe72ULL, + 0xa192d576b245165aULL, 0x0a8787bf8ecb74b2ULL, 0x81b3e73d20b49b6fULL, + 0x7fa8220ba3b2eceaULL, 0x245731c13ca42499ULL, 0xb78dbfaf3a8d83bdULL, + 0xea1ad565322a1a0bULL, 0x60e61c23a3795013ULL, 0x6606d7e446282b93ULL, + 0x6ca4ecb15c5f91e1ULL, 0x9f626da15c9625f3ULL, 0xe51b38608ef25f57ULL, + 0x958a324ceb064572ULL +}; + +#if BITS_PER_LONG == 64 +static const hsiphash_key_t test_key_hsiphash = + {{ 0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL }}; + +static const u32 test_vectors_hsiphash[64] = { + 0x050fc4dcU, 0x7d57ca93U, 0x4dc7d44dU, + 0xe7ddf7fbU, 0x88d38328U, 0x49533b67U, + 0xc59f22a7U, 0x9bb11140U, 0x8d299a8eU, + 0x6c063de4U, 0x92ff097fU, 0xf94dc352U, + 0x57b4d9a2U, 0x1229ffa7U, 0xc0f95d34U, + 0x2a519956U, 0x7d908b66U, 0x63dbd80cU, + 0xb473e63eU, 0x8d297d1cU, 0xa6cce040U, + 0x2b45f844U, 0xa320872eU, 0xdae6c123U, + 0x67349c8cU, 0x705b0979U, 0xca9913a5U, + 0x4ade3b35U, 0xef6cd00dU, 0x4ab1e1f4U, + 0x43c5e663U, 0x8c21d1bcU, 0x16a7b60dU, + 0x7a8ff9bfU, 0x1f2a753eU, 0xbf186b91U, + 0xada26206U, 0xa3c33057U, 0xae3a36a1U, + 0x7b108392U, 0x99e41531U, 0x3f1ad944U, + 0xc8138825U, 0xc28949a6U, 0xfaf8876bU, + 0x9f042196U, 0x68b1d623U, 0x8b5114fdU, + 0xdf074c46U, 0x12cc86b3U, 0x0a52098fU, + 0x9d292f9aU, 0xa2f41f12U, 0x43a71ed0U, + 0x73f0bce6U, 0x70a7e980U, 0x243c6d75U, + 0xfdb71513U, 0xa67d8a08U, 0xb7e8f148U, + 0xf7a644eeU, 0x0f1837f2U, 0x4b6694e0U, + 0xb7bbb3a8U +}; +#else +static const hsiphash_key_t test_key_hsiphash = + {{ 0x03020100U, 0x07060504U }}; + +static const u32 test_vectors_hsiphash[64] = { + 0x5814c896U, 0xe7e864caU, 0xbc4b0e30U, + 0x01539939U, 0x7e059ea6U, 0x88e3d89bU, + 0xa0080b65U, 0x9d38d9d6U, 0x577999b1U, + 0xc839caedU, 0xe4fa32cfU, 0x959246eeU, + 0x6b28096cU, 0x66dd9cd6U, 0x16658a7cU, + 0xd0257b04U, 0x8b31d501U, 0x2b1cd04bU, + 0x06712339U, 0x522aca67U, 0x911bb605U, + 0x90a65f0eU, 0xf826ef7bU, 0x62512debU, + 0x57150ad7U, 0x5d473507U, 0x1ec47442U, + 0xab64afd3U, 0x0a4100d0U, 0x6d2ce652U, + 0x2331b6a3U, 0x08d8791aU, 0xbc6dda8dU, + 0xe0f6c934U, 0xb0652033U, 0x9b9851ccU, + 0x7c46fb7fU, 0x732ba8cbU, 0xf142997aU, + 0xfcc9aa1bU, 0x05327eb2U, 0xe110131cU, + 0xf9e5e7c0U, 0xa7d708a6U, 0x11795ab1U, + 0x65671619U, 0x9f5fff91U, 0xd89c5267U, + 0x007783ebU, 0x95766243U, 0xab639262U, + 0x9c7e1390U, 0xc368dda6U, 0x38ddc455U, + 0xfa13d379U, 0x979ea4e8U, 0x53ecd77eU, + 0x2ee80657U, 0x33dbb66aU, 0xae3f0577U, + 0x88b4c4ccU, 0x3e7f480bU, 0x74c1ebf8U, + 0x87178304U +}; +#endif + +static int __init siphash_test_init(void) +{ + u8 in[64] __aligned(SIPHASH_ALIGNMENT); + u8 in_unaligned[65] __aligned(SIPHASH_ALIGNMENT); + u8 i; + int ret = 0; + + for (i = 0; i < 64; ++i) { + in[i] = i; + in_unaligned[i + 1] = i; + if (siphash(in, i, &test_key_siphash) != + test_vectors_siphash[i]) { + pr_info("siphash self-test aligned %u: FAIL\n", i + 1); + ret = -EINVAL; + } + if (siphash(in_unaligned + 1, i, &test_key_siphash) != + test_vectors_siphash[i]) { + pr_info("siphash self-test unaligned %u: FAIL\n", i + 1); + ret = -EINVAL; + } + if (hsiphash(in, i, &test_key_hsiphash) != + test_vectors_hsiphash[i]) { + pr_info("hsiphash self-test aligned %u: FAIL\n", i + 1); + ret = -EINVAL; + } + if (hsiphash(in_unaligned + 1, i, &test_key_hsiphash) != + test_vectors_hsiphash[i]) { + pr_info("hsiphash self-test unaligned %u: FAIL\n", i + 1); + ret = -EINVAL; + } + } + if (siphash_1u64(0x0706050403020100ULL, &test_key_siphash) != + test_vectors_siphash[8]) { + pr_info("siphash self-test 1u64: FAIL\n"); + ret = -EINVAL; + } + if (siphash_2u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, + &test_key_siphash) != test_vectors_siphash[16]) { + pr_info("siphash self-test 2u64: FAIL\n"); + ret = -EINVAL; + } + if (siphash_3u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, + 0x1716151413121110ULL, &test_key_siphash) != + test_vectors_siphash[24]) { + pr_info("siphash self-test 3u64: FAIL\n"); + ret = -EINVAL; + } + if (siphash_4u64(0x0706050403020100ULL, 0x0f0e0d0c0b0a0908ULL, + 0x1716151413121110ULL, 0x1f1e1d1c1b1a1918ULL, + &test_key_siphash) != test_vectors_siphash[32]) { + pr_info("siphash self-test 4u64: FAIL\n"); + ret = -EINVAL; + } + if (siphash_1u32(0x03020100U, &test_key_siphash) != + test_vectors_siphash[4]) { + pr_info("siphash self-test 1u32: FAIL\n"); + ret = -EINVAL; + } + if (siphash_2u32(0x03020100U, 0x07060504U, &test_key_siphash) != + test_vectors_siphash[8]) { + pr_info("siphash self-test 2u32: FAIL\n"); + ret = -EINVAL; + } + if (siphash_3u32(0x03020100U, 0x07060504U, + 0x0b0a0908U, &test_key_siphash) != + test_vectors_siphash[12]) { + pr_info("siphash self-test 3u32: FAIL\n"); + ret = -EINVAL; + } + if (siphash_4u32(0x03020100U, 0x07060504U, + 0x0b0a0908U, 0x0f0e0d0cU, &test_key_siphash) != + test_vectors_siphash[16]) { + pr_info("siphash self-test 4u32: FAIL\n"); + ret = -EINVAL; + } + if (hsiphash_1u32(0x03020100U, &test_key_hsiphash) != + test_vectors_hsiphash[4]) { + pr_info("hsiphash self-test 1u32: FAIL\n"); + ret = -EINVAL; + } + if (hsiphash_2u32(0x03020100U, 0x07060504U, &test_key_hsiphash) != + test_vectors_hsiphash[8]) { + pr_info("hsiphash self-test 2u32: FAIL\n"); + ret = -EINVAL; + } + if (hsiphash_3u32(0x03020100U, 0x07060504U, + 0x0b0a0908U, &test_key_hsiphash) != + test_vectors_hsiphash[12]) { + pr_info("hsiphash self-test 3u32: FAIL\n"); + ret = -EINVAL; + } + if (hsiphash_4u32(0x03020100U, 0x07060504U, + 0x0b0a0908U, 0x0f0e0d0cU, &test_key_hsiphash) != + test_vectors_hsiphash[16]) { + pr_info("hsiphash self-test 4u32: FAIL\n"); + ret = -EINVAL; + } + if (!ret) + pr_info("self-tests: pass\n"); + return ret; +} + +static void __exit siphash_test_exit(void) +{ +} + +module_init(siphash_test_init); +module_exit(siphash_test_exit); + +MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c index 0ecef3e4690e..6f335a3d4ae2 100644 --- a/lib/test_user_copy.c +++ b/lib/test_user_copy.c @@ -25,6 +25,23 @@ #include <linux/uaccess.h> #include <linux/vmalloc.h> +/* + * Several 32-bit architectures support 64-bit {get,put}_user() calls. + * As there doesn't appear to be anything that can safely determine + * their capability at compile-time, we just have to opt-out certain archs. + */ +#if BITS_PER_LONG == 64 || (!defined(CONFIG_AVR32) && \ + !defined(CONFIG_BLACKFIN) && \ + !defined(CONFIG_M32R) && \ + !defined(CONFIG_M68K) && \ + !defined(CONFIG_MICROBLAZE) && \ + !defined(CONFIG_MN10300) && \ + !defined(CONFIG_NIOS2) && \ + !defined(CONFIG_PPC32) && \ + !defined(CONFIG_SUPERH)) +# define TEST_U64 +#endif + #define test(condition, msg) \ ({ \ int cond = (condition); \ @@ -40,7 +57,12 @@ static int __init test_user_copy_init(void) char __user *usermem; char *bad_usermem; unsigned long user_addr; - unsigned long value = 0x5A; + u8 val_u8; + u16 val_u16; + u32 val_u32; +#ifdef TEST_U64 + u64 val_u64; +#endif kmem = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); if (!kmem) @@ -58,33 +80,100 @@ static int __init test_user_copy_init(void) usermem = (char __user *)user_addr; bad_usermem = (char *)user_addr; - /* Legitimate usage: none of these should fail. */ - ret |= test(copy_from_user(kmem, usermem, PAGE_SIZE), - "legitimate copy_from_user failed"); + /* + * Legitimate usage: none of these copies should fail. + */ + memset(kmem, 0x3a, PAGE_SIZE * 2); ret |= test(copy_to_user(usermem, kmem, PAGE_SIZE), "legitimate copy_to_user failed"); - ret |= test(get_user(value, (unsigned long __user *)usermem), - "legitimate get_user failed"); - ret |= test(put_user(value, (unsigned long __user *)usermem), - "legitimate put_user failed"); - - /* Invalid usage: none of these should succeed. */ + memset(kmem, 0x0, PAGE_SIZE); + ret |= test(copy_from_user(kmem, usermem, PAGE_SIZE), + "legitimate copy_from_user failed"); + ret |= test(memcmp(kmem, kmem + PAGE_SIZE, PAGE_SIZE), + "legitimate usercopy failed to copy data"); + +#define test_legit(size, check) \ + do { \ + val_##size = check; \ + ret |= test(put_user(val_##size, (size __user *)usermem), \ + "legitimate put_user (" #size ") failed"); \ + val_##size = 0; \ + ret |= test(get_user(val_##size, (size __user *)usermem), \ + "legitimate get_user (" #size ") failed"); \ + ret |= test(val_##size != check, \ + "legitimate get_user (" #size ") failed to do copy"); \ + if (val_##size != check) { \ + pr_info("0x%llx != 0x%llx\n", \ + (unsigned long long)val_##size, \ + (unsigned long long)check); \ + } \ + } while (0) + + test_legit(u8, 0x5a); + test_legit(u16, 0x5a5b); + test_legit(u32, 0x5a5b5c5d); +#ifdef TEST_U64 + test_legit(u64, 0x5a5b5c5d6a6b6c6d); +#endif +#undef test_legit + + /* + * Invalid usage: none of these copies should succeed. + */ + + /* Prepare kernel memory with check values. */ + memset(kmem, 0x5a, PAGE_SIZE); + memset(kmem + PAGE_SIZE, 0, PAGE_SIZE); + + /* Reject kernel-to-kernel copies through copy_from_user(). */ ret |= test(!copy_from_user(kmem, (char __user *)(kmem + PAGE_SIZE), PAGE_SIZE), "illegal all-kernel copy_from_user passed"); + + /* Destination half of buffer should have been zeroed. */ + ret |= test(memcmp(kmem + PAGE_SIZE, kmem, PAGE_SIZE), + "zeroing failure for illegal all-kernel copy_from_user"); + +#if 0 + /* + * When running with SMAP/PAN/etc, this will Oops the kernel + * due to the zeroing of userspace memory on failure. This needs + * to be tested in LKDTM instead, since this test module does not + * expect to explode. + */ ret |= test(!copy_from_user(bad_usermem, (char __user *)kmem, PAGE_SIZE), "illegal reversed copy_from_user passed"); +#endif ret |= test(!copy_to_user((char __user *)kmem, kmem + PAGE_SIZE, PAGE_SIZE), "illegal all-kernel copy_to_user passed"); ret |= test(!copy_to_user((char __user *)kmem, bad_usermem, PAGE_SIZE), "illegal reversed copy_to_user passed"); - ret |= test(!get_user(value, (unsigned long __user *)kmem), - "illegal get_user passed"); - ret |= test(!put_user(value, (unsigned long __user *)kmem), - "illegal put_user passed"); + +#define test_illegal(size, check) \ + do { \ + val_##size = (check); \ + ret |= test(!get_user(val_##size, (size __user *)kmem), \ + "illegal get_user (" #size ") passed"); \ + ret |= test(val_##size != (size)0, \ + "zeroing failure for illegal get_user (" #size ")"); \ + if (val_##size != (size)0) { \ + pr_info("0x%llx != 0\n", \ + (unsigned long long)val_##size); \ + } \ + ret |= test(!put_user(val_##size, (size __user *)kmem), \ + "illegal put_user (" #size ") passed"); \ + } while (0) + + test_illegal(u8, 0x5a); + test_illegal(u16, 0x5a5b); + test_illegal(u32, 0x5a5b5c5d); +#ifdef TEST_U64 + test_illegal(u64, 0x5a5b5c5d6a6b6c6d); +#endif +#undef test_illegal vm_munmap(user_addr, PAGE_SIZE * 2); kfree(kmem); diff --git a/lib/timerqueue.c b/lib/timerqueue.c index 782ae8ca2c06..4a720ed4fdaf 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -48,7 +48,7 @@ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) while (*p) { parent = *p; ptr = rb_entry(parent, struct timerqueue_node, node); - if (node->expires.tv64 < ptr->expires.tv64) + if (node->expires < ptr->expires) p = &(*p)->rb_left; else p = &(*p)->rb_right; @@ -56,7 +56,7 @@ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) rb_link_node(&node->node, parent, p); rb_insert_color(&node->node, &head->head); - if (!head->next || node->expires.tv64 < head->next->expires.tv64) { + if (!head->next || node->expires < head->next->expires) { head->next = node; return true; } @@ -80,8 +80,7 @@ bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) if (head->next == node) { struct rb_node *rbn = rb_next(&node->node); - head->next = rbn ? - rb_entry(rbn, struct timerqueue_node, node) : NULL; + head->next = rb_entry_safe(rbn, struct timerqueue_node, node); } rb_erase(&node->node, &head->head); RB_CLEAR_NODE(&node->node); diff --git a/lib/win_minmax.c b/lib/win_minmax.c new file mode 100644 index 000000000000..c8420d404926 --- /dev/null +++ b/lib/win_minmax.c @@ -0,0 +1,98 @@ +/** + * lib/minmax.c: windowed min/max tracker + * + * Kathleen Nichols' algorithm for tracking the minimum (or maximum) + * value of a data stream over some fixed time interval. (E.g., + * the minimum RTT over the past five minutes.) It uses constant + * space and constant time per update yet almost always delivers + * the same minimum as an implementation that has to keep all the + * data in the window. + * + * The algorithm keeps track of the best, 2nd best & 3rd best min + * values, maintaining an invariant that the measurement time of + * the n'th best >= n-1'th best. It also makes sure that the three + * values are widely separated in the time window since that bounds + * the worse case error when that data is monotonically increasing + * over the window. + * + * Upon getting a new min, we can forget everything earlier because + * it has no value - the new min is <= everything else in the window + * by definition and it's the most recent. So we restart fresh on + * every new min and overwrites 2nd & 3rd choices. The same property + * holds for 2nd & 3rd best. + */ +#include <linux/module.h> +#include <linux/win_minmax.h> + +/* As time advances, update the 1st, 2nd, and 3rd choices. */ +static u32 minmax_subwin_update(struct minmax *m, u32 win, + const struct minmax_sample *val) +{ + u32 dt = val->t - m->s[0].t; + + if (unlikely(dt > win)) { + /* + * Passed entire window without a new val so make 2nd + * choice the new val & 3rd choice the new 2nd choice. + * we may have to iterate this since our 2nd choice + * may also be outside the window (we checked on entry + * that the third choice was in the window). + */ + m->s[0] = m->s[1]; + m->s[1] = m->s[2]; + m->s[2] = *val; + if (unlikely(val->t - m->s[0].t > win)) { + m->s[0] = m->s[1]; + m->s[1] = m->s[2]; + m->s[2] = *val; + } + } else if (unlikely(m->s[1].t == m->s[0].t) && dt > win/4) { + /* + * We've passed a quarter of the window without a new val + * so take a 2nd choice from the 2nd quarter of the window. + */ + m->s[2] = m->s[1] = *val; + } else if (unlikely(m->s[2].t == m->s[1].t) && dt > win/2) { + /* + * We've passed half the window without finding a new val + * so take a 3rd choice from the last half of the window + */ + m->s[2] = *val; + } + return m->s[0].v; +} + +/* Check if new measurement updates the 1st, 2nd or 3rd choice max. */ +u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + if (unlikely(val.v >= m->s[0].v) || /* found new max? */ + unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ + return minmax_reset(m, t, meas); /* forget earlier samples */ + + if (unlikely(val.v >= m->s[1].v)) + m->s[2] = m->s[1] = val; + else if (unlikely(val.v >= m->s[2].v)) + m->s[2] = val; + + return minmax_subwin_update(m, win, &val); +} +EXPORT_SYMBOL(minmax_running_max); + +/* Check if new measurement updates the 1st, 2nd or 3rd choice min. */ +u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + if (unlikely(val.v <= m->s[0].v) || /* found new min? */ + unlikely(val.t - m->s[2].t > win)) /* nothing left in window? */ + return minmax_reset(m, t, meas); /* forget earlier samples */ + + if (unlikely(val.v <= m->s[1].v)) + m->s[2] = m->s[1] = val; + else if (unlikely(val.v <= m->s[2].v)) + m->s[2] = val; + + return minmax_subwin_update(m, win, &val); +} |