Diffstat (limited to 'lib')
-rw-r--r-- | lib/Kconfig.debug               |   72
-rw-r--r-- | lib/Kconfig.ubsan               |    3
-rw-r--r-- | lib/Makefile                    |    1
-rw-r--r-- | lib/cpu-notifier-error-inject.c |   84
-rw-r--r-- | lib/debugobjects.c              |    2
-rw-r--r-- | lib/extable.c                   |    2
-rw-r--r-- | lib/idr.c                       |   11
-rw-r--r-- | lib/ioremap.c                   |    1
-rw-r--r-- | lib/iov_iter.c                  |  204
-rw-r--r-- | lib/kobject_uevent.c            |    6
-rw-r--r-- | lib/kstrtox.c                   |    2
-rw-r--r-- | lib/list_debug.c                |   99
-rw-r--r-- | lib/lockref.c                   |    2
-rw-r--r-- | lib/nlattr.c                    |    2
-rw-r--r-- | lib/parser.c                    |   47
-rw-r--r-- | lib/percpu_counter.c            |   25
-rw-r--r-- | lib/radix-tree.c                | 1204
-rw-r--r-- | lib/raid6/avx2.c                |  232
-rw-r--r-- | lib/rbtree.c                    |   23
-rw-r--r-- | lib/swiotlb.c                   |  139
-rw-r--r-- | lib/timerqueue.c                |    4
21 files changed, 1403 insertions, 762 deletions
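
Among the changes below, lib/iov_iter.c gains copy_from_iter_full(), an all-or-nothing variant that returns true only if every byte was copied and leaves the iterator unadvanced on failure. A minimal, hypothetical usage sketch (struct demo_hdr and demo_read_header() are made-up names, not part of the patch):

#include <linux/types.h>
#include <linux/uio.h>
#include <linux/errno.h>

/* Hypothetical caller of the new helper added in lib/iov_iter.c below. */
struct demo_hdr {
	u32 magic;
	u32 len;
};

static int demo_read_header(struct demo_hdr *hdr, struct iov_iter *from)
{
	/* Either the whole header is copied, or we fail cleanly. */
	if (!copy_from_iter_full(hdr, sizeof(*hdr), from))
		return -EFAULT;
	return 0;
}
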
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a6c8db1d62f6..eb9e9a7870fa 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -13,7 +13,22 @@ config PRINTK_TIME be included, not that the timestamp is recorded. The behavior is also controlled by the kernel command line - parameter printk.time=1. See Documentation/kernel-parameters.txt + parameter printk.time=1. See Documentation/admin-guide/kernel-parameters.rst + +config CONSOLE_LOGLEVEL_DEFAULT + int "Default console loglevel (1-15)" + range 1 15 + default "7" + help + Default loglevel to determine what will be printed on the console. + + Setting a default here is equivalent to passing in loglevel=<x> in + the kernel bootargs. loglevel=<x> continues to override whatever + value is specified here as well. + + Note: This does not affect the log level of un-prefixed printk() + usage in the kernel. That is controlled by the MESSAGE_LOGLEVEL_DEFAULT + option. config MESSAGE_LOGLEVEL_DEFAULT int "Default message log level (1-7)" @@ -26,6 +41,10 @@ config MESSAGE_LOGLEVEL_DEFAULT that are auditing their logs closely may want to set it to a lower priority. + Note: This does not affect what message level gets printed on the console + by default. To change that, use loglevel=<x> in the kernel bootargs, + or pick a different CONSOLE_LOGLEVEL_DEFAULT configuration value. + config BOOT_PRINTK_DELAY bool "Delay each boot printk message by N milliseconds" depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY @@ -145,7 +164,7 @@ config DEBUG_INFO_REDUCED config DEBUG_INFO_SPLIT bool "Produce split debuginfo in .dwo files" - depends on DEBUG_INFO + depends on DEBUG_INFO && !FRV help Generate debug info into separate .dwo files. This significantly reduces the build directory size for builds with DEBUG_INFO, @@ -175,8 +194,8 @@ config GDB_SCRIPTS build directory. If you load vmlinux into gdb, the helper scripts will be automatically imported by gdb as well, and additional functions are available to analyze a Linux kernel - instance. See Documentation/gdb-kernel-debugging.txt for further - details. + instance. See Documentation/dev-tools/gdb-kernel-debugging.rst + for further details. config ENABLE_WARN_DEPRECATED bool "Enable __deprecated logic" @@ -523,7 +542,7 @@ config DEBUG_KMEMLEAK difference being that the orphan objects are not freed but only shown in /sys/kernel/debug/kmemleak. Enabling this feature will introduce an overhead to memory - allocations. See Documentation/kmemleak.txt for more + allocations. See Documentation/dev-tools/kmemleak.rst for more details. Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances @@ -720,7 +739,7 @@ config KCOV different machines and across reboots. If you need stable PC values, disable RANDOMIZE_BASE. - For more details, see Documentation/kcov.txt. + For more details, see Documentation/dev-tools/kcov.rst. config KCOV_INSTRUMENT_ALL bool "Instrument all code by default" @@ -1218,7 +1237,7 @@ config DEBUG_BUGVERBOSE config DEBUG_LIST bool "Debug linked list manipulation" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION help Enable this to turn on extended checks in the linked-list walking routines. @@ -1434,7 +1453,8 @@ config RCU_TRACE select TRACE_CLOCK help This option provides tracing in RCU which presents stats - in debugfs for debugging RCU implementation. + in debugfs for debugging RCU implementation. It also enables + additional tracepoints for ftrace-style event tracing. Say Y here if you want to enable RCU tracing Say N if you are unsure. 
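
A minimal sketch of how the two loglevel options above interact, assuming CONSOLE_LOGLEVEL_DEFAULT=7 (the default shown in the hunk) and MESSAGE_LOGLEVEL_DEFAULT=4 (the usual kernel default); the demo module and its messages are hypothetical:

#include <linux/kernel.h>
#include <linux/module.h>

static int __init loglevel_demo_init(void)
{
	/* Level 3 < console_loglevel (7): appears on the console. */
	printk(KERN_ERR "loglevel_demo: explicit KERN_ERR message\n");

	/* No prefix: assigned MESSAGE_LOGLEVEL_DEFAULT (4), still < 7. */
	printk("loglevel_demo: un-prefixed message\n");

	/* Level 7 is not < 7: kept in the ring buffer, console stays quiet. */
	printk(KERN_DEBUG "loglevel_demo: debug message\n");

	return 0;
}
module_init(loglevel_demo_init);
MODULE_LICENSE("GPL");
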
@@ -1518,30 +1538,6 @@ config NOTIFIER_ERROR_INJECTION Say N if unsure. -config CPU_NOTIFIER_ERROR_INJECT - tristate "CPU notifier error injection module" - depends on HOTPLUG_CPU && NOTIFIER_ERROR_INJECTION - help - This option provides a kernel module that can be used to test - the error handling of the cpu notifiers by injecting artificial - errors to CPU notifier chain callbacks. It is controlled through - debugfs interface under /sys/kernel/debug/notifier-error-inject/cpu - - If the notifier call chain should be failed with some events - notified, write the error code to "actions/<notifier event>/error". - - Example: Inject CPU offline error (-1 == -EPERM) - - # cd /sys/kernel/debug/notifier-error-inject/cpu - # echo -1 > actions/CPU_DOWN_PREPARE/error - # echo 0 > /sys/devices/system/cpu/cpu1/online - bash: echo: write error: Operation not permitted - - To compile this code as a module, choose M here: the module will - be called cpu-notifier-error-inject. - - If unsure, say N. - config PM_NOTIFIER_ERROR_INJECT tristate "PM notifier error injection module" depends on PM && NOTIFIER_ERROR_INJECTION @@ -1964,6 +1960,16 @@ config TEST_STATIC_KEYS If unsure, say N. +config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select DEBUG_LIST + help + Select this option if the kernel should BUG when it encounters + data corruption in kernel memory structures when they get checked + for validity. + + If unsure, say N. + source "samples/Kconfig" source "lib/Kconfig.kgdb" @@ -1975,7 +1981,7 @@ config ARCH_HAS_DEVMEM_IS_ALLOWED config STRICT_DEVMEM bool "Filter access to /dev/mem" - depends on MMU + depends on MMU && DEVMEM depends on ARCH_HAS_DEVMEM_IS_ALLOWED default y if TILE || PPC ---help--- diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index bc6e651df68c..a669c193b878 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -10,7 +10,8 @@ config UBSAN This option enables undefined behaviour sanity checker Compile-time instrumentation is used to detect various undefined behaviours in runtime. Various types of checks may be enabled - via boot parameter ubsan_handle (see: Documentation/ubsan.txt). + via boot parameter ubsan_handle + (see: Documentation/dev-tools/ubsan.rst). 
config UBSAN_SANITIZE_ALL bool "Enable instrumentation for the entire kernel" diff --git a/lib/Makefile b/lib/Makefile index 50144a3aeebd..bc4073a8cd08 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -128,7 +128,6 @@ obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o iommu-common.o obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o -obj-$(CONFIG_CPU_NOTIFIER_ERROR_INJECT) += cpu-notifier-error-inject.o obj-$(CONFIG_PM_NOTIFIER_ERROR_INJECT) += pm-notifier-error-inject.o obj-$(CONFIG_NETDEV_NOTIFIER_ERROR_INJECT) += netdev-notifier-error-inject.o obj-$(CONFIG_MEMORY_NOTIFIER_ERROR_INJECT) += memory-notifier-error-inject.o diff --git a/lib/cpu-notifier-error-inject.c b/lib/cpu-notifier-error-inject.c deleted file mode 100644 index 0e2c9a1e958a..000000000000 --- a/lib/cpu-notifier-error-inject.c +++ /dev/null @@ -1,84 +0,0 @@ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/cpu.h> - -#include "notifier-error-inject.h" - -static int priority; -module_param(priority, int, 0); -MODULE_PARM_DESC(priority, "specify cpu notifier priority"); - -#define UP_PREPARE 0 -#define UP_PREPARE_FROZEN 0 -#define DOWN_PREPARE 0 -#define DOWN_PREPARE_FROZEN 0 - -static struct notifier_err_inject cpu_notifier_err_inject = { - .actions = { - { NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE) }, - { NOTIFIER_ERR_INJECT_ACTION(UP_PREPARE_FROZEN) }, - { NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE) }, - { NOTIFIER_ERR_INJECT_ACTION(DOWN_PREPARE_FROZEN) }, - {} - } -}; - -static int notf_err_handle(struct notifier_err_inject_action *action) -{ - int ret; - - ret = action->error; - if (ret) - pr_info("Injecting error (%d) to %s\n", ret, action->name); - return ret; -} - -static int notf_err_inj_up_prepare(unsigned int cpu) -{ - if (!cpuhp_tasks_frozen) - return notf_err_handle(&cpu_notifier_err_inject.actions[0]); - else - return notf_err_handle(&cpu_notifier_err_inject.actions[1]); -} - -static int notf_err_inj_dead(unsigned int cpu) -{ - if (!cpuhp_tasks_frozen) - return notf_err_handle(&cpu_notifier_err_inject.actions[2]); - else - return notf_err_handle(&cpu_notifier_err_inject.actions[3]); -} - -static struct dentry *dir; - -static int err_inject_init(void) -{ - int err; - - dir = notifier_err_inject_init("cpu", notifier_err_inject_dir, - &cpu_notifier_err_inject, priority); - if (IS_ERR(dir)) - return PTR_ERR(dir); - - err = cpuhp_setup_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE, - "cpu-err-notif:prepare", - notf_err_inj_up_prepare, - notf_err_inj_dead); - if (err) - debugfs_remove_recursive(dir); - - return err; -} - -static void err_inject_exit(void) -{ - cpuhp_remove_state_nocalls(CPUHP_NOTF_ERR_INJ_PREPARE); - debugfs_remove_recursive(dir); -} - -module_init(err_inject_init); -module_exit(err_inject_exit); - -MODULE_DESCRIPTION("CPU notifier error injection module"); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Akinobu Mita <akinobu.mita@gmail.com>"); diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 056052dc8e91..04c1ef717fe0 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -199,7 +199,7 @@ static void free_object(struct debug_obj *obj) * initialized: */ if (obj_pool_free > ODEBUG_POOL_SIZE && obj_cache) - sched = keventd_up(); + sched = 1; hlist_add_head(&obj->node, &obj_pool); obj_pool_free++; obj_pool_used--; diff --git a/lib/extable.c b/lib/extable.c index 0be02ad561e9..62968daa66a9 100644 --- a/lib/extable.c +++ b/lib/extable.c @@ -12,7 +12,7 @@ #include <linux/module.h> #include 
<linux/init.h> #include <linux/sort.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #ifndef ARCH_HAS_RELATIVE_EXTABLE #define ex_to_insn(x) ((x)->insn) diff --git a/lib/idr.c b/lib/idr.c index 6098336df267..52d2979a05e8 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -927,6 +927,9 @@ EXPORT_SYMBOL(ida_pre_get); * and go back to the ida_pre_get() call. If the ida is full, it will * return %-ENOSPC. * + * Note that callers must ensure that concurrent access to @ida is not possible. + * See ida_simple_get() for a varaint which takes care of locking. + * * @p_id returns a value in the range @starting_id ... %0x7fffffff. */ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) @@ -1073,6 +1076,9 @@ EXPORT_SYMBOL(ida_destroy); * Allocates an id in the range start <= id < end, or returns -ENOSPC. * On memory allocation failure, returns -ENOMEM. * + * Compared to ida_get_new_above() this function does its own locking, and + * should be used unless there are special requirements. + * * Use ida_simple_remove() to get rid of an id. */ int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end, @@ -1119,6 +1125,11 @@ EXPORT_SYMBOL(ida_simple_get); * ida_simple_remove - remove an allocated id. * @ida: the (initialized) ida. * @id: the id returned by ida_simple_get. + * + * Use to release an id allocated with ida_simple_get(). + * + * Compared to ida_remove() this function does its own locking, and should be + * used unless there are special requirements. */ void ida_simple_remove(struct ida *ida, unsigned int id) { diff --git a/lib/ioremap.c b/lib/ioremap.c index 86c8911b0e3a..a3e14ce92a56 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -144,4 +144,3 @@ int ioremap_page_range(unsigned long addr, return err; } -EXPORT_SYMBOL_GPL(ioremap_page_range); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f2bd21b93dfc..e68604ae3ced 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1,4 +1,5 @@ #include <linux/export.h> +#include <linux/bvec.h> #include <linux/uio.h> #include <linux/pagemap.h> #include <linux/slab.h> @@ -72,19 +73,21 @@ } #define iterate_all_kinds(i, n, v, I, B, K) { \ - size_t skip = i->iov_offset; \ - if (unlikely(i->type & ITER_BVEC)) { \ - struct bio_vec v; \ - struct bvec_iter __bi; \ - iterate_bvec(i, n, v, __bi, skip, (B)) \ - } else if (unlikely(i->type & ITER_KVEC)) { \ - const struct kvec *kvec; \ - struct kvec v; \ - iterate_kvec(i, n, v, kvec, skip, (K)) \ - } else { \ - const struct iovec *iov; \ - struct iovec v; \ - iterate_iovec(i, n, v, iov, skip, (I)) \ + if (likely(n)) { \ + size_t skip = i->iov_offset; \ + if (unlikely(i->type & ITER_BVEC)) { \ + struct bio_vec v; \ + struct bvec_iter __bi; \ + iterate_bvec(i, n, v, __bi, skip, (B)) \ + } else if (unlikely(i->type & ITER_KVEC)) { \ + const struct kvec *kvec; \ + struct kvec v; \ + iterate_kvec(i, n, v, kvec, skip, (K)) \ + } else { \ + const struct iovec *iov; \ + struct iovec v; \ + iterate_iovec(i, n, v, iov, skip, (I)) \ + } \ } \ } @@ -568,6 +571,31 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) } EXPORT_SYMBOL(copy_from_iter); +bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) +{ + char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return false; + } + if (unlikely(i->count < bytes)) + return false; + + iterate_all_kinds(i, bytes, v, ({ + if (__copy_from_user((to += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len)) + return false; + 0;}), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), 
+ memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + + iov_iter_advance(i, bytes); + return true; +} +EXPORT_SYMBOL(copy_from_iter_full); + size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { char *to = addr; @@ -587,6 +615,30 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) } EXPORT_SYMBOL(copy_from_iter_nocache); +bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) +{ + char *to = addr; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return false; + } + if (unlikely(i->count < bytes)) + return false; + iterate_all_kinds(i, bytes, v, ({ + if (__copy_from_user_nocache((to += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len)) + return false; + 0;}), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + + iov_iter_advance(i, bytes); + return true; +} +EXPORT_SYMBOL(copy_from_iter_full_nocache); + size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { @@ -678,43 +730,50 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, } EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); +static inline void pipe_truncate(struct iov_iter *i) +{ + struct pipe_inode_info *pipe = i->pipe; + if (pipe->nrbufs) { + size_t off = i->iov_offset; + int idx = i->idx; + int nrbufs = (idx - pipe->curbuf) & (pipe->buffers - 1); + if (off) { + pipe->bufs[idx].len = off - pipe->bufs[idx].offset; + idx = next_idx(idx, pipe); + nrbufs++; + } + while (pipe->nrbufs > nrbufs) { + pipe_buf_release(pipe, &pipe->bufs[idx]); + idx = next_idx(idx, pipe); + pipe->nrbufs--; + } + } +} + static void pipe_advance(struct iov_iter *i, size_t size) { struct pipe_inode_info *pipe = i->pipe; - struct pipe_buffer *buf; - int idx = i->idx; - size_t off = i->iov_offset, orig_sz; - if (unlikely(i->count < size)) size = i->count; - orig_sz = size; - if (size) { + struct pipe_buffer *buf; + size_t off = i->iov_offset, left = size; + int idx = i->idx; if (off) /* make it relative to the beginning of buffer */ - size += off - pipe->bufs[idx].offset; + left += off - pipe->bufs[idx].offset; while (1) { buf = &pipe->bufs[idx]; - if (size <= buf->len) + if (left <= buf->len) break; - size -= buf->len; + left -= buf->len; idx = next_idx(idx, pipe); } - buf->len = size; i->idx = idx; - off = i->iov_offset = buf->offset + size; + i->iov_offset = buf->offset + left; } - if (off) - idx = next_idx(idx, pipe); - if (pipe->nrbufs) { - int unused = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); - /* [curbuf,unused) is in use. Free [idx,unused) */ - while (idx != unused) { - pipe_buf_release(pipe, &pipe->bufs[idx]); - idx = next_idx(idx, pipe); - pipe->nrbufs--; - } - } - i->count -= orig_sz; + i->count -= size; + /* ... 
and discard everything past that point */ + pipe_truncate(i); } void iov_iter_advance(struct iov_iter *i, size_t size) @@ -774,6 +833,7 @@ void iov_iter_pipe(struct iov_iter *i, int direction, size_t count) { BUG_ON(direction != ITER_PIPE); + WARN_ON(pipe->nrbufs == pipe->buffers); i->type = direction; i->pipe = pipe; i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); @@ -787,11 +847,8 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) unsigned long res = 0; size_t size = i->count; - if (!size) - return 0; - if (unlikely(i->type & ITER_PIPE)) { - if (i->iov_offset && allocated(&i->pipe->bufs[i->idx])) + if (size && i->iov_offset && allocated(&i->pipe->bufs[i->idx])) return size | i->iov_offset; return size; } @@ -806,10 +863,8 @@ EXPORT_SYMBOL(iov_iter_alignment); unsigned long iov_iter_gap_alignment(const struct iov_iter *i) { - unsigned long res = 0; + unsigned long res = 0; size_t size = i->count; - if (!size) - return 0; if (unlikely(i->type & ITER_PIPE)) { WARN_ON(1); @@ -824,7 +879,7 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) (res |= (!res ? 0 : (unsigned long)v.iov_base) | (size != v.iov_len ? size : 0)) ); - return res; + return res; } EXPORT_SYMBOL(iov_iter_gap_alignment); @@ -858,6 +913,9 @@ static ssize_t pipe_get_pages(struct iov_iter *i, size_t capacity; int idx; + if (!maxsize) + return 0; + if (!sanity(i)) return -EFAULT; @@ -876,9 +934,6 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (maxsize > i->count) maxsize = i->count; - if (!maxsize) - return 0; - if (unlikely(i->type & ITER_PIPE)) return pipe_get_pages(i, pages, maxsize, maxpages, start); iterate_all_kinds(i, maxsize, v, ({ @@ -925,6 +980,9 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i, int idx; int npages; + if (!maxsize) + return 0; + if (!sanity(i)) return -EFAULT; @@ -956,9 +1014,6 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (maxsize > i->count) maxsize = i->count; - if (!maxsize) - return 0; - if (unlikely(i->type & ITER_PIPE)) return pipe_get_pages_alloc(i, pages, maxsize, start); iterate_all_kinds(i, maxsize, v, ({ @@ -1008,7 +1063,7 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, } iterate_and_advance(i, bytes, v, ({ int err = 0; - next = csum_and_copy_from_user(v.iov_base, + next = csum_and_copy_from_user(v.iov_base, (to += v.iov_len) - v.iov_len, v.iov_len, 0, &err); if (!err) { @@ -1037,6 +1092,51 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, } EXPORT_SYMBOL(csum_and_copy_from_iter); +bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, + struct iov_iter *i) +{ + char *to = addr; + __wsum sum, next; + size_t off = 0; + sum = *csum; + if (unlikely(i->type & ITER_PIPE)) { + WARN_ON(1); + return false; + } + if (unlikely(i->count < bytes)) + return false; + iterate_all_kinds(i, bytes, v, ({ + int err = 0; + next = csum_and_copy_from_user(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0, &err); + if (err) + return false; + sum = csum_block_add(sum, next, off); + off += v.iov_len; + 0; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck(p + v.bv_offset, + (to += v.bv_len) - v.bv_len, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; + }),({ + next = csum_partial_copy_nocheck(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0); + sum = csum_block_add(sum, next, off); + off += v.iov_len; + }) + ) + *csum = sum; + iov_iter_advance(i, bytes); + return true; +} 
+EXPORT_SYMBOL(csum_and_copy_from_iter_full); + size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) { @@ -1051,7 +1151,7 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, iterate_and_advance(i, bytes, v, ({ int err = 0; next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, - v.iov_base, + v.iov_base, v.iov_len, 0, &err); if (!err) { sum = csum_block_add(sum, next, off); diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index f6c2c1e7779c..9a2b811966eb 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -56,7 +56,7 @@ static const char *kobject_actions[] = { * kobject_action_type - translate action string to numeric type * * @buf: buffer containing the action string, newline is ignored - * @len: length of buffer + * @count: length of buffer * @type: pointer to the location to store the action type * * Returns 0 if the action string was recognized. @@ -154,8 +154,8 @@ static void cleanup_uevent_env(struct subprocess_info *info) /** * kobject_uevent_env - send an uevent with environmental data * - * @action: action that is happening * @kobj: struct kobject that the action is happening to + * @action: action that is happening * @envp_ext: pointer to environmental data * * Returns 0 if kobject_uevent_env() is completed with success or the @@ -363,8 +363,8 @@ EXPORT_SYMBOL_GPL(kobject_uevent_env); /** * kobject_uevent - notify userspace by sending an uevent * - * @action: action that is happening * @kobj: struct kobject that the action is happening to + * @action: action that is happening * * Returns 0 if kobject_uevent() is completed with success or the * corresponding error when it fails. diff --git a/lib/kstrtox.c b/lib/kstrtox.c index b8e2080c1a47..bf85e05ce858 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -17,7 +17,7 @@ #include <linux/math64.h> #include <linux/export.h> #include <linux/types.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "kstrtox.h" const char *_parse_integer_fixup_radix(const char *s, unsigned int *base) diff --git a/lib/list_debug.c b/lib/list_debug.c index 3859bf63561c..7f7bfa55eb6d 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -2,8 +2,7 @@ * Copyright 2006, Red Hat, Inc., Dave Jones * Released under the General Public License (GPL). * - * This file contains the linked list implementations for - * DEBUG_LIST. + * This file contains the linked list validation for DEBUG_LIST. */ #include <linux/export.h> @@ -13,88 +12,48 @@ #include <linux/rculist.h> /* - * Insert a new entry between two known consecutive entries. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! + * Check that the data structures for the list manipulations are reasonably + * valid. Failures here indicate memory corruption (and possibly an exploit + * attempt). */ -void __list_add(struct list_head *new, - struct list_head *prev, - struct list_head *next) +bool __list_add_valid(struct list_head *new, struct list_head *prev, + struct list_head *next) { - WARN(next->prev != prev, - "list_add corruption. next->prev should be " - "prev (%p), but was %p. (next=%p).\n", + CHECK_DATA_CORRUPTION(next->prev != prev, + "list_add corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", prev, next->prev, next); - WARN(prev->next != next, - "list_add corruption. prev->next should be " - "next (%p), but was %p. (prev=%p).\n", + CHECK_DATA_CORRUPTION(prev->next != next, + "list_add corruption. 
prev->next should be next (%p), but was %p. (prev=%p).\n", next, prev->next, prev); - WARN(new == prev || new == next, - "list_add double add: new=%p, prev=%p, next=%p.\n", - new, prev, next); - next->prev = new; - new->next = next; - new->prev = prev; - WRITE_ONCE(prev->next, new); + CHECK_DATA_CORRUPTION(new == prev || new == next, + "list_add double add: new=%p, prev=%p, next=%p.\n", + new, prev, next); + + return true; } -EXPORT_SYMBOL(__list_add); +EXPORT_SYMBOL(__list_add_valid); -void __list_del_entry(struct list_head *entry) +bool __list_del_entry_valid(struct list_head *entry) { struct list_head *prev, *next; prev = entry->prev; next = entry->next; - if (WARN(next == LIST_POISON1, + CHECK_DATA_CORRUPTION(next == LIST_POISON1, "list_del corruption, %p->next is LIST_POISON1 (%p)\n", - entry, LIST_POISON1) || - WARN(prev == LIST_POISON2, + entry, LIST_POISON1); + CHECK_DATA_CORRUPTION(prev == LIST_POISON2, "list_del corruption, %p->prev is LIST_POISON2 (%p)\n", - entry, LIST_POISON2) || - WARN(prev->next != entry, - "list_del corruption. prev->next should be %p, " - "but was %p\n", entry, prev->next) || - WARN(next->prev != entry, - "list_del corruption. next->prev should be %p, " - "but was %p\n", entry, next->prev)) - return; - - __list_del(prev, next); -} -EXPORT_SYMBOL(__list_del_entry); + entry, LIST_POISON2); + CHECK_DATA_CORRUPTION(prev->next != entry, + "list_del corruption. prev->next should be %p, but was %p\n", + entry, prev->next); + CHECK_DATA_CORRUPTION(next->prev != entry, + "list_del corruption. next->prev should be %p, but was %p\n", + entry, next->prev); + return true; -/** - * list_del - deletes entry from list. - * @entry: the element to delete from the list. - * Note: list_empty on entry does not return true after this, the entry is - * in an undefined state. - */ -void list_del(struct list_head *entry) -{ - __list_del_entry(entry); - entry->next = LIST_POISON1; - entry->prev = LIST_POISON2; -} -EXPORT_SYMBOL(list_del); - -/* - * RCU variants. - */ -void __list_add_rcu(struct list_head *new, - struct list_head *prev, struct list_head *next) -{ - WARN(next->prev != prev, - "list_add_rcu corruption. next->prev should be prev (%p), but was %p. (next=%p).\n", - prev, next->prev, next); - WARN(prev->next != next, - "list_add_rcu corruption. prev->next should be next (%p), but was %p. 
(prev=%p).\n", - next, prev->next, prev); - new->next = next; - new->prev = prev; - rcu_assign_pointer(list_next_rcu(prev), new); - next->prev = new; } -EXPORT_SYMBOL(__list_add_rcu); +EXPORT_SYMBOL(__list_del_entry_valid); diff --git a/lib/lockref.c b/lib/lockref.c index 5a92189ad711..c4bfcb8836cd 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -20,7 +20,7 @@ if (likely(old.lock_count == prev.lock_count)) { \ SUCCESS; \ } \ - cpu_relax_lowlatency(); \ + cpu_relax(); \ } \ } while (0) diff --git a/lib/nlattr.c b/lib/nlattr.c index fce1e9afc6d9..b42b8577fc23 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -14,7 +14,7 @@ #include <linux/types.h> #include <net/netlink.h> -static const u16 nla_attr_minlen[NLA_TYPE_MAX+1] = { +static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), diff --git a/lib/parser.c b/lib/parser.c index b6d11631231b..3278958b472a 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -152,6 +152,36 @@ static int match_number(substring_t *s, int *result, int base) } /** + * match_u64int: scan a number in the given base from a substring_t + * @s: substring to be scanned + * @result: resulting u64 on success + * @base: base to use when converting string + * + * Description: Given a &substring_t and a base, attempts to parse the substring + * as a number in that base. On success, sets @result to the integer represented + * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. + */ +static int match_u64int(substring_t *s, u64 *result, int base) +{ + char *buf; + int ret; + u64 val; + size_t len = s->to - s->from; + + buf = kmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, s->from, len); + buf[len] = '\0'; + + ret = kstrtoull(buf, base, &val); + if (!ret) + *result = val; + kfree(buf); + return ret; +} + +/** * match_int: - scan a decimal representation of an integer from a substring_t * @s: substring_t to be scanned * @result: resulting integer on success @@ -167,6 +197,23 @@ int match_int(substring_t *s, int *result) EXPORT_SYMBOL(match_int); /** + * match_u64: - scan a decimal representation of a u64 from + * a substring_t + * @s: substring_t to be scanned + * @result: resulting unsigned long long on success + * + * Description: Attempts to parse the &substring_t @s as a long decimal + * integer. On success, sets @result to the integer represented by the + * string and returns 0. + * Returns -ENOMEM, -EINVAL, or -ERANGE on failure. 
+ */ +int match_u64(substring_t *s, u64 *result) +{ + return match_u64int(s, result, 0); +} +EXPORT_SYMBOL(match_u64); + +/** * match_octal: - scan an octal representation of an integer from a substring_t * @s: substring_t to be scanned * @result: resulting integer on success diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 72d36113ccaa..c8cebb137076 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -158,25 +158,21 @@ EXPORT_SYMBOL(percpu_counter_destroy); int percpu_counter_batch __read_mostly = 32; EXPORT_SYMBOL(percpu_counter_batch); -static void compute_batch_value(void) +static int compute_batch_value(unsigned int cpu) { int nr = num_online_cpus(); percpu_counter_batch = max(32, nr*2); + return 0; } -static int percpu_counter_hotcpu_callback(struct notifier_block *nb, - unsigned long action, void *hcpu) +static int percpu_counter_cpu_dead(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU - unsigned int cpu; struct percpu_counter *fbc; - compute_batch_value(); - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) - return NOTIFY_OK; + compute_batch_value(cpu); - cpu = (unsigned long)hcpu; spin_lock_irq(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; @@ -190,7 +186,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb, } spin_unlock_irq(&percpu_counters_lock); #endif - return NOTIFY_OK; + return 0; } /* @@ -222,8 +218,15 @@ EXPORT_SYMBOL(__percpu_counter_compare); static int __init percpu_counter_startup(void) { - compute_batch_value(); - hotcpu_notifier(percpu_counter_hotcpu_callback, 0); + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online", + compute_batch_value, NULL); + WARN_ON(ret < 0); + ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD, + "lib/percpu_cnt:dead", NULL, + percpu_counter_cpu_dead); + WARN_ON(ret < 0); return 0; } module_init(percpu_counter_startup); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8e6d552c40dd..84812a9fb16f 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -22,6 +22,7 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include <linux/cpu.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> @@ -30,7 +31,6 @@ #include <linux/percpu.h> #include <linux/slab.h> #include <linux/kmemleak.h> -#include <linux/notifier.h> #include <linux/cpu.h> #include <linux/string.h> #include <linux/bitops.h> @@ -69,6 +69,11 @@ struct radix_tree_preload { }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +static inline struct radix_tree_node *entry_to_node(void *ptr) +{ + return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); +} + static inline void *node_to_entry(void *ptr) { return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); @@ -191,13 +196,12 @@ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) * Returns next bit offset, or size if nothing found. 
*/ static __always_inline unsigned long -radix_tree_find_next_bit(const unsigned long *addr, - unsigned long size, unsigned long offset) +radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, + unsigned long offset) { - if (!__builtin_constant_p(size)) - return find_next_bit(addr, size, offset); + const unsigned long *addr = node->tags[tag]; - if (offset < size) { + if (offset < RADIX_TREE_MAP_SIZE) { unsigned long tmp; addr += offset / BITS_PER_LONG; @@ -205,14 +209,32 @@ radix_tree_find_next_bit(const unsigned long *addr, if (tmp) return __ffs(tmp) + offset; offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); - while (offset < size) { + while (offset < RADIX_TREE_MAP_SIZE) { tmp = *++addr; if (tmp) return __ffs(tmp) + offset; offset += BITS_PER_LONG; } } - return size; + return RADIX_TREE_MAP_SIZE; +} + +static unsigned int iter_offset(const struct radix_tree_iter *iter) +{ + return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK; +} + +/* + * The maximum index which can be stored in a radix tree + */ +static inline unsigned long shift_maxindex(unsigned int shift) +{ + return (RADIX_TREE_MAP_SIZE << shift) - 1; +} + +static inline unsigned long node_maxindex(struct radix_tree_node *node) +{ + return shift_maxindex(node->shift); } #ifndef __KERNEL__ @@ -220,10 +242,11 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n", - node, node->offset, + pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n", + node, node->offset, index, index | node_maxindex(node), + node->parent, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->shift, node->count, node->parent); + node->shift, node->count, node->exceptional); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << node->shift); @@ -231,14 +254,16 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) void *entry = node->slots[i]; if (!entry) continue; - if (is_sibling_entry(node, entry)) { - pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n", - entry, i, - *(void **)entry_to_node(entry), - first, last); + if (entry == RADIX_TREE_RETRY) { + pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n", + i, first, last, node); } else if (!radix_tree_is_internal_node(entry)) { - pr_debug("radix entry %p offset %ld indices %ld-%ld\n", - entry, i, first, last); + pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n", + entry, i, first, last, node); + } else if (is_sibling_entry(node, entry)) { + pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n", + entry, i, first, last, node, + *(void **)entry_to_node(entry)); } else { dump_node(entry_to_node(entry), first); } @@ -262,7 +287,10 @@ static void radix_tree_dump(struct radix_tree_root *root) * that the caller has pinned this thread of control to the current CPU. 
*/ static struct radix_tree_node * -radix_tree_node_alloc(struct radix_tree_root *root) +radix_tree_node_alloc(struct radix_tree_root *root, + struct radix_tree_node *parent, + unsigned int shift, unsigned int offset, + unsigned int count, unsigned int exceptional) { struct radix_tree_node *ret = NULL; gfp_t gfp_mask = root_gfp_mask(root); @@ -307,6 +335,13 @@ radix_tree_node_alloc(struct radix_tree_root *root) ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); out: BUG_ON(radix_tree_is_internal_node(ret)); + if (ret) { + ret->parent = parent; + ret->shift = shift; + ret->offset = offset; + ret->count = count; + ret->exceptional = exceptional; + } return ret; } @@ -314,18 +349,15 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); - int i; /* - * must only free zeroed nodes into the slab. radix_tree_shrink - * can leave us with a non-NULL entry in the first slot, so clear - * that here to make sure. + * Must only free zeroed nodes into the slab. We can be left with + * non-NULL entries by radix_tree_free_nodes, so clear the entries + * and tags here. */ - for (i = 0; i < RADIX_TREE_MAX_TAGS; i++) - tag_clear(node, i, 0); - - node->slots[0] = NULL; - node->count = 0; + memset(node->slots, 0, sizeof(node->slots)); + memset(node->tags, 0, sizeof(node->tags)); + INIT_LIST_HEAD(&node->private_list); kmem_cache_free(radix_tree_node_cachep, node); } @@ -345,7 +377,7 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ -static int __radix_tree_preload(gfp_t gfp_mask, int nr) +static int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) { struct radix_tree_preload *rtp; struct radix_tree_node *node; @@ -411,6 +443,28 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) } EXPORT_SYMBOL(radix_tree_maybe_preload); +#ifdef CONFIG_RADIX_TREE_MULTIORDER +/* + * Preload with enough objects to ensure that we can split a single entry + * of order @old_order into many entries of size @new_order + */ +int radix_tree_split_preload(unsigned int old_order, unsigned int new_order, + gfp_t gfp_mask) +{ + unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT); + unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) - + (new_order / RADIX_TREE_MAP_SHIFT); + unsigned nr = 0; + + WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); + BUG_ON(new_order >= old_order); + + while (layers--) + nr = nr * RADIX_TREE_MAP_SIZE + 1; + return __radix_tree_preload(gfp_mask, top * nr); +} +#endif + /* * The same as function above, but preload number of nodes required to insert * (1 << order) continuous naturally-aligned elements. 
@@ -456,19 +510,6 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) return __radix_tree_preload(gfp_mask, nr_nodes); } -/* - * The maximum index which can be stored in a radix tree - */ -static inline unsigned long shift_maxindex(unsigned int shift) -{ - return (RADIX_TREE_MAP_SIZE << shift) - 1; -} - -static inline unsigned long node_maxindex(struct radix_tree_node *node) -{ - return shift_maxindex(node->shift); -} - static unsigned radix_tree_load_root(struct radix_tree_root *root, struct radix_tree_node **nodep, unsigned long *maxindex) { @@ -506,8 +547,8 @@ static int radix_tree_extend(struct radix_tree_root *root, goto out; do { - struct radix_tree_node *node = radix_tree_node_alloc(root); - + struct radix_tree_node *node = radix_tree_node_alloc(root, + NULL, shift, 0, 1, 0); if (!node) return -ENOMEM; @@ -518,12 +559,12 @@ static int radix_tree_extend(struct radix_tree_root *root, } BUG_ON(shift > BITS_PER_LONG); - node->shift = shift; - node->offset = 0; - node->count = 1; - node->parent = NULL; - if (radix_tree_is_internal_node(slot)) + if (radix_tree_is_internal_node(slot)) { entry_to_node(slot)->parent = node; + } else if (radix_tree_exceptional_entry(slot)) { + /* Moving an exceptional root->rnode to a node */ + node->exceptional = 1; + } node->slots[0] = slot; slot = node_to_entry(node); rcu_assign_pointer(root->rnode, slot); @@ -534,6 +575,106 @@ out: } /** + * radix_tree_shrink - shrink radix tree to minimum height + * @root radix tree root + */ +static inline void radix_tree_shrink(struct radix_tree_root *root, + radix_tree_update_node_t update_node, + void *private) +{ + for (;;) { + struct radix_tree_node *node = root->rnode; + struct radix_tree_node *child; + + if (!radix_tree_is_internal_node(node)) + break; + node = entry_to_node(node); + + /* + * The candidate node has more than one child, or its child + * is not at the leftmost slot, or the child is a multiorder + * entry, we cannot shrink. + */ + if (node->count != 1) + break; + child = node->slots[0]; + if (!child) + break; + if (!radix_tree_is_internal_node(child) && node->shift) + break; + + if (radix_tree_is_internal_node(child)) + entry_to_node(child)->parent = NULL; + + /* + * We don't need rcu_assign_pointer(), since we are simply + * moving the node from one part of the tree to another: if it + * was safe to dereference the old pointer to it + * (node->slots[0]), it will be safe to dereference the new + * one (root->rnode) as far as dependent read barriers go. + */ + root->rnode = child; + + /* + * We have a dilemma here. The node's slot[0] must not be + * NULLed in case there are concurrent lookups expecting to + * find the item. However if this was a bottom-level node, + * then it may be subject to the slot pointer being visible + * to callers dereferencing it. If item corresponding to + * slot[0] is subsequently deleted, these callers would expect + * their slot to become empty sooner or later. + * + * For example, lockless pagecache will look up a slot, deref + * the page pointer, and if the page has 0 refcount it means it + * was concurrently deleted from pagecache so try the deref + * again. Fortunately there is already a requirement for logic + * to retry the entire slot lookup -- the indirect pointer + * problem (replacing direct root node with an indirect pointer + * also results in a stale slot). So tag the slot as indirect + * to force callers to retry. 
+ */ + node->count = 0; + if (!radix_tree_is_internal_node(child)) { + node->slots[0] = RADIX_TREE_RETRY; + if (update_node) + update_node(node, private); + } + + WARN_ON_ONCE(!list_empty(&node->private_list)); + radix_tree_node_free(node); + } +} + +static void delete_node(struct radix_tree_root *root, + struct radix_tree_node *node, + radix_tree_update_node_t update_node, void *private) +{ + do { + struct radix_tree_node *parent; + + if (node->count) { + if (node == entry_to_node(root->rnode)) + radix_tree_shrink(root, update_node, private); + return; + } + + parent = node->parent; + if (parent) { + parent->slots[node->offset] = NULL; + parent->count--; + } else { + root_tag_clear_all(root); + root->rnode = NULL; + } + + WARN_ON_ONCE(!list_empty(&node->private_list)); + radix_tree_node_free(node); + + node = parent; + } while (node); +} + +/** * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key @@ -563,26 +704,24 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, shift = radix_tree_load_root(root, &child, &maxindex); /* Make sure the tree is high enough. */ + if (order > 0 && max == ((1UL << order) - 1)) + max++; if (max > maxindex) { int error = radix_tree_extend(root, max, shift); if (error < 0) return error; shift = error; child = root->rnode; - if (order == shift) - shift += RADIX_TREE_MAP_SHIFT; } while (shift > order) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ - child = radix_tree_node_alloc(root); + child = radix_tree_node_alloc(root, node, shift, + offset, 0, 0); if (!child) return -ENOMEM; - child->shift = shift; - child->offset = offset; - child->parent = node; rcu_assign_pointer(*slot, node_to_entry(child)); if (node) node->count++; @@ -595,31 +734,126 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, slot = &node->slots[offset]; } + if (nodep) + *nodep = node; + if (slotp) + *slotp = slot; + return 0; +} + #ifdef CONFIG_RADIX_TREE_MULTIORDER - /* Insert pointers to the canonical entry */ - if (order > shift) { - unsigned i, n = 1 << (order - shift); +/* + * Free any nodes below this node. The tree is presumed to not need + * shrinking, and any user data in the tree is presumed to not need a + * destructor called on it. If we need to add a destructor, we can + * add that functionality later. Note that we may not clear tags or + * slots from the tree as an RCU walker may still have a pointer into + * this subtree. We could replace the entries with RADIX_TREE_RETRY, + * but we'll still have to clear those in rcu_free. 
+ */ +static void radix_tree_free_nodes(struct radix_tree_node *node) +{ + unsigned offset = 0; + struct radix_tree_node *child = entry_to_node(node); + + for (;;) { + void *entry = child->slots[offset]; + if (radix_tree_is_internal_node(entry) && + !is_sibling_entry(child, entry)) { + child = entry_to_node(entry); + offset = 0; + continue; + } + offset++; + while (offset == RADIX_TREE_MAP_SIZE) { + struct radix_tree_node *old = child; + offset = child->offset + 1; + child = child->parent; + WARN_ON_ONCE(!list_empty(&old->private_list)); + radix_tree_node_free(old); + if (old == entry_to_node(node)) + return; + } + } +} + +static inline int insert_entries(struct radix_tree_node *node, void **slot, + void *item, unsigned order, bool replace) +{ + struct radix_tree_node *child; + unsigned i, n, tag, offset, tags = 0; + + if (node) { + if (order > node->shift) + n = 1 << (order - node->shift); + else + n = 1; + offset = get_slot_offset(node, slot); + } else { + n = 1; + offset = 0; + } + + if (n > 1) { offset = offset & ~(n - 1); slot = &node->slots[offset]; - child = node_to_entry(slot); - for (i = 0; i < n; i++) { - if (slot[i]) + } + child = node_to_entry(slot); + + for (i = 0; i < n; i++) { + if (slot[i]) { + if (replace) { + node->count--; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tag_get(node, tag, offset + i)) + tags |= 1 << tag; + } else return -EEXIST; } + } - for (i = 1; i < n; i++) { + for (i = 0; i < n; i++) { + struct radix_tree_node *old = slot[i]; + if (i) { rcu_assign_pointer(slot[i], child); - node->count++; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_clear(node, tag, offset + i); + } else { + rcu_assign_pointer(slot[i], item); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(node, tag, offset); } + if (radix_tree_is_internal_node(old) && + !is_sibling_entry(node, old) && + (old != RADIX_TREE_RETRY)) + radix_tree_free_nodes(old); + if (radix_tree_exceptional_entry(old)) + node->exceptional--; } -#endif - - if (nodep) - *nodep = node; - if (slotp) - *slotp = slot; - return 0; + if (node) { + node->count += n; + if (radix_tree_exceptional_entry(item)) + node->exceptional += n; + } + return n; +} +#else +static inline int insert_entries(struct radix_tree_node *node, void **slot, + void *item, unsigned order, bool replace) +{ + if (*slot) + return -EEXIST; + rcu_assign_pointer(*slot, item); + if (node) { + node->count++; + if (radix_tree_exceptional_entry(item)) + node->exceptional++; + } + return 1; } +#endif /** * __radix_tree_insert - insert into a radix tree @@ -642,13 +876,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, error = __radix_tree_create(root, index, order, &node, &slot); if (error) return error; - if (*slot != NULL) - return -EEXIST; - rcu_assign_pointer(*slot, item); + + error = insert_entries(node, slot, item, order, false); + if (error < 0) + return error; if (node) { unsigned offset = get_slot_offset(node, slot); - node->count++; BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); @@ -746,6 +980,287 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); +static inline int slot_count(struct radix_tree_node *node, + void **slot) +{ + int n = 1; +#ifdef CONFIG_RADIX_TREE_MULTIORDER + void *ptr = node_to_entry(slot); + unsigned offset = get_slot_offset(node, slot); + int i; + + for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { + if 
(node->slots[offset + i] != ptr) + break; + n++; + } +#endif + return n; +} + +static void replace_slot(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item, + bool warn_typeswitch) +{ + void *old = rcu_dereference_raw(*slot); + int count, exceptional; + + WARN_ON_ONCE(radix_tree_is_internal_node(item)); + + count = !!item - !!old; + exceptional = !!radix_tree_exceptional_entry(item) - + !!radix_tree_exceptional_entry(old); + + WARN_ON_ONCE(warn_typeswitch && (count || exceptional)); + + if (node) { + node->count += count; + if (exceptional) { + exceptional *= slot_count(node, slot); + node->exceptional += exceptional; + } + } + + rcu_assign_pointer(*slot, item); +} + +static inline void delete_sibling_entries(struct radix_tree_node *node, + void **slot) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + bool exceptional = radix_tree_exceptional_entry(*slot); + void *ptr = node_to_entry(slot); + unsigned offset = get_slot_offset(node, slot); + int i; + + for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[offset + i] != ptr) + break; + node->slots[offset + i] = NULL; + node->count--; + if (exceptional) + node->exceptional--; + } +#endif +} + +/** + * __radix_tree_replace - replace item in a slot + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * @update_node: callback for changing leaf nodes + * @private: private data to pass to @update_node + * + * For use with __radix_tree_lookup(). Caller must hold tree write locked + * across slot lookup and replacement. + */ +void __radix_tree_replace(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item, + radix_tree_update_node_t update_node, void *private) +{ + if (!item) + delete_sibling_entries(node, slot); + /* + * This function supports replacing exceptional entries and + * deleting entries, but that needs accounting against the + * node unless the slot is root->rnode. + */ + replace_slot(root, node, slot, item, + !node && slot != (void **)&root->rnode); + + if (!node) + return; + + if (update_node) + update_node(node, private); + + delete_node(root, node, update_node, private); +} + +/** + * radix_tree_replace_slot - replace item in a slot + * @root: radix tree root + * @slot: pointer to slot + * @item: new item to store in the slot. + * + * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(), + * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked + * across slot lookup and replacement. + * + * NOTE: This cannot be used to switch between non-entries (empty slots), + * regular entries, and exceptional entries, as that requires accounting + * inside the radix tree node. When switching from one type of entry or + * deleting, use __radix_tree_lookup() and __radix_tree_replace() or + * radix_tree_iter_replace(). + */ +void radix_tree_replace_slot(struct radix_tree_root *root, + void **slot, void *item) +{ + replace_slot(root, NULL, slot, item, true); +} + +/** + * radix_tree_iter_replace - replace item in a slot + * @root: radix tree root + * @slot: pointer to slot + * @item: new item to store in the slot. + * + * For use with radix_tree_split() and radix_tree_for_each_slot(). + * Caller must hold tree write locked across split and replacement. 
+ */ +void radix_tree_iter_replace(struct radix_tree_root *root, + const struct radix_tree_iter *iter, void **slot, void *item) +{ + __radix_tree_replace(root, iter->node, slot, item, NULL, NULL); +} + +#ifdef CONFIG_RADIX_TREE_MULTIORDER +/** + * radix_tree_join - replace multiple entries with one multiorder entry + * @root: radix tree root + * @index: an index inside the new entry + * @order: order of the new entry + * @item: new entry + * + * Call this function to replace several entries with one larger entry. + * The existing entries are presumed to not need freeing as a result of + * this call. + * + * The replacement entry will have all the tags set on it that were set + * on any of the entries it is replacing. + */ +int radix_tree_join(struct radix_tree_root *root, unsigned long index, + unsigned order, void *item) +{ + struct radix_tree_node *node; + void **slot; + int error; + + BUG_ON(radix_tree_is_internal_node(item)); + + error = __radix_tree_create(root, index, order, &node, &slot); + if (!error) + error = insert_entries(node, slot, item, order, true); + if (error > 0) + error = 0; + + return error; +} + +/** + * radix_tree_split - Split an entry into smaller entries + * @root: radix tree root + * @index: An index within the large entry + * @order: Order of new entries + * + * Call this function as the first step in replacing a multiorder entry + * with several entries of lower order. After this function returns, + * loop over the relevant portion of the tree using radix_tree_for_each_slot() + * and call radix_tree_iter_replace() to set up each new entry. + * + * The tags from this entry are replicated to all the new entries. + * + * The radix tree should be locked against modification during the entire + * replacement operation. Lock-free lookups will see RADIX_TREE_RETRY which + * should prompt RCU walkers to restart the lookup from the root. 
+ */ +int radix_tree_split(struct radix_tree_root *root, unsigned long index, + unsigned order) +{ + struct radix_tree_node *parent, *node, *child; + void **slot; + unsigned int offset, end; + unsigned n, tag, tags = 0; + + if (!__radix_tree_lookup(root, index, &parent, &slot)) + return -ENOENT; + if (!parent) + return -ENOENT; + + offset = get_slot_offset(parent, slot); + + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tag_get(parent, tag, offset)) + tags |= 1 << tag; + + for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) { + if (!is_sibling_entry(parent, parent->slots[end])) + break; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(parent, tag, end); + /* rcu_assign_pointer ensures tags are set before RETRY */ + rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY); + } + rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY); + parent->exceptional -= (end - offset); + + if (order == parent->shift) + return 0; + if (order > parent->shift) { + while (offset < end) + offset += insert_entries(parent, &parent->slots[offset], + RADIX_TREE_RETRY, order, true); + return 0; + } + + node = parent; + + for (;;) { + if (node->shift > order) { + child = radix_tree_node_alloc(root, node, + node->shift - RADIX_TREE_MAP_SHIFT, + offset, 0, 0); + if (!child) + goto nomem; + if (node != parent) { + node->count++; + node->slots[offset] = node_to_entry(child); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(node, tag, offset); + } + + node = child; + offset = 0; + continue; + } + + n = insert_entries(node, &node->slots[offset], + RADIX_TREE_RETRY, order, false); + BUG_ON(n > RADIX_TREE_MAP_SIZE); + + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(node, tag, offset); + offset += n; + + while (offset == RADIX_TREE_MAP_SIZE) { + if (node == parent) + break; + offset = node->offset; + child = node; + node = node->parent; + rcu_assign_pointer(node->slots[offset], + node_to_entry(child)); + offset++; + } + if ((node == parent) && (offset == end)) + return 0; + } + + nomem: + /* Shouldn't happen; did user forget to preload? 
*/ + /* TODO: free all the allocated nodes */ + WARN_ON(1); + return -ENOMEM; +} +#endif + /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root @@ -807,6 +1322,34 @@ static void node_tag_clear(struct radix_tree_root *root, root_tag_clear(root, tag); } +static void node_tag_set(struct radix_tree_root *root, + struct radix_tree_node *node, + unsigned int tag, unsigned int offset) +{ + while (node) { + if (tag_get(node, tag, offset)) + return; + tag_set(node, tag, offset); + offset = node->offset; + node = node->parent; + } + + if (!root_tag_get(root, tag)) + root_tag_set(root, tag); +} + +/** + * radix_tree_iter_tag_set - set a tag on the current iterator entry + * @root: radix tree root + * @iter: iterator state + * @tag: tag to set + */ +void radix_tree_iter_tag_set(struct radix_tree_root *root, + const struct radix_tree_iter *iter, unsigned int tag) +{ + node_tag_set(root, iter->node, tag, iter_offset(iter)); +} + /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root @@ -902,6 +1445,121 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter, #endif } +/* Construct iter->tags bit-mask from node->tags[tag] array */ +static void set_iter_tags(struct radix_tree_iter *iter, + struct radix_tree_node *node, unsigned offset, + unsigned tag) +{ + unsigned tag_long = offset / BITS_PER_LONG; + unsigned tag_bit = offset % BITS_PER_LONG; + + iter->tags = node->tags[tag][tag_long] >> tag_bit; + + /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ + if (tag_long < RADIX_TREE_TAG_LONGS - 1) { + /* Pick tags from next element */ + if (tag_bit) + iter->tags |= node->tags[tag][tag_long + 1] << + (BITS_PER_LONG - tag_bit); + /* Clip chunk size, here only BITS_PER_LONG tags */ + iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG); + } +} + +#ifdef CONFIG_RADIX_TREE_MULTIORDER +static void **skip_siblings(struct radix_tree_node **nodep, + void **slot, struct radix_tree_iter *iter) +{ + void *sib = node_to_entry(slot - 1); + + while (iter->index < iter->next_index) { + *nodep = rcu_dereference_raw(*slot); + if (*nodep && *nodep != sib) + return slot; + slot++; + iter->index = __radix_tree_iter_add(iter, 1); + iter->tags >>= 1; + } + + *nodep = NULL; + return NULL; +} + +void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, + unsigned flags) +{ + unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; + struct radix_tree_node *node = rcu_dereference_raw(*slot); + + slot = skip_siblings(&node, slot, iter); + + while (radix_tree_is_internal_node(node)) { + unsigned offset; + unsigned long next_index; + + if (node == RADIX_TREE_RETRY) + return slot; + node = entry_to_node(node); + iter->node = node; + iter->shift = node->shift; + + if (flags & RADIX_TREE_ITER_TAGGED) { + offset = radix_tree_find_next_bit(node, tag, 0); + if (offset == RADIX_TREE_MAP_SIZE) + return NULL; + slot = &node->slots[offset]; + iter->index = __radix_tree_iter_add(iter, offset); + set_iter_tags(iter, node, offset, tag); + node = rcu_dereference_raw(*slot); + } else { + offset = 0; + slot = &node->slots[0]; + for (;;) { + node = rcu_dereference_raw(*slot); + if (node) + break; + slot++; + offset++; + if (offset == RADIX_TREE_MAP_SIZE) + return NULL; + } + iter->index = __radix_tree_iter_add(iter, offset); + } + if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0)) + goto none; + next_index = (iter->index | shift_maxindex(iter->shift)) + 1; + if (next_index < iter->next_index) + iter->next_index = next_index; + } + + return slot; + none: + 
iter->next_index = 0; + return NULL; +} +EXPORT_SYMBOL(__radix_tree_next_slot); +#else +static void **skip_siblings(struct radix_tree_node **nodep, + void **slot, struct radix_tree_iter *iter) +{ + return slot; +} +#endif + +void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter) +{ + struct radix_tree_node *node; + + slot++; + iter->index = __radix_tree_iter_add(iter, 1); + node = rcu_dereference_raw(*slot); + skip_siblings(&node, slot, iter); + iter->next_index = iter->index; + iter->tags = 0; + return NULL; +} +EXPORT_SYMBOL(radix_tree_iter_resume); + /** * radix_tree_next_chunk - find next chunk of slots for iteration * @@ -927,7 +1585,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. * * This condition also used by radix_tree_next_slot() to stop - * contiguous iterating, and forbid swithing to the next chunk. + * contiguous iterating, and forbid switching to the next chunk. */ index = iter->next_index; if (!index && iter->index) @@ -945,6 +1603,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, iter->index = index; iter->next_index = maxindex + 1; iter->tags = 1; + iter->node = NULL; __set_iter_shift(iter, 0); return (void **)&root->rnode; } @@ -960,9 +1619,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, return NULL; if (flags & RADIX_TREE_ITER_TAGGED) - offset = radix_tree_find_next_bit( - node->tags[tag], - RADIX_TREE_MAP_SIZE, + offset = radix_tree_find_next_bit(node, tag, offset + 1); else while (++offset < RADIX_TREE_MAP_SIZE) { @@ -982,154 +1639,26 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, child = rcu_dereference_raw(node->slots[offset]); } - if ((child == NULL) || (child == RADIX_TREE_RETRY)) + if (!child) goto restart; + if (child == RADIX_TREE_RETRY) + break; } while (radix_tree_is_internal_node(child)); /* Update the iterator state */ iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); iter->next_index = (index | node_maxindex(node)) + 1; + iter->node = node; __set_iter_shift(iter, node->shift); - /* Construct iter->tags bit-mask from node->tags[tag] array */ - if (flags & RADIX_TREE_ITER_TAGGED) { - unsigned tag_long, tag_bit; - - tag_long = offset / BITS_PER_LONG; - tag_bit = offset % BITS_PER_LONG; - iter->tags = node->tags[tag][tag_long] >> tag_bit; - /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ - if (tag_long < RADIX_TREE_TAG_LONGS - 1) { - /* Pick tags from next element */ - if (tag_bit) - iter->tags |= node->tags[tag][tag_long + 1] << - (BITS_PER_LONG - tag_bit); - /* Clip chunk size, here only BITS_PER_LONG tags */ - iter->next_index = index + BITS_PER_LONG; - } - } + if (flags & RADIX_TREE_ITER_TAGGED) + set_iter_tags(iter, node, offset, tag); return node->slots + offset; } EXPORT_SYMBOL(radix_tree_next_chunk); /** - * radix_tree_range_tag_if_tagged - for each item in given range set given - * tag if item has another tag set - * @root: radix tree root - * @first_indexp: pointer to a starting index of a range to scan - * @last_index: last index of a range to scan - * @nr_to_tag: maximum number items to tag - * @iftag: tag index to test - * @settag: tag index to set if tested tag is set - * - * This function scans range of radix tree from first_index to last_index - * (inclusive). For each item in the range if iftag is set, the function sets - * also settag. The function stops either after tagging nr_to_tag items or - * after reaching last_index. 
- * - * The tags must be set from the leaf level only and propagated back up the - * path to the root. We must do this so that we resolve the full path before - * setting any tags on intermediate nodes. If we set tags as we descend, then - * we can get to the leaf node and find that the index that has the iftag - * set is outside the range we are scanning. This reults in dangling tags and - * can lead to problems with later tag operations (e.g. livelocks on lookups). - * - * The function returns the number of leaves where the tag was set and sets - * *first_indexp to the first unscanned index. - * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must - * be prepared to handle that. - */ -unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, - unsigned long *first_indexp, unsigned long last_index, - unsigned long nr_to_tag, - unsigned int iftag, unsigned int settag) -{ - struct radix_tree_node *parent, *node, *child; - unsigned long maxindex; - unsigned long tagged = 0; - unsigned long index = *first_indexp; - - radix_tree_load_root(root, &child, &maxindex); - last_index = min(last_index, maxindex); - if (index > last_index) - return 0; - if (!nr_to_tag) - return 0; - if (!root_tag_get(root, iftag)) { - *first_indexp = last_index + 1; - return 0; - } - if (!radix_tree_is_internal_node(child)) { - *first_indexp = last_index + 1; - root_tag_set(root, settag); - return 1; - } - - node = entry_to_node(child); - - for (;;) { - unsigned offset = radix_tree_descend(node, &child, index); - if (!child) - goto next; - if (!tag_get(node, iftag, offset)) - goto next; - /* Sibling slots never have tags set on them */ - if (radix_tree_is_internal_node(child)) { - node = entry_to_node(child); - continue; - } - - /* tag the leaf */ - tagged++; - tag_set(node, settag, offset); - - /* walk back up the path tagging interior nodes */ - parent = node; - for (;;) { - offset = parent->offset; - parent = parent->parent; - if (!parent) - break; - /* stop if we find a node with the tag already set */ - if (tag_get(parent, settag, offset)) - break; - tag_set(parent, settag, offset); - } - next: - /* Go to next entry in node */ - index = ((index >> node->shift) + 1) << node->shift; - /* Overflow can happen when last_index is ~0UL... */ - if (index > last_index || !index) - break; - offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; - while (offset == 0) { - /* - * We've fully scanned this node. Go up. Because - * last_index is guaranteed to be in the tree, what - * we do below cannot wander astray. - */ - node = node->parent; - offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; - } - if (is_sibling_entry(node, node->slots[offset])) - goto next; - if (tagged >= nr_to_tag) - break; - } - /* - * We need not to tag the root tag if there is no tag which is set with - * settag within the range from *first_indexp to last_index. 
- */ - if (tagged > 0) - root_tag_set(root, settag); - *first_indexp = index; - - return tagged; -} -EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); - -/** * radix_tree_gang_lookup - perform multiple lookup on a radix tree * @root: radix tree root * @results: where the results of the lookup are placed @@ -1294,229 +1823,23 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, } EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); -#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP) -#include <linux/sched.h> /* for cond_resched() */ - -struct locate_info { - unsigned long found_index; - bool stop; -}; - -/* - * This linear search is at present only useful to shmem_unuse_inode(). - */ -static unsigned long __locate(struct radix_tree_node *slot, void *item, - unsigned long index, struct locate_info *info) -{ - unsigned long i; - - do { - unsigned int shift = slot->shift; - - for (i = (index >> shift) & RADIX_TREE_MAP_MASK; - i < RADIX_TREE_MAP_SIZE; - i++, index += (1UL << shift)) { - struct radix_tree_node *node = - rcu_dereference_raw(slot->slots[i]); - if (node == RADIX_TREE_RETRY) - goto out; - if (!radix_tree_is_internal_node(node)) { - if (node == item) { - info->found_index = index; - info->stop = true; - goto out; - } - continue; - } - node = entry_to_node(node); - if (is_sibling_entry(slot, node)) - continue; - slot = node; - break; - } - } while (i < RADIX_TREE_MAP_SIZE); - -out: - if ((index == 0) && (i == RADIX_TREE_MAP_SIZE)) - info->stop = true; - return index; -} - -/** - * radix_tree_locate_item - search through radix tree for item - * @root: radix tree root - * @item: item to be found - * - * Returns index where item was found, or -1 if not found. - * Caller must hold no lock (since this time-consuming function needs - * to be preemptible), and must check afterwards if item is still there. - */ -unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) -{ - struct radix_tree_node *node; - unsigned long max_index; - unsigned long cur_index = 0; - struct locate_info info = { - .found_index = -1, - .stop = false, - }; - - do { - rcu_read_lock(); - node = rcu_dereference_raw(root->rnode); - if (!radix_tree_is_internal_node(node)) { - rcu_read_unlock(); - if (node == item) - info.found_index = 0; - break; - } - - node = entry_to_node(node); - - max_index = node_maxindex(node); - if (cur_index > max_index) { - rcu_read_unlock(); - break; - } - - cur_index = __locate(node, item, cur_index, &info); - rcu_read_unlock(); - cond_resched(); - } while (!info.stop && cur_index <= max_index); - - return info.found_index; -} -#else -unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) -{ - return -1; -} -#endif /* CONFIG_SHMEM && CONFIG_SWAP */ - -/** - * radix_tree_shrink - shrink radix tree to minimum height - * @root radix tree root - */ -static inline bool radix_tree_shrink(struct radix_tree_root *root) -{ - bool shrunk = false; - - for (;;) { - struct radix_tree_node *node = root->rnode; - struct radix_tree_node *child; - - if (!radix_tree_is_internal_node(node)) - break; - node = entry_to_node(node); - - /* - * The candidate node has more than one child, or its child - * is not at the leftmost slot, or the child is a multiorder - * entry, we cannot shrink. 
- */ - if (node->count != 1) - break; - child = node->slots[0]; - if (!child) - break; - if (!radix_tree_is_internal_node(child) && node->shift) - break; - - if (radix_tree_is_internal_node(child)) - entry_to_node(child)->parent = NULL; - - /* - * We don't need rcu_assign_pointer(), since we are simply - * moving the node from one part of the tree to another: if it - * was safe to dereference the old pointer to it - * (node->slots[0]), it will be safe to dereference the new - * one (root->rnode) as far as dependent read barriers go. - */ - root->rnode = child; - - /* - * We have a dilemma here. The node's slot[0] must not be - * NULLed in case there are concurrent lookups expecting to - * find the item. However if this was a bottom-level node, - * then it may be subject to the slot pointer being visible - * to callers dereferencing it. If item corresponding to - * slot[0] is subsequently deleted, these callers would expect - * their slot to become empty sooner or later. - * - * For example, lockless pagecache will look up a slot, deref - * the page pointer, and if the page has 0 refcount it means it - * was concurrently deleted from pagecache so try the deref - * again. Fortunately there is already a requirement for logic - * to retry the entire slot lookup -- the indirect pointer - * problem (replacing direct root node with an indirect pointer - * also results in a stale slot). So tag the slot as indirect - * to force callers to retry. - */ - if (!radix_tree_is_internal_node(child)) - node->slots[0] = RADIX_TREE_RETRY; - - radix_tree_node_free(node); - shrunk = true; - } - - return shrunk; -} - /** * __radix_tree_delete_node - try to free node after clearing a slot * @root: radix tree root * @node: node containing @index + * @update_node: callback for changing leaf nodes + * @private: private data to pass to @update_node * * After clearing the slot at @index in @node from radix tree * rooted at @root, call this function to attempt freeing the * node and shrinking the tree. - * - * Returns %true if @node was freed, %false otherwise. 
*/ -bool __radix_tree_delete_node(struct radix_tree_root *root, - struct radix_tree_node *node) -{ - bool deleted = false; - - do { - struct radix_tree_node *parent; - - if (node->count) { - if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root); - return deleted; - } - - parent = node->parent; - if (parent) { - parent->slots[node->offset] = NULL; - parent->count--; - } else { - root_tag_clear_all(root); - root->rnode = NULL; - } - - radix_tree_node_free(node); - deleted = true; - - node = parent; - } while (node); - - return deleted; -} - -static inline void delete_sibling_entries(struct radix_tree_node *node, - void *ptr, unsigned offset) +void __radix_tree_delete_node(struct radix_tree_root *root, + struct radix_tree_node *node, + radix_tree_update_node_t update_node, + void *private) { -#ifdef CONFIG_RADIX_TREE_MULTIORDER - int i; - for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { - if (node->slots[offset + i] != ptr) - break; - node->slots[offset + i] = NULL; - node->count--; - } -#endif + delete_node(root, node, update_node, private); } /** @@ -1558,11 +1881,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) node_tag_clear(root, node, tag, offset); - delete_sibling_entries(node, node_to_entry(slot), offset); - node->slots[offset] = NULL; - node->count--; - - __radix_tree_delete_node(root, node); + __radix_tree_replace(root, node, slot, NULL, NULL, NULL); return entry; } @@ -1642,32 +1961,31 @@ static __init void radix_tree_init_maxnodes(void) } } -static int radix_tree_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int radix_tree_cpu_dead(unsigned int cpu) { - int cpu = (long)hcpu; struct radix_tree_preload *rtp; struct radix_tree_node *node; /* Free per-cpu pool of preloaded nodes */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - rtp = &per_cpu(radix_tree_preloads, cpu); - while (rtp->nr) { - node = rtp->nodes; - rtp->nodes = node->private_data; - kmem_cache_free(radix_tree_node_cachep, node); - rtp->nr--; - } + rtp = &per_cpu(radix_tree_preloads, cpu); + while (rtp->nr) { + node = rtp->nodes; + rtp->nodes = node->private_data; + kmem_cache_free(radix_tree_node_cachep, node); + rtp->nr--; } - return NOTIFY_OK; + return 0; } void __init radix_tree_init(void) { + int ret; radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); radix_tree_init_maxnodes(); - hotcpu_notifier(radix_tree_callback, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead", + NULL, radix_tree_cpu_dead); + WARN_ON(ret < 0); } diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c index 76734004358d..20bca3d44f67 100644 --- a/lib/raid6/avx2.c +++ b/lib/raid6/avx2.c @@ -87,9 +87,57 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } +static void raid6_avx21_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 32) { + asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); + asm volatile("vpxor %ymm4,%ymm2,%ymm2"); + /* P/Q data pages */ + for 
(z = z0-1 ; z >= start ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + } + asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + const struct raid6_calls raid6_avx2x1 = { raid6_avx21_gen_syndrome, - NULL, /* XOR not yet implemented */ + raid6_avx21_xor_syndrome, raid6_have_avx2, "avx2x1", 1 /* Has cache hints */ @@ -149,9 +197,77 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } +static void raid6_avx22_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 64) { + asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); + asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32])); + asm volatile("vpxor %ymm4,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm3,%ymm3"); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" + :: "m" (dptr[z][d+32])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + } + asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); + asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("vmovdqa %%ymm4,%0" : "=m" 
(q[d])); + asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32])); + asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d])); + asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + const struct raid6_calls raid6_avx2x2 = { raid6_avx22_gen_syndrome, - NULL, /* XOR not yet implemented */ + raid6_avx22_xor_syndrome, raid6_have_avx2, "avx2x2", 1 /* Has cache hints */ @@ -242,9 +358,119 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) kernel_fpu_end(); } +static void raid6_avx24_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 128) { + asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32])); + asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64])); + asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); + asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32])); + asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64])); + asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96])); + asm volatile("vpxor %ymm4,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm3,%ymm3"); + asm volatile("vpxor %ymm12,%ymm10,%ymm10"); + asm volatile("vpxor %ymm14,%ymm11,%ymm11"); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); + asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64])); + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpxor %ymm13,%ymm13,%ymm13"); + asm volatile("vpxor %ymm15,%ymm15,%ymm15"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13"); + asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); + asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpand %ymm0,%ymm13,%ymm13"); + asm volatile("vpand %ymm0,%ymm15,%ymm15"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" + :: "m" (dptr[z][d+32])); + asm volatile("vmovdqa %0,%%ymm13" + :: "m" (dptr[z][d+64])); + asm volatile("vmovdqa %0,%%ymm15" + :: "m" (dptr[z][d+96])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm13,%ymm10,%ymm10"); + asm volatile("vpxor %ymm15,%ymm11,%ymm11"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + } + asm volatile("prefetchnta %0" :: "m" (q[d])); + asm volatile("prefetchnta %0" :: "m" (q[d+64])); + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm 
volatile("vpxor %ymm13,%ymm13,%ymm13"); + asm volatile("vpxor %ymm15,%ymm15,%ymm15"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13"); + asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); + asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpand %ymm0,%ymm13,%ymm13"); + asm volatile("vpand %ymm0,%ymm15,%ymm15"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + } + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); + asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64])); + asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96])); + asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); + asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32])); + asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64])); + asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96])); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); + asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64])); + asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + const struct raid6_calls raid6_avx2x4 = { raid6_avx24_gen_syndrome, - NULL, /* XOR not yet implemented */ + raid6_avx24_xor_syndrome, raid6_have_avx2, "avx2x4", 1 /* Has cache hints */ diff --git a/lib/rbtree.c b/lib/rbtree.c index eb8a19fee110..1f8b112a7c35 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -296,11 +296,26 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * * (p) (p) * / \ / \ - * N S --> N Sl + * N S --> N sl * / \ \ - * sl Sr s + * sl Sr S * \ * Sr + * + * Note: p might be red, and then both + * p and sl are red after rotation(which + * breaks property 4). This is fixed in + * Case 4 (in __rb_rotate_set_parents() + * which set sl the color of p + * and set p RB_BLACK) + * + * (p) (sl) + * / \ / \ + * N sl --> P S + * \ / \ + * S N Sr + * \ + * Sr */ tmp1 = tmp2->rb_right; WRITE_ONCE(sibling->rb_left, tmp1); @@ -365,7 +380,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, } break; } - /* Case 3 - right rotate at sibling */ + /* Case 3 - left rotate at sibling */ tmp1 = tmp2->rb_left; WRITE_ONCE(sibling->rb_right, tmp1); WRITE_ONCE(tmp2->rb_left, sibling); @@ -377,7 +392,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, tmp1 = sibling; sibling = tmp2; } - /* Case 4 - left rotate at parent + color flips */ + /* Case 4 - right rotate at parent + color flips */ tmp2 = sibling->rb_right; WRITE_ONCE(parent->rb_left, tmp2); WRITE_ONCE(sibling->rb_right, parent); diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 22e13a0e19d7..a8d74a733a38 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -53,7 +53,7 @@ */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) -int swiotlb_force; +enum swiotlb_force swiotlb_force; /* * Used to do a quick range check in swiotlb_tbl_unmap_single and @@ -83,6 +83,12 @@ static unsigned int *io_tlb_list; static unsigned int io_tlb_index; /* + * Max segment that we can provide which (if pages are contingous) will + * not be bounced (unless SWIOTLB_FORCE is set). 
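 *
 * An illustrative (hypothetical) consumer would be a driver that builds
 * scatterlists and caps its segment size accordingly, e.g.:
 *
 *	unsigned int max = swiotlb_max_segment();
 *
 *	if (max)
 *		dma_set_max_seg_size(dev, max);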
+ */ +unsigned int max_segment; + +/* * We need to save away the original address corresponding to a mapped entry * for the sync operations. */ @@ -106,8 +112,12 @@ setup_io_tlb_npages(char *str) } if (*str == ',') ++str; - if (!strcmp(str, "force")) - swiotlb_force = 1; + if (!strcmp(str, "force")) { + swiotlb_force = SWIOTLB_FORCE; + } else if (!strcmp(str, "noforce")) { + swiotlb_force = SWIOTLB_NO_FORCE; + io_tlb_nslabs = 1; + } return 0; } @@ -120,6 +130,20 @@ unsigned long swiotlb_nr_tbl(void) } EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); +unsigned int swiotlb_max_segment(void) +{ + return max_segment; +} +EXPORT_SYMBOL_GPL(swiotlb_max_segment); + +void swiotlb_set_max_segment(unsigned int val) +{ + if (swiotlb_force == SWIOTLB_FORCE) + max_segment = 1; + else + max_segment = rounddown(val, PAGE_SIZE); +} + /* default to 64MB */ #define IO_TLB_DEFAULT_SIZE (64UL<<20) unsigned long swiotlb_size_or_default(void) @@ -201,6 +225,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) if (verbose) swiotlb_print_info(); + swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); return 0; } @@ -279,6 +304,7 @@ swiotlb_late_init_with_default_size(size_t default_size) rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); if (rc) free_pages((unsigned long)vstart, order); + return rc; } @@ -333,6 +359,8 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) late_alloc = 1; + swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); + return 0; cleanup4: @@ -347,6 +375,7 @@ cleanup2: io_tlb_end = 0; io_tlb_start = 0; io_tlb_nslabs = 0; + max_segment = 0; return -ENOMEM; } @@ -375,6 +404,7 @@ void __init swiotlb_free(void) PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); } io_tlb_nslabs = 0; + max_segment = 0; } int is_swiotlb_buffer(phys_addr_t paddr) @@ -425,7 +455,8 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, phys_addr_t orig_addr, size_t size, - enum dma_data_direction dir) + enum dma_data_direction dir, + unsigned long attrs) { unsigned long flags; phys_addr_t tlb_addr; @@ -452,11 +483,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); /* - * For mappings greater than a page, we limit the stride (and - * hence alignment) to a page size. + * For mappings greater than or equal to a page, we limit the stride + * (and hence alignment) to a page size. 
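	 * For example, with the usual 2 KiB slots (IO_TLB_SHIFT == 11) and
	 * 4 KiB pages, a 4 KiB mapping now gets nslots == 2 and stride == 2,
	 * so the bounce slots it receives start on a page boundary; the old
	 * "size > PAGE_SIZE" test left such a mapping with stride == 1, i.e.
	 * only slot-aligned.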
*/ nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - if (size > PAGE_SIZE) + if (size >= PAGE_SIZE) stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); else stride = 1; @@ -526,7 +557,8 @@ found: */ for (i = 0; i < nslots; i++) io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); - if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); return tlb_addr; @@ -539,18 +571,27 @@ EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single); static phys_addr_t map_single(struct device *hwdev, phys_addr_t phys, size_t size, - enum dma_data_direction dir) + enum dma_data_direction dir, unsigned long attrs) { - dma_addr_t start_dma_addr = phys_to_dma(hwdev, io_tlb_start); + dma_addr_t start_dma_addr; - return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, dir); + if (swiotlb_force == SWIOTLB_NO_FORCE) { + dev_warn_ratelimited(hwdev, "Cannot do DMA to address %pa\n", + &phys); + return SWIOTLB_MAP_ERROR; + } + + start_dma_addr = phys_to_dma(hwdev, io_tlb_start); + return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, + dir, attrs); } /* * dma_addr is the kernel virtual address of the bounce buffer to unmap. */ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir) + size_t size, enum dma_data_direction dir, + unsigned long attrs) { unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; @@ -561,6 +602,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * First, sync the memory before unmapping the entry */ if (orig_addr != INVALID_PHYS_ADDR && + !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); @@ -654,7 +696,8 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, * GFP_DMA memory; fall back on map_single(), which * will grab memory from the lowest available address range. */ - phys_addr_t paddr = map_single(hwdev, 0, size, DMA_FROM_DEVICE); + phys_addr_t paddr = map_single(hwdev, 0, size, + DMA_FROM_DEVICE, 0); if (paddr == SWIOTLB_MAP_ERROR) goto err_warn; @@ -667,9 +710,13 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, (unsigned long long)dma_mask, (unsigned long long)dev_addr); - /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ + /* + * DMA_TO_DEVICE to avoid memcpy in unmap_single. + * The DMA_ATTR_SKIP_CPU_SYNC is optional. + */ swiotlb_tbl_unmap_single(hwdev, paddr, - size, DMA_TO_DEVICE); + size, DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); goto err_warn; } } @@ -698,8 +745,12 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, if (!is_swiotlb_buffer(paddr)) free_pages((unsigned long)vaddr, get_order(size)); else - /* DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single */ - swiotlb_tbl_unmap_single(hwdev, paddr, size, DMA_TO_DEVICE); + /* + * DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single. + * DMA_ATTR_SKIP_CPU_SYNC is optional. + */ + swiotlb_tbl_unmap_single(hwdev, paddr, size, DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); } EXPORT_SYMBOL(swiotlb_free_coherent); @@ -707,6 +758,9 @@ static void swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, int do_panic) { + if (swiotlb_force == SWIOTLB_NO_FORCE) + return; + /* * Ran out of IOMMU space for this operation. This is very bad. 
* Unfortunately the drivers cannot handle this operation properly. @@ -714,8 +768,8 @@ swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, * When the mapping is small enough return a static buffer to limit * the damage, or panic when the transfer is too big. */ - printk(KERN_ERR "DMA: Out of SW-IOMMU space for %zu bytes at " - "device %s\n", size, dev ? dev_name(dev) : "?"); + dev_err_ratelimited(dev, "DMA: Out of SW-IOMMU space for %zu bytes\n", + size); if (size <= io_tlb_overflow || !do_panic) return; @@ -749,13 +803,13 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (dma_capable(dev, dev_addr, size) && !swiotlb_force) + if (dma_capable(dev, dev_addr, size) && swiotlb_force != SWIOTLB_FORCE) return dev_addr; trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); /* Oh well, have to allocate and map a bounce buffer. */ - map = map_single(dev, phys, size, dir); + map = map_single(dev, phys, size, dir, attrs); if (map == SWIOTLB_MAP_ERROR) { swiotlb_full(dev, size, dir, 1); return phys_to_dma(dev, io_tlb_overflow_buffer); @@ -764,12 +818,13 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, dev_addr = phys_to_dma(dev, map); /* Ensure that the address returned is DMA'ble */ - if (!dma_capable(dev, dev_addr, size)) { - swiotlb_tbl_unmap_single(dev, map, size, dir); - return phys_to_dma(dev, io_tlb_overflow_buffer); - } + if (dma_capable(dev, dev_addr, size)) + return dev_addr; - return dev_addr; + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); + + return phys_to_dma(dev, io_tlb_overflow_buffer); } EXPORT_SYMBOL_GPL(swiotlb_map_page); @@ -782,14 +837,15 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); * whatever the device wrote there. */ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) + size_t size, enum dma_data_direction dir, + unsigned long attrs) { phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(paddr)) { - swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); + swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); return; } @@ -809,7 +865,7 @@ void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - unmap_single(hwdev, dev_addr, size, dir); + unmap_single(hwdev, dev_addr, size, dir, attrs); } EXPORT_SYMBOL_GPL(swiotlb_unmap_page); @@ -888,14 +944,15 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, phys_addr_t paddr = sg_phys(sg); dma_addr_t dev_addr = phys_to_dma(hwdev, paddr); - if (swiotlb_force || + if (swiotlb_force == SWIOTLB_FORCE || !dma_capable(hwdev, dev_addr, sg->length)) { phys_addr_t map = map_single(hwdev, sg_phys(sg), - sg->length, dir); + sg->length, dir, attrs); if (map == SWIOTLB_MAP_ERROR) { /* Don't panic here, we expect map_sg users to do proper error handling. 
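			   A typical (illustrative) caller checks for a zero
			   return from dma_map_sg() and unwinds, e.g.:

				nents = dma_map_sg(dev, sgl, n, DMA_TO_DEVICE);
				if (!nents)
					return -ENOMEM;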
*/ swiotlb_full(hwdev, sg->length, dir, 0); + attrs |= DMA_ATTR_SKIP_CPU_SYNC; swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, attrs); sg_dma_len(sgl) = 0; @@ -910,14 +967,6 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, } EXPORT_SYMBOL(swiotlb_map_sg_attrs); -int -swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ - return swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, 0); -} -EXPORT_SYMBOL(swiotlb_map_sg); - /* * Unmap a set of streaming mode DMA translations. Again, cpu read rules * concerning calls here are the same as for swiotlb_unmap_page() above. @@ -933,19 +982,11 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) - unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir); - + unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, + attrs); } EXPORT_SYMBOL(swiotlb_unmap_sg_attrs); -void -swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ - return swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, 0); -} -EXPORT_SYMBOL(swiotlb_unmap_sg); - /* * Make physical memory consistent for a set of streaming mode DMA translations * after a transfer. diff --git a/lib/timerqueue.c b/lib/timerqueue.c index 782ae8ca2c06..adc6ee0a5126 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -48,7 +48,7 @@ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) while (*p) { parent = *p; ptr = rb_entry(parent, struct timerqueue_node, node); - if (node->expires.tv64 < ptr->expires.tv64) + if (node->expires < ptr->expires) p = &(*p)->rb_left; else p = &(*p)->rb_right; @@ -56,7 +56,7 @@ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) rb_link_node(&node->node, parent, p); rb_insert_color(&node->node, &head->head); - if (!head->next || node->expires.tv64 < head->next->expires.tv64) { + if (!head->next || node->expires < head->next->expires) { head->next = node; return true; } |
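The timerqueue hunk above follows from ktime_t becoming a plain s64, so expiry
times are compared directly instead of through the old .tv64 member. A minimal
caller sketch, using only the stock <linux/timerqueue.h> and <linux/ktime.h>
interfaces (the queue/event names and the one-millisecond delay are made up
for illustration):

	#include <linux/ktime.h>
	#include <linux/printk.h>
	#include <linux/timerqueue.h>

	static struct timerqueue_head queue;
	static struct timerqueue_node event;

	static void queue_event_in_1ms(void)
	{
		timerqueue_init_head(&queue);
		timerqueue_init(&event);

		/* expires is now a bare nanosecond count, no .tv64 access */
		event.expires = ktime_add_ns(ktime_get(), NSEC_PER_MSEC);

		/* timerqueue_add() returns true when this node becomes the earliest deadline */
		if (timerqueue_add(&queue, &event))
			pr_info("next expiry at %lld ns\n",
				ktime_to_ns(event.expires));
	}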