From 2261cc627f5453004042b4f694612edae27e492e Mon Sep 17 00:00:00 2001 From: Shawn Guo <shawn.guo@linaro.org> Date: Wed, 15 Feb 2012 10:47:42 -0800 Subject: dt: add empty of_find_compatible_node function Add empty of_find_compatible_node function for !CONFIG_OF build. Signed-off-by: Shawn Guo <shawn.guo@linaro.org> Signed-off-by: Grant Likely <grant.likely@secretlab.ca> --- include/linux/of.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index a75a831e2057..92cf6ad35e0e 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -281,6 +281,14 @@ static inline struct property *of_find_property(const struct device_node *np, return NULL; } +static inline struct device_node *of_find_compatible_node( + struct device_node *from, + const char *type, + const char *compat) +{ + return NULL; +} + static inline int of_property_read_u32_array(const struct device_node *np, const char *propname, u32 *out_values, size_t sz) -- cgit v1.2.3-70-g09d2 From 7d96b3e55ad45ebe4ff1a1daad27ac1fff8682ec Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov <khlebnikov@openvz.org> Date: Sun, 19 Feb 2012 18:29:11 +0400 Subject: percpu: fix generic definition of __this_cpu_add_and_return() This patch adds missed "__" into function prefix. Otherwise on all archectures (except x86) it expands to irq/preemtion-safe variant: _this_cpu_generic_add_return(), which do extra irq-save/irq-restore. Optimal generic implementation is __this_cpu_generic_add_return(). Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org> Acked-by: Christoph Lameter <cl@linux.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/percpu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 32cd1f67462e..3b609eb9cd7d 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -718,7 +718,8 @@ do { \ # ifndef __this_cpu_add_return_8 # define __this_cpu_add_return_8(pcp, val) __this_cpu_generic_add_return(pcp, val) # endif -# define __this_cpu_add_return(pcp, val) __pcpu_size_call_return2(this_cpu_add_return_, pcp, val) +# define __this_cpu_add_return(pcp, val) \ + __pcpu_size_call_return2(__this_cpu_add_return_, pcp, val) #endif #define __this_cpu_sub_return(pcp, val) this_cpu_add_return(pcp, -(val)) -- cgit v1.2.3-70-g09d2 From e920d5971d706290c5a6281f719e16c25021f964 Mon Sep 17 00:00:00 2001 From: Ming Lei <tom.leiming@gmail.com> Date: Wed, 15 Feb 2012 16:54:38 +0800 Subject: percpu: use raw_local_irq_* in _this_cpu op It doesn't make sense to trace irq off or do irq flags lock proving inside 'this_cpu' operations, so replace local_irq_* with raw_local_irq_* in 'this_cpu' op. Also the patch fixes onelockdep warning[1] by the replacement, see below: In commit: 933393f58fef9963eac61db8093689544e29a600(percpu: Remove irqsafe_cpu_xxx variants), local_irq_save/restore(flags) are added inside this_cpu_inc operation, so that trace_hardirqs_off_caller will be called by trace_hardirqs_on_caller directly because __debug_atomic_inc is implemented as this_cpu_inc, which may trigger the lockdep warning[1], for example in the below ARM scenary: kernel_thread_helper /*irq disabled*/ ->trace_hardirqs_on_caller /*hardirqs_enabled was set*/ ->trace_hardirqs_off_caller /*hardirqs_enabled cleared*/ __this_cpu_add(redundant_hardirqs_on) ->trace_hardirqs_off_caller /*irq disabled, so call here*/ The 'unannotated irqs-on' warning will be triggered somewhere because irq is just enabled after the irq trace in kernel_thread_helper. [1], [ 0.162841] ------------[ cut here ]------------ [ 0.167694] WARNING: at kernel/lockdep.c:3493 check_flags+0xc0/0x1d0() [ 0.174468] Modules linked in: [ 0.177703] Backtrace: [ 0.180328] [<c00171f0>] (dump_backtrace+0x0/0x110) from [<c0412320>] (dump_stack+0x18/0x1c) [ 0.189086] r6:c051f778 r5:00000da5 r4:00000000 r3:60000093 [ 0.195007] [<c0412308>] (dump_stack+0x0/0x1c) from [<c00410e8>] (warn_slowpath_common+0x54/0x6c) [ 0.204223] [<c0041094>] (warn_slowpath_common+0x0/0x6c) from [<c0041124>] (warn_slowpath_null+0x24/0x2c) [ 0.214111] r8:00000000 r7:00000000 r6:ee069598 r5:60000013 r4:ee082000 [ 0.220825] r3:00000009 [ 0.223693] [<c0041100>] (warn_slowpath_null+0x0/0x2c) from [<c0088f38>] (check_flags+0xc0/0x1d0) [ 0.232910] [<c0088e78>] (check_flags+0x0/0x1d0) from [<c008d348>] (lock_acquire+0x4c/0x11c) [ 0.241668] [<c008d2fc>] (lock_acquire+0x0/0x11c) from [<c0415aa4>] (_raw_spin_lock+0x3c/0x74) [ 0.250610] [<c0415a68>] (_raw_spin_lock+0x0/0x74) from [<c010a844>] (set_task_comm+0x20/0xc0) [ 0.259521] r6:ee069588 r5:ee0691c0 r4:ee082000 [ 0.264404] [<c010a824>] (set_task_comm+0x0/0xc0) from [<c0060780>] (kthreadd+0x28/0x108) [ 0.272857] r8:00000000 r7:00000013 r6:c0044a08 r5:ee0691c0 r4:ee082000 [ 0.279571] r3:ee083fe0 [ 0.282470] [<c0060758>] (kthreadd+0x0/0x108) from [<c0044a08>] (do_exit+0x0/0x6dc) [ 0.290405] r5:c0060758 r4:00000000 [ 0.294189] ---[ end trace 1b75b31a2719ed1c ]--- [ 0.299041] possible reason: unannotated irqs-on. [ 0.303955] irq event stamp: 5 [ 0.307159] hardirqs last enabled at (4): [<c001331c>] no_work_pending+0x8/0x2c [ 0.314880] hardirqs last disabled at (5): [<c0089b08>] trace_hardirqs_on_caller+0x60/0x26c [ 0.323547] softirqs last enabled at (0): [<c003f754>] copy_process+0x33c/0xef4 [ 0.331207] softirqs last disabled at (0): [< (null)>] (null) [ 0.337585] CPU0: thread -1, cpu 0, socket 0, mpidr 80000000 Acked-by: Christoph Lameter <cl@linux.com> Signed-off-by: Ming Lei <tom.leiming@gmail.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/percpu.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 3b609eb9cd7d..594c0040fdd8 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -348,9 +348,9 @@ do { \ #define _this_cpu_generic_to_op(pcp, val, op) \ do { \ unsigned long flags; \ - local_irq_save(flags); \ + raw_local_irq_save(flags); \ *__this_cpu_ptr(&(pcp)) op val; \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ } while (0) #ifndef this_cpu_write @@ -449,10 +449,10 @@ do { \ ({ \ typeof(pcp) ret__; \ unsigned long flags; \ - local_irq_save(flags); \ + raw_local_irq_save(flags); \ __this_cpu_add(pcp, val); \ ret__ = __this_cpu_read(pcp); \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ ret__; \ }) @@ -479,10 +479,10 @@ do { \ #define _this_cpu_generic_xchg(pcp, nval) \ ({ typeof(pcp) ret__; \ unsigned long flags; \ - local_irq_save(flags); \ + raw_local_irq_save(flags); \ ret__ = __this_cpu_read(pcp); \ __this_cpu_write(pcp, nval); \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ ret__; \ }) @@ -507,11 +507,11 @@ do { \ ({ \ typeof(pcp) ret__; \ unsigned long flags; \ - local_irq_save(flags); \ + raw_local_irq_save(flags); \ ret__ = __this_cpu_read(pcp); \ if (ret__ == (oval)) \ __this_cpu_write(pcp, nval); \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ ret__; \ }) @@ -544,10 +544,10 @@ do { \ ({ \ int ret__; \ unsigned long flags; \ - local_irq_save(flags); \ + raw_local_irq_save(flags); \ ret__ = __this_cpu_generic_cmpxchg_double(pcp1, pcp2, \ oval1, oval2, nval1, nval2); \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ ret__; \ }) -- cgit v1.2.3-70-g09d2 From 7e55d0527e4925a49464a5b26fdabae1f7a91a77 Mon Sep 17 00:00:00 2001 From: viresh kumar <viresh.kumar@st.com> Date: Thu, 23 Feb 2012 04:41:05 +0100 Subject: ARM: 7339/1: amba/serial.h: Include types.h for resolving dependency of type bool serial.h uses bool, but its definition is missing, as it doesn't include types.h. Fix this by including types.h Signed-off-by: Viresh Kumar <viresh.kumar@st.com> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk> --- include/linux/amba/serial.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h index 514ed45c462e..d117b29d1062 100644 --- a/include/linux/amba/serial.h +++ b/include/linux/amba/serial.h @@ -23,6 +23,8 @@ #ifndef ASM_ARM_HARDWARE_SERIAL_AMBA_H #define ASM_ARM_HARDWARE_SERIAL_AMBA_H +#include <linux/types.h> + /* ------------------------------------------------------------------------------- * From AMBA UART (PL010) Block Specification * ------------------------------------------------------------------------------- -- cgit v1.2.3-70-g09d2 From ecb971923614775a118bc05ad16b2bde450cac7d Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Mon, 27 Feb 2012 17:52:52 -0500 Subject: tcp: fix comment for tp->highest_sack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There was an off-by-one error in the comments describing the highest_sack field in struct tcp_sock. The comments previously claimed that it was the "start sequence of the highest skb with SACKed bit". This commit fixes the comments to note that it is the "start sequence of the skb just *after* the highest skb with SACKed bit". Signed-off-by: Neal Cardwell <ncardwell@google.com> Acked-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/linux/tcp.h | 3 ++- include/net/tcp.h | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 46a85c9e1f25..3c7ffdb40dc6 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -412,7 +412,8 @@ struct tcp_sock { struct tcp_sack_block recv_sack_cache[4]; - struct sk_buff *highest_sack; /* highest skb with SACK received + struct sk_buff *highest_sack; /* skb just after the highest + * skb with SACKed bit set * (validity guaranteed only if * sacked_out > 0) */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 42c29bfbcee3..2d80c291fffb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1364,8 +1364,9 @@ static inline void tcp_push_pending_frames(struct sock *sk) } } -/* Start sequence of the highest skb with SACKed bit, valid only if - * sacked > 0 or when the caller has ensured validity by itself. +/* Start sequence of the skb just after the highest skb with SACKed + * bit, valid only if sacked_out > 0 or when the caller has ensured + * validity by itself. */ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) { -- cgit v1.2.3-70-g09d2 From c8e252586f8d5de906385d8cf6385fee289a825e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" <hpa@zytor.com> Date: Fri, 2 Mar 2012 10:43:48 -0800 Subject: regset: Prevent null pointer reference on readonly regsets The regset common infrastructure assumed that regsets would always have .get and .set methods, but not necessarily .active methods. Unfortunately people have since written regsets without .set methods. Rather than putting in stub functions everywhere, handle regsets with null .get or .set methods explicitly. Signed-off-by: H. Peter Anvin <hpa@zytor.com> Reviewed-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Roland McGrath <roland@hack.frob.com> Cc: <stable@vger.kernel.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/binfmt_elf.c | 2 +- include/linux/regset.h | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index bcb884e2d613..07d096c49920 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1421,7 +1421,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, for (i = 1; i < view->n; ++i) { const struct user_regset *regset = &view->regsets[i]; do_thread_regset_writeback(t->task, regset); - if (regset->core_note_type && + if (regset->core_note_type && regset->get && (!regset->active || regset->active(t->task, regset))) { int ret; size_t size = regset->n * regset->size; diff --git a/include/linux/regset.h b/include/linux/regset.h index 8abee6556223..5150fd16ef93 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -335,6 +335,9 @@ static inline int copy_regset_to_user(struct task_struct *target, { const struct user_regset *regset = &view->regsets[setno]; + if (!regset->get) + return -EOPNOTSUPP; + if (!access_ok(VERIFY_WRITE, data, size)) return -EIO; @@ -358,6 +361,9 @@ static inline int copy_regset_from_user(struct task_struct *target, { const struct user_regset *regset = &view->regsets[setno]; + if (!regset->set) + return -EOPNOTSUPP; + if (!access_ok(VERIFY_READ, data, size)) return -EIO; -- cgit v1.2.3-70-g09d2 From 5189fa19a4b2b4c3bec37c3a019d446148827717 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" <hpa@zytor.com> Date: Fri, 2 Mar 2012 10:43:49 -0800 Subject: regset: Return -EFAULT, not -EIO, on host-side memory fault There is only one error code to return for a bad user-space buffer pointer passed to a system call in the same address space as the system call is executed, and that is EFAULT. Furthermore, the low-level access routines, which catch most of the faults, return EFAULT already. Signed-off-by: H. Peter Anvin <hpa@zytor.com> Reviewed-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Roland McGrath <roland@hack.frob.com> Cc: <stable@vger.kernel.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/regset.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regset.h b/include/linux/regset.h index 5150fd16ef93..686f37327a49 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -339,7 +339,7 @@ static inline int copy_regset_to_user(struct task_struct *target, return -EOPNOTSUPP; if (!access_ok(VERIFY_WRITE, data, size)) - return -EIO; + return -EFAULT; return regset->get(target, regset, offset, size, NULL, data); } @@ -365,7 +365,7 @@ static inline int copy_regset_from_user(struct task_struct *target, return -EOPNOTSUPP; if (!access_ok(VERIFY_READ, data, size)) - return -EIO; + return -EFAULT; return regset->set(target, regset, offset, size, NULL, data); } -- cgit v1.2.3-70-g09d2 From 8966be90304b394fd6a2c5af7b6b3abe2df3889c Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Fri, 2 Mar 2012 14:23:30 -0800 Subject: vfs: trivial __d_lookup_rcu() cleanups These don't change any semantics, but they clean up the code a bit and mark some arguments appropriately 'const'. They came up as I was doing the word-at-a-time dcache name accessor code, and cleaning this up now allows me to send out a smaller relevant interesting patch for the experimental stuff. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/dcache.c | 13 ++++++++----- include/linux/dcache.h | 3 ++- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index fe19ac13f75f..138be96e25b6 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -104,7 +104,7 @@ static unsigned int d_hash_shift __read_mostly; static struct hlist_bl_head *dentry_hashtable __read_mostly; -static inline struct hlist_bl_head *d_hash(struct dentry *parent, +static inline struct hlist_bl_head *d_hash(const struct dentry *parent, unsigned long hash) { hash += ((unsigned long) parent ^ GOLDEN_RATIO_PRIME) / L1_CACHE_BYTES; @@ -1717,8 +1717,9 @@ EXPORT_SYMBOL(d_add_ci); * child is looked up. Thus, an interlocking stepping of sequence lock checks * is formed, giving integrity down the path walk. */ -struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, - unsigned *seq, struct inode **inode) +struct dentry *__d_lookup_rcu(const struct dentry *parent, + const struct qstr *name, + unsigned *seqp, struct inode **inode) { unsigned int len = name->len; unsigned int hash = name->hash; @@ -1748,6 +1749,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + unsigned seq; struct inode *i; const char *tname; int tlen; @@ -1756,7 +1758,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, continue; seqretry: - *seq = read_seqcount_begin(&dentry->d_seq); + seq = read_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) @@ -1771,7 +1773,7 @@ seqretry: * edge of memory when walking. If we could load this * atomically some other way, we could drop this check. */ - if (read_seqcount_retry(&dentry->d_seq, *seq)) + if (read_seqcount_retry(&dentry->d_seq, seq)) goto seqretry; if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { if (parent->d_op->d_compare(parent, *inode, @@ -1788,6 +1790,7 @@ seqretry: * order to do anything useful with the returned dentry * anyway. */ + *seqp = seq; *inode = i; return dentry; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index d64a55b23afd..61b24261e07a 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -309,7 +309,8 @@ extern struct dentry *d_ancestor(struct dentry *, struct dentry *); extern struct dentry *d_lookup(struct dentry *, struct qstr *); extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); extern struct dentry *__d_lookup(struct dentry *, struct qstr *); -extern struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, +extern struct dentry *__d_lookup_rcu(const struct dentry *parent, + const struct qstr *name, unsigned *seq, struct inode **inode); /** -- cgit v1.2.3-70-g09d2 From 0145acc202ca613b23b5383e55df3c32a92ad1bf Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Fri, 2 Mar 2012 14:32:59 -0800 Subject: vfs: uninline full_name_hash() .. and also use it in lookup_one_len() rather than open-coding it. There aren't any performance-critical users, so inlining it is silly. But it wouldn't matter if it wasn't for the fact that the word-at-a-time dentry name patches want to conditionally replace the function, and uninlining it sets the stage for that. So again, this is a preparatory patch that doesn't change any semantics, and only prepares for a much cleaner and testable word-at-a-time dentry name accessor patch. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/namei.c | 13 +++++++++---- include/linux/dcache.h | 9 +-------- 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index a780ea515c47..ec72fa1acb14 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1374,6 +1374,14 @@ static inline int can_lookup(struct inode *inode) return 1; } +unsigned int full_name_hash(const unsigned char *name, unsigned int len) +{ + unsigned long hash = init_name_hash(); + while (len--) + hash = partial_name_hash(*name++, hash); + return end_name_hash(hash); +} + /* * Name resolution. * This is the basic name resolution function, turning a pathname into @@ -1775,24 +1783,21 @@ static struct dentry *lookup_hash(struct nameidata *nd) struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; - unsigned long hash; unsigned int c; WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); this.name = name; this.len = len; + this.hash = full_name_hash(name, len); if (!len) return ERR_PTR(-EACCES); - hash = init_name_hash(); while (len--) { c = *(const unsigned char *)name++; if (c == '/' || c == '\0') return ERR_PTR(-EACCES); - hash = partial_name_hash(c, hash); } - this.hash = end_name_hash(hash); /* * See if the low-level filesystem might want * to use its own hash.. diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 61b24261e07a..f1c7eb8461be 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -89,14 +89,7 @@ static inline unsigned long end_name_hash(unsigned long hash) } /* Compute the hash for a name string. */ -static inline unsigned int -full_name_hash(const unsigned char *name, unsigned int len) -{ - unsigned long hash = init_name_hash(); - while (len--) - hash = partial_name_hash(*name++, hash); - return end_name_hash(hash); -} +extern unsigned int full_name_hash(const unsigned char *, unsigned int); /* * Try to keep struct dentry aligned on 64 byte cachelines (this will -- cgit v1.2.3-70-g09d2 From 5707c87f20bca9e76969bb4096149de6ef74cbb9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Fri, 2 Mar 2012 14:47:15 -0800 Subject: vfs: clarify and clean up dentry_cmp() It did some odd things for unclear reasons. As this is one of the functions that gets changed when doing word-at-a-time compares, this is yet another of the "don't change any semantics, but clean things up so that subsequent patches don't get obscured by the cleanups". Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/dcache.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f1c7eb8461be..4270bedd2308 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -54,18 +54,17 @@ extern struct dentry_stat_t dentry_stat; static inline int dentry_cmp(const unsigned char *cs, size_t scount, const unsigned char *ct, size_t tcount) { - int ret; if (scount != tcount) return 1; + do { - ret = (*cs != *ct); - if (ret) - break; + if (*cs != *ct) + return 1; cs++; ct++; tcount--; } while (tcount); - return ret; + return 0; } /* Name hashing routines. Initial hash value */ -- cgit v1.2.3-70-g09d2 From adb795062f89b8d67d295ee25e04034bccce6779 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov <khlebnikov@openvz.org> Date: Wed, 29 Feb 2012 00:41:12 +0400 Subject: percpu: fix __this_cpu_{sub,inc,dec}_return() definition This patch adds missed "__" prefixes, otherwise these functions works as irq/preemption safe. Reported-by: Torsten Kaiser <just.for.lkml@googlemail.com> Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org> Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/percpu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 594c0040fdd8..21638ae14e07 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -722,9 +722,9 @@ do { \ __pcpu_size_call_return2(__this_cpu_add_return_, pcp, val) #endif -#define __this_cpu_sub_return(pcp, val) this_cpu_add_return(pcp, -(val)) -#define __this_cpu_inc_return(pcp) this_cpu_add_return(pcp, 1) -#define __this_cpu_dec_return(pcp) this_cpu_add_return(pcp, -1) +#define __this_cpu_sub_return(pcp, val) __this_cpu_add_return(pcp, -(val)) +#define __this_cpu_inc_return(pcp) __this_cpu_add_return(pcp, 1) +#define __this_cpu_dec_return(pcp) __this_cpu_add_return(pcp, -1) #define __this_cpu_generic_xchg(pcp, nval) \ ({ typeof(pcp) ret__; \ -- cgit v1.2.3-70-g09d2 From 5483f18e986ed5267b923bec12b407845181350b Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Sun, 4 Mar 2012 15:51:42 -0800 Subject: vfs: move dentry_cmp from <linux/dcache.h> to fs/dcache.c It's only used inside fs/dcache.c, and we're going to play games with it for the word-at-a-time patches. This time we really don't even want to export it, because it really is an internal function to fs/dcache.c, and has been since it was introduced. Having it in that extremely hot header file (it's included in pretty much everything, thanks to <linux/fs.h>) is a disaster for testing different versions, and is utterly pointless. We really should have some kind of header file diet thing, where we figure out which parts of header files are really better off private and only result in more expensive compiles. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/dcache.c | 20 ++++++++++++++++++++ include/linux/dcache.h | 20 -------------------- 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 138be96e25b6..bcbdb33fcc20 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -137,6 +137,26 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, } #endif +/* + * Compare 2 name strings, return 0 if they match, otherwise non-zero. + * The strings are both count bytes long, and count is non-zero. + */ +static inline int dentry_cmp(const unsigned char *cs, size_t scount, + const unsigned char *ct, size_t tcount) +{ + if (scount != tcount) + return 1; + + do { + if (*cs != *ct) + return 1; + cs++; + ct++; + tcount--; + } while (tcount); + return 0; +} + static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4270bedd2308..ff5f5256d175 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -47,26 +47,6 @@ struct dentry_stat_t { }; extern struct dentry_stat_t dentry_stat; -/* - * Compare 2 name strings, return 0 if they match, otherwise non-zero. - * The strings are both count bytes long, and count is non-zero. - */ -static inline int dentry_cmp(const unsigned char *cs, size_t scount, - const unsigned char *ct, size_t tcount) -{ - if (scount != tcount) - return 1; - - do { - if (*cs != *ct) - return 1; - cs++; - ct++; - tcount--; - } while (tcount); - return 0; -} - /* Name hashing routines. Initial hash value */ /* Hash courtesy of the R5 hash in reiserfs modulo sign bits */ #define init_name_hash() 0 -- cgit v1.2.3-70-g09d2 From c22ab332902333f83766017478c1ef6607ace681 Mon Sep 17 00:00:00 2001 From: Matthew Garrett <mjg@redhat.com> Date: Mon, 5 Mar 2012 14:59:10 -0800 Subject: kmsg_dump: don't run on non-error paths by default Since commit 04c6862c055f ("kmsg_dump: add kmsg_dump() calls to the reboot, halt, poweroff and emergency_restart paths"), kmsg_dump() gets run on normal paths including poweroff and reboot. This is less than ideal given pstore implementations that can only represent single backtraces, since a reboot may overwrite a stored oops before it's been picked up by userspace. In addition, some pstore backends may have low performance and provide a significant delay in reboot as a result. This patch adds a printk.always_kmsg_dump kernel parameter (which can also be changed from userspace). Without it, the code will only be run on failure paths rather than on normal paths. The option can be enabled in environments where there's a desire to attempt to audit whether or not a reboot was cleanly requested or not. Signed-off-by: Matthew Garrett <mjg@redhat.com> Acked-by: Seiji Aguchi <seiji.aguchi@hds.com> Cc: Seiji Aguchi <seiji.aguchi@hds.com> Cc: David Woodhouse <dwmw2@infradead.org> Cc: Marco Stornelli <marco.stornelli@gmail.com> Cc: Artem Bityutskiy <Artem.Bityutskiy@nokia.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Vivek Goyal <vgoyal@redhat.com> Cc: Don Zickus <dzickus@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- Documentation/kernel-parameters.txt | 6 ++++++ include/linux/kmsg_dump.h | 9 +++++++-- kernel/printk.c | 6 ++++++ 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 033d4e69b43b..d99fd9c0ec0e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2211,6 +2211,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. default: off. + printk.always_kmsg_dump= + Trigger kmsg_dump for cases other than kernel oops or + panics + Format: <bool> (1/Y/y=enable, 0/N/n=disable) + default: disabled + printk.time= Show timing data prefixed to each printk message line Format: <bool> (1/Y/y=enable, 0/N/n=disable) diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index fee66317e071..35f7237ec972 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -15,13 +15,18 @@ #include <linux/errno.h> #include <linux/list.h> +/* + * Keep this list arranged in rough order of priority. Anything listed after + * KMSG_DUMP_OOPS will not be logged by default unless printk.always_kmsg_dump + * is passed to the kernel. + */ enum kmsg_dump_reason { - KMSG_DUMP_OOPS, KMSG_DUMP_PANIC, + KMSG_DUMP_OOPS, + KMSG_DUMP_EMERG, KMSG_DUMP_RESTART, KMSG_DUMP_HALT, KMSG_DUMP_POWEROFF, - KMSG_DUMP_EMERG, }; /** diff --git a/kernel/printk.c b/kernel/printk.c index 13c0a1143f49..32690a0b7a18 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -702,6 +702,9 @@ static bool printk_time = 0; #endif module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); +static bool always_kmsg_dump; +module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); + /* Check if we have any console registered that can be called early in boot. */ static int have_callable_console(void) { @@ -1732,6 +1735,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) unsigned long l1, l2; unsigned long flags; + if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) + return; + /* Theoretically, the log could move on after we do this, but there's not a lot we can do about that. The new messages will overwrite the start of what we dump. */ -- cgit v1.2.3-70-g09d2 From c415c3b47ea2754659d915cca387a20999044163 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Mon, 5 Mar 2012 14:59:13 -0800 Subject: vfork: introduce complete_vfork_done() No functional changes. Move the clear-and-complete-vfork_done code into the new trivial helper, complete_vfork_done(). Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/exec.c | 8 ++------ include/linux/sched.h | 1 + kernel/fork.c | 17 ++++++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 92ce83a11e90..dccdcec913e9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1915,7 +1915,6 @@ static int coredump_wait(int exit_code, struct core_state *core_state) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - struct completion *vfork_done; int core_waiters = -EBUSY; init_completion(&core_state->startup); @@ -1934,11 +1933,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state) * Make sure nobody is waiting for us to release the VM, * otherwise we can deadlock when we wait on each other */ - vfork_done = tsk->vfork_done; - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } + if (tsk->vfork_done) + complete_vfork_done(tsk); if (core_waiters) wait_for_completion(&core_state->startup); diff --git a/include/linux/sched.h b/include/linux/sched.h index 7d379a6bfd88..1b25a37f2aee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2291,6 +2291,7 @@ extern int do_execve(const char *, const char __user * const __user *, const char __user * const __user *, struct pt_regs *); extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); +extern void complete_vfork_done(struct task_struct *tsk); struct task_struct *fork_idle(int); extern void set_task_comm(struct task_struct *tsk, char *from); diff --git a/kernel/fork.c b/kernel/fork.c index e2cd3e2a5ae8..cf3d96379608 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -668,6 +668,14 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) return mm; } +void complete_vfork_done(struct task_struct *tsk) +{ + struct completion *vfork_done = tsk->vfork_done; + + tsk->vfork_done = NULL; + complete(vfork_done); +} + /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. @@ -683,8 +691,6 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) */ void mm_release(struct task_struct *tsk, struct mm_struct *mm) { - struct completion *vfork_done = tsk->vfork_done; - /* Get rid of any futexes when releasing the mm */ #ifdef CONFIG_FUTEX if (unlikely(tsk->robust_list)) { @@ -704,11 +710,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) /* Get rid of any cached register state */ deactivate_mm(tsk, mm); - /* notify parent sleeping on vfork() */ - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } + if (tsk->vfork_done) + complete_vfork_done(tsk); /* * If we're exiting normally, clear a user-space tid field if -- cgit v1.2.3-70-g09d2 From d68b46fe16ad59b3a5f51ec73daaa5dc06753798 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Mon, 5 Mar 2012 14:59:13 -0800 Subject: vfork: make it killable Make vfork() killable. Change do_fork(CLONE_VFORK) to do wait_for_completion_killable(). If it fails we do not return to the user-mode and never touch the memory shared with our child. However, in this case we should clear child->vfork_done before return, we use task_lock() in do_fork()->wait_for_vfork_done() and complete_vfork_done() to serialize with each other. Note: now that we use task_lock() we don't really need completion, we could turn task->vfork_done into "task_struct *wake_up_me" but this needs some complications. NOTE: this and the next patches do not affect in-kernel users of CLONE_VFORK, kernel threads run with all signals ignored including SIGKILL/SIGSTOP. However this is obviously the user-visible change. Not only a fatal signal can kill the vforking parent, a sub-thread can do execve or exit_group() and kill the thread sleeping in vfork(). Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/sched.h | 2 +- kernel/fork.c | 40 ++++++++++++++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1b25a37f2aee..b6467711f12e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2372,7 +2372,7 @@ static inline int thread_group_empty(struct task_struct *p) * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and - * ->cgroup.subsys[]. + * ->cgroup.subsys[]. And ->vfork_done. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), diff --git a/kernel/fork.c b/kernel/fork.c index cf3d96379608..892c534ce6e3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -670,10 +670,34 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) void complete_vfork_done(struct task_struct *tsk) { - struct completion *vfork_done = tsk->vfork_done; + struct completion *vfork; - tsk->vfork_done = NULL; - complete(vfork_done); + task_lock(tsk); + vfork = tsk->vfork_done; + if (likely(vfork)) { + tsk->vfork_done = NULL; + complete(vfork); + } + task_unlock(tsk); +} + +static int wait_for_vfork_done(struct task_struct *child, + struct completion *vfork) +{ + int killed; + + freezer_do_not_count(); + killed = wait_for_completion_killable(vfork); + freezer_count(); + + if (killed) { + task_lock(child); + child->vfork_done = NULL; + task_unlock(child); + } + + put_task_struct(child); + return killed; } /* Please note the differences between mmput and mm_release. @@ -717,7 +741,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * If we're exiting normally, clear a user-space tid field if * requested. We leave this alone when dying by signal, to leave * the value intact in a core dump, and to save the unnecessary - * trouble otherwise. Userland only wants this done for a sys_exit. + * trouble, say, a killed vfork parent shouldn't touch this mm. + * Userland only wants this done for a sys_exit. */ if (tsk->clear_child_tid) { if (!(tsk->flags & PF_SIGNALED) && @@ -1551,6 +1576,7 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); + get_task_struct(p); } /* @@ -1568,10 +1594,8 @@ long do_fork(unsigned long clone_flags, ptrace_event(trace, nr); if (clone_flags & CLONE_VFORK) { - freezer_do_not_count(); - wait_for_completion(&vfork); - freezer_count(); - ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); + if (!wait_for_vfork_done(p, &vfork)) + ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); } } else { nr = PTR_ERR(p); -- cgit v1.2.3-70-g09d2 From 57b59c4a1400fa6c34764eab2e35a8762dc05a09 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Mon, 5 Mar 2012 14:59:13 -0800 Subject: coredump_wait: don't call complete_vfork_done() Now that CLONE_VFORK is killable, coredump_wait() no longer needs complete_vfork_done(). zap_threads() should find and kill all tasks with the same ->mm, this includes our parent if ->vfork_done is set. mm_release() becomes the only caller, unexport complete_vfork_done(). Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/exec.c | 14 ++------------ include/linux/sched.h | 1 - kernel/fork.c | 2 +- 3 files changed, 3 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index dccdcec913e9..153dee14fe55 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1926,19 +1926,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state) core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); - if (unlikely(core_waiters < 0)) - goto fail; - - /* - * Make sure nobody is waiting for us to release the VM, - * otherwise we can deadlock when we wait on each other - */ - if (tsk->vfork_done) - complete_vfork_done(tsk); - - if (core_waiters) + if (core_waiters > 0) wait_for_completion(&core_state->startup); -fail: + return core_waiters; } diff --git a/include/linux/sched.h b/include/linux/sched.h index b6467711f12e..11fcafaf4ae4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2291,7 +2291,6 @@ extern int do_execve(const char *, const char __user * const __user *, const char __user * const __user *, struct pt_regs *); extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); -extern void complete_vfork_done(struct task_struct *tsk); struct task_struct *fork_idle(int); extern void set_task_comm(struct task_struct *tsk, char *from); diff --git a/kernel/fork.c b/kernel/fork.c index 892c534ce6e3..44b0e21af50e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -668,7 +668,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) return mm; } -void complete_vfork_done(struct task_struct *tsk) +static void complete_vfork_done(struct task_struct *tsk) { struct completion *vfork; -- cgit v1.2.3-70-g09d2 From 6e27f63edbd7ab893258e16500171dd1270a1369 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Mon, 5 Mar 2012 14:59:14 -0800 Subject: vfork: kill PF_STARTING Previously it was (ab)used by utrace. Then it was wrongly used by the scheduler code. Currently it is not used, kill it before it finds the new erroneous user. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/sched.h | 1 - kernel/fork.c | 9 --------- 2 files changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 11fcafaf4ae4..0657368bd78f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1777,7 +1777,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * /* * Per process flags */ -#define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ diff --git a/kernel/fork.c b/kernel/fork.c index 44b0e21af50e..26a7a6707fa7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1046,7 +1046,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); new_flags |= PF_FORKNOEXEC; - new_flags |= PF_STARTING; p->flags = new_flags; } @@ -1579,14 +1578,6 @@ long do_fork(unsigned long clone_flags, get_task_struct(p); } - /* - * We set PF_STARTING at creation in case tracing wants to - * use this to distinguish a fully live task from one that - * hasn't finished SIGSTOP raising yet. Now we clear it - * and set the child going. - */ - p->flags &= ~PF_STARTING; - wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ -- cgit v1.2.3-70-g09d2 From 7512102cf64d36e3c7444480273623c7aab3563f Mon Sep 17 00:00:00 2001 From: Hugh Dickins <hughd@google.com> Date: Mon, 5 Mar 2012 14:59:18 -0800 Subject: memcg: fix GPF when cgroup removal races with last exit When moving tasks from old memcg (with move_charge_at_immigrate on new memcg), followed by removal of old memcg, hit General Protection Fault in mem_cgroup_lru_del_list() (called from release_pages called from free_pages_and_swap_cache from tlb_flush_mmu from tlb_finish_mmu from exit_mmap from mmput from exit_mm from do_exit). Somewhat reproducible, takes a few hours: the old struct mem_cgroup has been freed and poisoned by SLAB_DEBUG, but mem_cgroup_lru_del_list() is still trying to update its stats, and take page off lru before freeing. A task, or a charge, or a page on lru: each secures a memcg against removal. In this case, the last task has been moved out of the old memcg, and it is exiting: anonymous pages are uncharged one by one from the memcg, as they are zapped from its pagetables, so the charge gets down to 0; but the pages themselves are queued in an mmu_gather for freeing. Most of those pages will be on lru (and force_empty is careful to lru_add_drain_all, to add pages from pagevec to lru first), but not necessarily all: perhaps some have been isolated for page reclaim, perhaps some isolated for other reasons. So, force_empty may find no task, no charge and no page on lru, and let the removal proceed. There would still be no problem if these pages were immediately freed; but typically (and the put_page_testzero protocol demands it) they have to be added back to lru before they are found freeable, then removed from lru and freed. We don't see the issue when adding, because the mem_cgroup_iter() loops keep their own reference to the memcg being scanned; but when it comes to mem_cgroup_lru_del_list(). I believe this was not an issue in v3.2: there, PageCgroupAcctLRU and PageCgroupUsed flags were used (like a trick with mirrors) to deflect view of pc->mem_cgroup to the stable root_mem_cgroup when neither set. 38c5d72f3ebe ("memcg: simplify LRU handling by new rule") mercifully removed those convolutions, but left this General Protection Fault. But it's surprisingly easy to restore the old behaviour: just check PageCgroupUsed in mem_cgroup_lru_add_list() (which decides on which lruvec to add), and reset pc to root_mem_cgroup if page is uncharged. A risky change? just going back to how it worked before; testing, and an audit of uses of pc->mem_cgroup, show no problem. And there's a nice bonus: with mem_cgroup_lru_add_list() itself making sure that an uncharged page goes to root lru, mem_cgroup_reset_owner() no longer has any purpose, and we can safely revert 4e5f01c2b9b9 ("memcg: clear pc->mem_cgroup if necessary"). Calling update_page_reclaim_stat() after add_page_to_lru_list() in swap.c is not strictly necessary: the lru_lock there, with RCU before memcg structures are freed, makes mem_cgroup_get_reclaim_stat_from_page safe without that; but it seems cleaner to rely on one dependency less. Signed-off-by: Hugh Dickins <hughd@google.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Konstantin Khlebnikov <khlebnikov@openvz.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/memcontrol.h | 5 ----- mm/ksm.c | 11 ----------- mm/memcontrol.c | 30 +++++++++++++----------------- mm/migrate.c | 2 -- mm/swap.c | 8 +++++--- mm/swap_state.c | 10 ---------- 6 files changed, 18 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4d34356fe644..b80de520670b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -129,7 +129,6 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, extern void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage); -extern void mem_cgroup_reset_owner(struct page *page); #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP extern int do_swap_account; #endif @@ -392,10 +391,6 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage) { } - -static inline void mem_cgroup_reset_owner(struct page *page) -{ -} #endif /* CONFIG_CGROUP_MEM_CONT */ #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) diff --git a/mm/ksm.c b/mm/ksm.c index 1925ffbfb27f..310544a379ae 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -28,7 +28,6 @@ #include <linux/kthread.h> #include <linux/wait.h> #include <linux/slab.h> -#include <linux/memcontrol.h> #include <linux/rbtree.h> #include <linux/memory.h> #include <linux/mmu_notifier.h> @@ -1572,16 +1571,6 @@ struct page *ksm_does_need_to_copy(struct page *page, new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (new_page) { - /* - * The memcg-specific accounting when moving - * pages around the LRU lists relies on the - * page's owner (memcg) to be valid. Usually, - * pages are assigned to a new owner before - * being put on the LRU list, but since this - * is not the case here, the stale owner from - * a previous allocation cycle must be reset. - */ - mem_cgroup_reset_owner(new_page); copy_user_highpage(new_page, page, address, vma); SetPageDirty(new_page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1097d8098f8c..d0e57a3cda18 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1042,6 +1042,19 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, pc = lookup_page_cgroup(page); memcg = pc->mem_cgroup; + + /* + * Surreptitiously switch any uncharged page to root: + * an uncharged page off lru does nothing to secure + * its former mem_cgroup from sudden removal. + * + * Our caller holds lru_lock, and PageCgroupUsed is updated + * under page_cgroup lock: between them, they make all uses + * of pc->mem_cgroup safe. + */ + if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) + pc->mem_cgroup = memcg = root_mem_cgroup; + mz = page_cgroup_zoneinfo(memcg, page); /* compound_order() is stabilized through lru_lock */ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); @@ -3029,23 +3042,6 @@ void mem_cgroup_uncharge_end(void) batch->memcg = NULL; } -/* - * A function for resetting pc->mem_cgroup for newly allocated pages. - * This function should be called if the newpage will be added to LRU - * before start accounting. - */ -void mem_cgroup_reset_owner(struct page *newpage) -{ - struct page_cgroup *pc; - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(newpage); - VM_BUG_ON(PageCgroupUsed(pc)); - pc->mem_cgroup = root_mem_cgroup; -} - #ifdef CONFIG_SWAP /* * called after __delete_from_swap_cache() and drop "page" account. diff --git a/mm/migrate.c b/mm/migrate.c index df141f60289e..1503b6b54ecb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -839,8 +839,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!newpage) return -ENOMEM; - mem_cgroup_reset_owner(newpage); - if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ goto out; diff --git a/mm/swap.c b/mm/swap.c index fff1ff7fb9ad..14380e9fbe33 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -652,7 +652,7 @@ EXPORT_SYMBOL(__pagevec_release); void lru_add_page_tail(struct zone* zone, struct page *page, struct page *page_tail) { - int active; + int uninitialized_var(active); enum lru_list lru; const int file = 0; @@ -672,7 +672,6 @@ void lru_add_page_tail(struct zone* zone, active = 0; lru = LRU_INACTIVE_ANON; } - update_page_reclaim_stat(zone, page_tail, file, active); } else { SetPageUnevictable(page_tail); lru = LRU_UNEVICTABLE; @@ -693,6 +692,9 @@ void lru_add_page_tail(struct zone* zone, list_head = page_tail->lru.prev; list_move_tail(&page_tail->lru, list_head); } + + if (!PageUnevictable(page)) + update_page_reclaim_stat(zone, page_tail, file, active); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -710,8 +712,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg) SetPageLRU(page); if (active) SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); add_page_to_lru_list(zone, page, lru); + update_page_reclaim_stat(zone, page, file, active); } /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 470038a91873..ea6b32d61873 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -300,16 +300,6 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, new_page = alloc_page_vma(gfp_mask, vma, addr); if (!new_page) break; /* Out of memory */ - /* - * The memcg-specific accounting when moving - * pages around the LRU lists relies on the - * page's owner (memcg) to be valid. Usually, - * pages are assigned to a new owner before - * being put on the LRU list, but since this - * is not the case here, the stale owner from - * a previous allocation cycle must be reset. - */ - mem_cgroup_reset_owner(new_page); } /* -- cgit v1.2.3-70-g09d2