diff options
| author | Takashi Iwai <tiwai@suse.de> | 2012-01-31 15:13:14 +0100 | 
|---|---|---|
| committer | Takashi Iwai <tiwai@suse.de> | 2012-01-31 15:13:14 +0100 | 
| commit | ea51e5040e24eefe44d70bc654a237ca1f0225b0 (patch) | |
| tree | df2e5922dcdfafae62a10d8cd97f98121064fc23 /kernel/sched/core.c | |
| parent | 3422a47041b8cb8f14ac1e3926bcf711121df6dc (diff) | |
| parent | 8dbd52daee38adaae4d5a674bcca837e694a4f4c (diff) | |
Merge branch 'fix/asoc' into for-linus
Diffstat (limited to 'kernel/sched/core.c')
| -rw-r--r-- | kernel/sched/core.c | 8150 | 
1 files changed, 8150 insertions, 0 deletions
| diff --git a/kernel/sched/core.c b/kernel/sched/core.c new file mode 100644 index 000000000000..df00cb09263e --- /dev/null +++ b/kernel/sched/core.c @@ -0,0 +1,8150 @@ +/* + *  kernel/sched/core.c + * + *  Kernel scheduler and related syscalls + * + *  Copyright (C) 1991-2002  Linus Torvalds + * + *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and + *		make semaphores SMP safe + *  1998-11-19	Implemented schedule_timeout() and related stuff + *		by Andrea Arcangeli + *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar: + *		hybrid priority-list and round-robin design with + *		an array-switch method of distributing timeslices + *		and per-CPU runqueues.  Cleanups and useful suggestions + *		by Davide Libenzi, preemptible kernel bits by Robert Love. + *  2003-09-03	Interactivity tuning by Con Kolivas. + *  2004-04-02	Scheduler domains code by Nick Piggin + *  2007-04-15  Work begun on replacing all interactivity tuning with a + *              fair scheduling design by Con Kolivas. 
+ *  2007-05-05  Load balancing (smp-nice) and other improvements + *              by Peter Williams + *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith + *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri + *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins, + *              Thomas Gleixner, Mike Kravetz + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/nmi.h> +#include <linux/init.h> +#include <linux/uaccess.h> +#include <linux/highmem.h> +#include <asm/mmu_context.h> +#include <linux/interrupt.h> +#include <linux/capability.h> +#include <linux/completion.h> +#include <linux/kernel_stat.h> +#include <linux/debug_locks.h> +#include <linux/perf_event.h> +#include <linux/security.h> +#include <linux/notifier.h> +#include <linux/profile.h> +#include <linux/freezer.h> +#include <linux/vmalloc.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/pid_namespace.h> +#include <linux/smp.h> +#include <linux/threads.h> +#include <linux/timer.h> +#include <linux/rcupdate.h> +#include <linux/cpu.h> +#include <linux/cpuset.h> +#include <linux/percpu.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/sysctl.h> +#include <linux/syscalls.h> +#include <linux/times.h> +#include <linux/tsacct_kern.h> +#include <linux/kprobes.h> +#include <linux/delayacct.h> +#include <linux/unistd.h> +#include <linux/pagemap.h> +#include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/debugfs.h> +#include <linux/ctype.h> +#include <linux/ftrace.h> +#include <linux/slab.h> +#include <linux/init_task.h> + +#include <asm/tlb.h> +#include <asm/irq_regs.h> +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif + +#include "sched.h" +#include "../workqueue_sched.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/sched.h> + +void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) +{ +	unsigned long delta; +	ktime_t soft, hard, now; + +	for 
(;;) { +		if (hrtimer_active(period_timer)) +			break; + +		now = hrtimer_cb_get_time(period_timer); +		hrtimer_forward(period_timer, now, period); + +		soft = hrtimer_get_softexpires(period_timer); +		hard = hrtimer_get_expires(period_timer); +		delta = ktime_to_ns(ktime_sub(hard, soft)); +		__hrtimer_start_range_ns(period_timer, soft, delta, +					 HRTIMER_MODE_ABS_PINNED, 0); +	} +} + +DEFINE_MUTEX(sched_domains_mutex); +DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +static void update_rq_clock_task(struct rq *rq, s64 delta); + +void update_rq_clock(struct rq *rq) +{ +	s64 delta; + +	if (rq->skip_clock_update > 0) +		return; + +	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; +	rq->clock += delta; +	update_rq_clock_task(rq, delta); +} + +/* + * Debugging: various feature bits + */ + +#define SCHED_FEAT(name, enabled)	\ +	(1UL << __SCHED_FEAT_##name) * enabled | + +const_debug unsigned int sysctl_sched_features = +#include "features.h" +	0; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_FEAT(name, enabled)	\ +	#name , + +static __read_mostly char *sched_feat_names[] = { +#include "features.h" +	NULL +}; + +#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ +	int i; + +	for (i = 0; i < __SCHED_FEAT_NR; i++) { +		if (!(sysctl_sched_features & (1UL << i))) +			seq_puts(m, "NO_"); +		seq_printf(m, "%s ", sched_feat_names[i]); +	} +	seq_puts(m, "\n"); + +	return 0; +} + +#ifdef HAVE_JUMP_LABEL + +#define jump_label_key__true  jump_label_key_enabled +#define jump_label_key__false jump_label_key_disabled + +#define SCHED_FEAT(name, enabled)	\ +	jump_label_key__##enabled , + +struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static void sched_feat_disable(int i) +{ +	if (jump_label_enabled(&sched_feat_keys[i])) +		jump_label_dec(&sched_feat_keys[i]); +} + +static void sched_feat_enable(int i) +{ +	if (!jump_label_enabled(&sched_feat_keys[i])) +		
jump_label_inc(&sched_feat_keys[i]); +} +#else +static void sched_feat_disable(int i) { }; +static void sched_feat_enable(int i) { }; +#endif /* HAVE_JUMP_LABEL */ + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, +		size_t cnt, loff_t *ppos) +{ +	char buf[64]; +	char *cmp; +	int neg = 0; +	int i; + +	if (cnt > 63) +		cnt = 63; + +	if (copy_from_user(&buf, ubuf, cnt)) +		return -EFAULT; + +	buf[cnt] = 0; +	cmp = strstrip(buf); + +	if (strncmp(cmp, "NO_", 3) == 0) { +		neg = 1; +		cmp += 3; +	} + +	for (i = 0; i < __SCHED_FEAT_NR; i++) { +		if (strcmp(cmp, sched_feat_names[i]) == 0) { +			if (neg) { +				sysctl_sched_features &= ~(1UL << i); +				sched_feat_disable(i); +			} else { +				sysctl_sched_features |= (1UL << i); +				sched_feat_enable(i); +			} +			break; +		} +	} + +	if (i == __SCHED_FEAT_NR) +		return -EINVAL; + +	*ppos += cnt; + +	return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ +	return single_open(filp, sched_feat_show, NULL); +} + +static const struct file_operations sched_feat_fops = { +	.open		= sched_feat_open, +	.write		= sched_feat_write, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= single_release, +}; + +static __init int sched_init_debug(void) +{ +	debugfs_create_file("sched_features", 0644, NULL, NULL, +			&sched_feat_fops); + +	return 0; +} +late_initcall(sched_init_debug); +#endif /* CONFIG_SCHED_DEBUG */ + +/* + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +const_debug unsigned int sysctl_sched_nr_migrate = 32; + +/* + * period over which we average the RT time consumption, measured + * in ms. + * + * default: 1s + */ +const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; + +/* + * period over which we measure -rt task cpu usage in us. 
+ * default: 1s + */ +unsigned int sysctl_sched_rt_period = 1000000; + +__read_mostly int scheduler_running; + +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + + + +/* + * __task_rq_lock - lock the rq @p resides on. + */ +static inline struct rq *__task_rq_lock(struct task_struct *p) +	__acquires(rq->lock) +{ +	struct rq *rq; + +	lockdep_assert_held(&p->pi_lock); + +	for (;;) { +		rq = task_rq(p); +		raw_spin_lock(&rq->lock); +		if (likely(rq == task_rq(p))) +			return rq; +		raw_spin_unlock(&rq->lock); +	} +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. + */ +static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) +	__acquires(p->pi_lock) +	__acquires(rq->lock) +{ +	struct rq *rq; + +	for (;;) { +		raw_spin_lock_irqsave(&p->pi_lock, *flags); +		rq = task_rq(p); +		raw_spin_lock(&rq->lock); +		if (likely(rq == task_rq(p))) +			return rq; +		raw_spin_unlock(&rq->lock); +		raw_spin_unlock_irqrestore(&p->pi_lock, *flags); +	} +} + +static void __task_rq_unlock(struct rq *rq) +	__releases(rq->lock) +{ +	raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) +	__releases(rq->lock) +	__releases(p->pi_lock) +{ +	raw_spin_unlock(&rq->lock); +	raw_spin_unlock_irqrestore(&p->pi_lock, *flags); +} + +/* + * this_rq_lock - lock this runqueue and disable interrupts. + */ +static struct rq *this_rq_lock(void) +	__acquires(rq->lock) +{ +	struct rq *rq; + +	local_irq_disable(); +	rq = this_rq(); +	raw_spin_lock(&rq->lock); + +	return rq; +} + +#ifdef CONFIG_SCHED_HRTICK +/* + * Use HR-timers to deliver accurate preemption points. + * + * Its all a bit involved since we cannot program an hrt while holding the + * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a + * reschedule event. 
+ * + * When we get rescheduled we reprogram the hrtick_timer outside of the + * rq->lock. + */ + +static void hrtick_clear(struct rq *rq) +{ +	if (hrtimer_active(&rq->hrtick_timer)) +		hrtimer_cancel(&rq->hrtick_timer); +} + +/* + * High-resolution timer tick. + * Runs from hardirq context with interrupts disabled. + */ +static enum hrtimer_restart hrtick(struct hrtimer *timer) +{ +	struct rq *rq = container_of(timer, struct rq, hrtick_timer); + +	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + +	raw_spin_lock(&rq->lock); +	update_rq_clock(rq); +	rq->curr->sched_class->task_tick(rq, rq->curr, 1); +	raw_spin_unlock(&rq->lock); + +	return HRTIMER_NORESTART; +} + +#ifdef CONFIG_SMP +/* + * called from hardirq (IPI) context + */ +static void __hrtick_start(void *arg) +{ +	struct rq *rq = arg; + +	raw_spin_lock(&rq->lock); +	hrtimer_restart(&rq->hrtick_timer); +	rq->hrtick_csd_pending = 0; +	raw_spin_unlock(&rq->lock); +} + +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +void hrtick_start(struct rq *rq, u64 delay) +{ +	struct hrtimer *timer = &rq->hrtick_timer; +	ktime_t time = ktime_add_ns(timer->base->get_time(), delay); + +	hrtimer_set_expires(timer, time); + +	if (rq == this_rq()) { +		hrtimer_restart(timer); +	} else if (!rq->hrtick_csd_pending) { +		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); +		rq->hrtick_csd_pending = 1; +	} +} + +static int +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ +	int cpu = (int)(long)hcpu; + +	switch (action) { +	case CPU_UP_CANCELED: +	case CPU_UP_CANCELED_FROZEN: +	case CPU_DOWN_PREPARE: +	case CPU_DOWN_PREPARE_FROZEN: +	case CPU_DEAD: +	case CPU_DEAD_FROZEN: +		hrtick_clear(cpu_rq(cpu)); +		return NOTIFY_OK; +	} + +	return NOTIFY_DONE; +} + +static __init void init_hrtick(void) +{ +	hotcpu_notifier(hotplug_hrtick, 0); +} +#else +/* + * Called to set the hrtick timer state. 
+ * + * called with rq->lock held and irqs disabled + */ +void hrtick_start(struct rq *rq, u64 delay) +{ +	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, +			HRTIMER_MODE_REL_PINNED, 0); +} + +static inline void init_hrtick(void) +{ +} +#endif /* CONFIG_SMP */ + +static void init_rq_hrtick(struct rq *rq) +{ +#ifdef CONFIG_SMP +	rq->hrtick_csd_pending = 0; + +	rq->hrtick_csd.flags = 0; +	rq->hrtick_csd.func = __hrtick_start; +	rq->hrtick_csd.info = rq; +#endif + +	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	rq->hrtick_timer.function = hrtick; +} +#else	/* CONFIG_SCHED_HRTICK */ +static inline void hrtick_clear(struct rq *rq) +{ +} + +static inline void init_rq_hrtick(struct rq *rq) +{ +} + +static inline void init_hrtick(void) +{ +} +#endif	/* CONFIG_SCHED_HRTICK */ + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +#ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + +void resched_task(struct task_struct *p) +{ +	int cpu; + +	assert_raw_spin_locked(&task_rq(p)->lock); + +	if (test_tsk_need_resched(p)) +		return; + +	set_tsk_need_resched(p); + +	cpu = task_cpu(p); +	if (cpu == smp_processor_id()) +		return; + +	/* NEED_RESCHED must be visible before we test polling */ +	smp_mb(); +	if (!tsk_is_polling(p)) +		smp_send_reschedule(cpu); +} + +void resched_cpu(int cpu) +{ +	struct rq *rq = cpu_rq(cpu); +	unsigned long flags; + +	if (!raw_spin_trylock_irqsave(&rq->lock, flags)) +		return; +	resched_task(cpu_curr(cpu)); +	raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +#ifdef CONFIG_NO_HZ +/* + * In the semi idle case, use the nearest busy cpu for migrating timers + * from an idle cpu.  This is good for power-savings. 
+ * + * We don't do similar optimization for completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be uptodate wrt jiffies etc). + */ +int get_nohz_timer_target(void) +{ +	int cpu = smp_processor_id(); +	int i; +	struct sched_domain *sd; + +	rcu_read_lock(); +	for_each_domain(cpu, sd) { +		for_each_cpu(i, sched_domain_span(sd)) { +			if (!idle_cpu(i)) { +				cpu = i; +				goto unlock; +			} +		} +	} +unlock: +	rcu_read_unlock(); +	return cpu; +} +/* + * When add_timer_on() enqueues a timer into the timer wheel of an + * idle CPU then this timer might expire before the next timer event + * which is scheduled to wake up that CPU. In case of a completely + * idle system the next event might even be infinite time into the + * future. wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. + */ +void wake_up_idle_cpu(int cpu) +{ +	struct rq *rq = cpu_rq(cpu); + +	if (cpu == smp_processor_id()) +		return; + +	/* +	 * This is safe, as this function is called with the timer +	 * wheel base lock of (cpu) held. When the CPU is on the way +	 * to idle and has not yet set rq->curr to idle then it will +	 * be serialized on the timer wheel base lock and take the new +	 * timer into account automatically. +	 */ +	if (rq->curr != rq->idle) +		return; + +	/* +	 * We can set TIF_RESCHED on the idle task of the other CPU +	 * lockless. 
The worst case is that the other CPU runs the +	 * idle task through an additional NOOP schedule() +	 */ +	set_tsk_need_resched(rq->idle); + +	/* NEED_RESCHED must be visible before we test polling */ +	smp_mb(); +	if (!tsk_is_polling(rq->idle)) +		smp_send_reschedule(cpu); +} + +static inline bool got_nohz_idle_kick(void) +{ +	int cpu = smp_processor_id(); +	return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); +} + +#else /* CONFIG_NO_HZ */ + +static inline bool got_nohz_idle_kick(void) +{ +	return false; +} + +#endif /* CONFIG_NO_HZ */ + +void sched_avg_update(struct rq *rq) +{ +	s64 period = sched_avg_period(); + +	while ((s64)(rq->clock - rq->age_stamp) > period) { +		/* +		 * Inline assembly required to prevent the compiler +		 * optimising this loop into a divmod call. +		 * See __iter_div_u64_rem() for another example of this. +		 */ +		asm("" : "+rm" (rq->age_stamp)); +		rq->age_stamp += period; +		rq->rt_avg /= 2; +	} +} + +#else /* !CONFIG_SMP */ +void resched_task(struct task_struct *p) +{ +	assert_raw_spin_locked(&task_rq(p)->lock); +	set_tsk_need_resched(p); +} +#endif /* CONFIG_SMP */ + +#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ +			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) +/* + * Iterate task_group tree rooted at *from, calling @down when first entering a + * node and @up when leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. 
+ */ +int walk_tg_tree_from(struct task_group *from, +			     tg_visitor down, tg_visitor up, void *data) +{ +	struct task_group *parent, *child; +	int ret; + +	parent = from; + +down: +	ret = (*down)(parent, data); +	if (ret) +		goto out; +	list_for_each_entry_rcu(child, &parent->children, siblings) { +		parent = child; +		goto down; + +up: +		continue; +	} +	ret = (*up)(parent, data); +	if (ret || parent == from) +		goto out; + +	child = parent; +	parent = parent->parent; +	if (parent) +		goto up; +out: +	return ret; +} + +int tg_nop(struct task_group *tg, void *data) +{ +	return 0; +} +#endif + +void update_cpu_load(struct rq *this_rq); + +static void set_load_weight(struct task_struct *p) +{ +	int prio = p->static_prio - MAX_RT_PRIO; +	struct load_weight *load = &p->se.load; + +	/* +	 * SCHED_IDLE tasks get minimal weight: +	 */ +	if (p->policy == SCHED_IDLE) { +		load->weight = scale_load(WEIGHT_IDLEPRIO); +		load->inv_weight = WMULT_IDLEPRIO; +		return; +	} + +	load->weight = scale_load(prio_to_weight[prio]); +	load->inv_weight = prio_to_wmult[prio]; +} + +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +{ +	update_rq_clock(rq); +	sched_info_queued(p); +	p->sched_class->enqueue_task(rq, p, flags); +} + +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +{ +	update_rq_clock(rq); +	sched_info_dequeued(p); +	p->sched_class->dequeue_task(rq, p, flags); +} + +/* + * activate_task - move a task to the runqueue. + */ +void activate_task(struct rq *rq, struct task_struct *p, int flags) +{ +	if (task_contributes_to_load(p)) +		rq->nr_uninterruptible--; + +	enqueue_task(rq, p, flags); +} + +/* + * deactivate_task - remove a task from the runqueue. 
+ */ +void deactivate_task(struct rq *rq, struct task_struct *p, int flags) +{ +	if (task_contributes_to_load(p)) +		rq->nr_uninterruptible++; + +	dequeue_task(rq, p, flags); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. + */ +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ +	sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ +	sched_clock_irqtime = 0; +} + +#ifndef CONFIG_64BIT +static DEFINE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ +	__this_cpu_inc(irq_time_seq.sequence); +	smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ +	smp_wmb(); +	__this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ +	u64 irq_time; +	unsigned seq; + +	do { +		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); +		irq_time = per_cpu(cpu_softirq_time, cpu) + +			   per_cpu(cpu_hardirq_time, cpu); +	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + +	return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ +	return 
per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ + +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ +void account_system_vtime(struct task_struct *curr) +{ +	unsigned long flags; +	s64 delta; +	int cpu; + +	if (!sched_clock_irqtime) +		return; + +	local_irq_save(flags); + +	cpu = smp_processor_id(); +	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); +	__this_cpu_add(irq_start_time, delta); + +	irq_time_write_begin(); +	/* +	 * We do not account for softirq time from ksoftirqd here. +	 * We want to continue accounting softirq time to ksoftirqd thread +	 * in that case, so as not to confuse scheduler with a special task +	 * that do not consume any time, but still wants to run. +	 */ +	if (hardirq_count()) +		__this_cpu_add(cpu_hardirq_time, delta); +	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) +		__this_cpu_add(cpu_softirq_time, delta); + +	irq_time_write_end(); +	local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(account_system_vtime); + +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_PARAVIRT +static inline u64 steal_ticks(u64 steal) +{ +	if (unlikely(steal > NSEC_PER_SEC)) +		return div_u64(steal, TICK_NSEC); + +	return __iter_div_u64_rem(steal, TICK_NSEC, &steal); +} +#endif + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ +/* + * In theory, the compile should just see 0 here, and optimize out the call + * to sched_rt_avg_update. But I don't trust it... + */ +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +	s64 steal = 0, irq_delta = 0; +#endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + +	/* +	 * Since irq_time is only updated on {soft,}irq_exit, we might run into +	 * this case when a previous update_rq_clock() happened inside a +	 * {soft,}irq region. 
+	 * +	 * When this happens, we stop ->clock_task and only update the +	 * prev_irq_time stamp to account for the part that fit, so that a next +	 * update will consume the rest. This ensures ->clock_task is +	 * monotonic. +	 * +	 * It does however cause some slight miss-attribution of {soft,}irq +	 * time, a more accurate solution would be to update the irq_time using +	 * the current rq->clock timestamp, except that would require using +	 * atomic ops. +	 */ +	if (irq_delta > delta) +		irq_delta = delta; + +	rq->prev_irq_time += irq_delta; +	delta -= irq_delta; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING +	if (static_branch((&paravirt_steal_rq_enabled))) { +		u64 st; + +		steal = paravirt_steal_clock(cpu_of(rq)); +		steal -= rq->prev_steal_time_rq; + +		if (unlikely(steal > delta)) +			steal = delta; + +		st = steal_ticks(steal); +		steal = st * TICK_NSEC; + +		rq->prev_steal_time_rq += steal; + +		delta -= steal; +	} +#endif + +	rq->clock_task += delta; + +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) +	if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) +		sched_rt_avg_update(rq, irq_delta + steal); +#endif +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int irqtime_account_hi_update(void) +{ +	u64 *cpustat = kcpustat_this_cpu->cpustat; +	unsigned long flags; +	u64 latest_ns; +	int ret = 0; + +	local_irq_save(flags); +	latest_ns = this_cpu_read(cpu_hardirq_time); +	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) +		ret = 1; +	local_irq_restore(flags); +	return ret; +} + +static int irqtime_account_si_update(void) +{ +	u64 *cpustat = kcpustat_this_cpu->cpustat; +	unsigned long flags; +	u64 latest_ns; +	int ret = 0; + +	local_irq_save(flags); +	latest_ns = this_cpu_read(cpu_softirq_time); +	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) +		ret = 1; +	local_irq_restore(flags); +	return ret; +} + +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#define sched_clock_irqtime	(0) + +#endif + +void 
sched_set_stop_task(int cpu, struct task_struct *stop) +{ +	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; +	struct task_struct *old_stop = cpu_rq(cpu)->stop; + +	if (stop) { +		/* +		 * Make it appear like a SCHED_FIFO task, its something +		 * userspace knows about and won't get confused about. +		 * +		 * Also, it will make PI more or less work without too +		 * much confusion -- but then, stop work should not +		 * rely on PI working anyway. +		 */ +		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); + +		stop->sched_class = &stop_sched_class; +	} + +	cpu_rq(cpu)->stop = stop; + +	if (old_stop) { +		/* +		 * Reset it back to a normal scheduling class so that +		 * it can die in pieces. +		 */ +		old_stop->sched_class = &rt_sched_class; +	} +} + +/* + * __normal_prio - return the priority that is based on the static prio + */ +static inline int __normal_prio(struct task_struct *p) +{ +	return p->static_prio; +} + +/* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(struct task_struct *p) +{ +	int prio; + +	if (task_has_rt_policy(p)) +		prio = MAX_RT_PRIO-1 - p->rt_priority; +	else +		prio = __normal_prio(p); +	return prio; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(struct task_struct *p) +{ +	p->normal_prio = normal_prio(p); +	/* +	 * If we are RT tasks or we were boosted to RT priority, +	 * keep the priority unchanged. 
Otherwise, update priority +	 * to the normal priority: +	 */ +	if (!rt_prio(p->prio)) +		return p->normal_prio; +	return p->prio; +} + +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. + */ +inline int task_curr(const struct task_struct *p) +{ +	return cpu_curr(task_cpu(p)) == p; +} + +static inline void check_class_changed(struct rq *rq, struct task_struct *p, +				       const struct sched_class *prev_class, +				       int oldprio) +{ +	if (prev_class != p->sched_class) { +		if (prev_class->switched_from) +			prev_class->switched_from(rq, p); +		p->sched_class->switched_to(rq, p); +	} else if (oldprio != p->prio) +		p->sched_class->prio_changed(rq, p, oldprio); +} + +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +{ +	const struct sched_class *class; + +	if (p->sched_class == rq->curr->sched_class) { +		rq->curr->sched_class->check_preempt_curr(rq, p, flags); +	} else { +		for_each_class(class) { +			if (class == rq->curr->sched_class) +				break; +			if (class == p->sched_class) { +				resched_task(rq->curr); +				break; +			} +		} +	} + +	/* +	 * A queue event has occurred, and we're going to schedule.  In +	 * this case, we can save a useless back to back clock update. +	 */ +	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) +		rq->skip_clock_update = 1; +} + +#ifdef CONFIG_SMP +void set_task_cpu(struct task_struct *p, unsigned int new_cpu) +{ +#ifdef CONFIG_SCHED_DEBUG +	/* +	 * We should never call set_task_cpu() on a blocked task, +	 * ttwu() will sort out the placement. +	 */ +	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && +			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + +#ifdef CONFIG_LOCKDEP +	/* +	 * The caller should hold either p->pi_lock or rq->lock, when changing +	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 
+	 * +	 * sched_move_task() holds both and thus holding either pins the cgroup, +	 * see set_task_rq(). +	 * +	 * Furthermore, all task_rq users should acquire both locks, see +	 * task_rq_lock(). +	 */ +	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || +				      lockdep_is_held(&task_rq(p)->lock))); +#endif +#endif + +	trace_sched_migrate_task(p, new_cpu); + +	if (task_cpu(p) != new_cpu) { +		p->se.nr_migrations++; +		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); +	} + +	__set_task_cpu(p, new_cpu); +} + +struct migration_arg { +	struct task_struct *task; +	int dest_cpu; +}; + +static int migration_cpu_stop(void *data); + +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change.  If it changes, i.e. @p might have woken up, + * then return zero.  When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count).  If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +unsigned long wait_task_inactive(struct task_struct *p, long match_state) +{ +	unsigned long flags; +	int running, on_rq; +	unsigned long ncsw; +	struct rq *rq; + +	for (;;) { +		/* +		 * We do the initial early heuristics without holding +		 * any task-queue locks at all. We'll only try to get +		 * the runqueue lock when things look like they will +		 * work out! +		 */ +		rq = task_rq(p); + +		/* +		 * If the task is actively running on another CPU +		 * still, just relax and busy-wait without holding +		 * any locks. 
+		 * +		 * NOTE! Since we don't hold any locks, it's not +		 * even sure that "rq" stays as the right runqueue! +		 * But we don't care, since "task_running()" will +		 * return false if the runqueue has changed and p +		 * is actually now running somewhere else! +		 */ +		while (task_running(rq, p)) { +			if (match_state && unlikely(p->state != match_state)) +				return 0; +			cpu_relax(); +		} + +		/* +		 * Ok, time to look more closely! We need the rq +		 * lock now, to be *sure*. If we're wrong, we'll +		 * just go back and repeat. +		 */ +		rq = task_rq_lock(p, &flags); +		trace_sched_wait_task(p); +		running = task_running(rq, p); +		on_rq = p->on_rq; +		ncsw = 0; +		if (!match_state || p->state == match_state) +			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ +		task_rq_unlock(rq, p, &flags); + +		/* +		 * If it changed from the expected state, bail out now. +		 */ +		if (unlikely(!ncsw)) +			break; + +		/* +		 * Was it really running after all now that we +		 * checked with the proper locks actually held? +		 * +		 * Oops. Go back and try again.. +		 */ +		if (unlikely(running)) { +			cpu_relax(); +			continue; +		} + +		/* +		 * It's not enough that it's not actively running, +		 * it must be off the runqueue _entirely_, and not +		 * preempted! +		 * +		 * So if it was still runnable (but just not actively +		 * running right now), it's preempted, and we should +		 * yield - it could be a while. +		 */ +		if (unlikely(on_rq)) { +			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + +			set_current_state(TASK_UNINTERRUPTIBLE); +			schedule_hrtimeout(&to, HRTIMER_MODE_REL); +			continue; +		} + +		/* +		 * Ahh, all good. It wasn't running, and it wasn't +		 * runnable, which means that it will never become +		 * running in the future either. We're all done! 
+		 */ +		break; +	} + +	return ncsw; +} + +/*** + * kick_process - kick a running thread to enter/exit the kernel + * @p: the to-be-kicked thread + * + * Cause a process which is running on another CPU to enter + * kernel-mode, without any delay. (to get signals handled.) + * + * NOTE: this function doesn't have to take the runqueue lock, + * because all it wants to ensure is that the remote task enters + * the kernel. If the IPI races and the task has been migrated + * to another CPU then no harm is done and the purpose has been + * achieved as well. + */ +void kick_process(struct task_struct *p) +{ +	int cpu; + +	preempt_disable(); +	cpu = task_cpu(p); +	if ((cpu != smp_processor_id()) && task_curr(p)) +		smp_send_reschedule(cpu); +	preempt_enable(); +} +EXPORT_SYMBOL_GPL(kick_process); +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_SMP +/* + * ->cpus_allowed is protected by both rq->lock and p->pi_lock + */ +static int select_fallback_rq(int cpu, struct task_struct *p) +{ +	int dest_cpu; +	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + +	/* Look for allowed, online CPU in same node. */ +	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) +		if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) +			return dest_cpu; + +	/* Any allowed, online CPU? */ +	dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); +	if (dest_cpu < nr_cpu_ids) +		return dest_cpu; + +	/* No more Mr. Nice Guy. */ +	dest_cpu = cpuset_cpus_allowed_fallback(p); +	/* +	 * Don't tell them about moving exiting tasks or +	 * kernel threads (both mm NULL), since they never +	 * leave kernel. +	 */ +	if (p->mm && printk_ratelimit()) { +		printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", +				task_pid_nr(p), p->comm, cpu); +	} + +	return dest_cpu; +} + +/* + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 
+ */ +static inline +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +{ +	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + +	/* +	 * In order not to call set_task_cpu() on a blocking task we need +	 * to rely on ttwu() to place the task on a valid ->cpus_allowed +	 * cpu. +	 * +	 * Since this is common to all placement strategies, this lives here. +	 * +	 * [ this allows ->select_task() to simply return task_cpu(p) and +	 *   not worry about this generic constraint ] +	 */ +	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || +		     !cpu_online(cpu))) +		cpu = select_fallback_rq(task_cpu(p), p); + +	return cpu; +} + +static void update_avg(u64 *avg, u64 sample) +{ +	s64 diff = sample - *avg; +	*avg += diff >> 3; +} +#endif + +static void +ttwu_stat(struct task_struct *p, int cpu, int wake_flags) +{ +#ifdef CONFIG_SCHEDSTATS +	struct rq *rq = this_rq(); + +#ifdef CONFIG_SMP +	int this_cpu = smp_processor_id(); + +	if (cpu == this_cpu) { +		schedstat_inc(rq, ttwu_local); +		schedstat_inc(p, se.statistics.nr_wakeups_local); +	} else { +		struct sched_domain *sd; + +		schedstat_inc(p, se.statistics.nr_wakeups_remote); +		rcu_read_lock(); +		for_each_domain(this_cpu, sd) { +			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { +				schedstat_inc(sd, ttwu_wake_remote); +				break; +			} +		} +		rcu_read_unlock(); +	} + +	if (wake_flags & WF_MIGRATED) +		schedstat_inc(p, se.statistics.nr_wakeups_migrate); + +#endif /* CONFIG_SMP */ + +	schedstat_inc(rq, ttwu_count); +	schedstat_inc(p, se.statistics.nr_wakeups); + +	if (wake_flags & WF_SYNC) +		schedstat_inc(p, se.statistics.nr_wakeups_sync); + +#endif /* CONFIG_SCHEDSTATS */ +} + +static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) +{ +	activate_task(rq, p, en_flags); +	p->on_rq = 1; + +	/* if a worker is waking up, notify workqueue */ +	if (p->flags & PF_WQ_WORKER) +		wq_worker_waking_up(p, cpu_of(rq)); +} + +/* + * Mark the task runnable and 
perform wakeup-preemption. + */ +static void +ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +{ +	trace_sched_wakeup(p, true); +	check_preempt_curr(rq, p, wake_flags); + +	p->state = TASK_RUNNING; +#ifdef CONFIG_SMP +	if (p->sched_class->task_woken) +		p->sched_class->task_woken(rq, p); + +	if (rq->idle_stamp) { +		u64 delta = rq->clock - rq->idle_stamp; +		u64 max = 2*sysctl_sched_migration_cost; + +		if (delta > max) +			rq->avg_idle = max; +		else +			update_avg(&rq->avg_idle, delta); +		rq->idle_stamp = 0; +	} +#endif +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +{ +#ifdef CONFIG_SMP +	if (p->sched_contributes_to_load) +		rq->nr_uninterruptible--; +#endif + +	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); +	ttwu_do_wakeup(rq, p, wake_flags); +} + +/* + * Called in case the task @p isn't fully descheduled from its runqueue, + * in this case we must do a remote wakeup. Its a 'light' wakeup though, + * since all we need to do is flip p->state to TASK_RUNNING, since + * the task is still ->on_rq. + */ +static int ttwu_remote(struct task_struct *p, int wake_flags) +{ +	struct rq *rq; +	int ret = 0; + +	rq = __task_rq_lock(p); +	if (p->on_rq) { +		ttwu_do_wakeup(rq, p, wake_flags); +		ret = 1; +	} +	__task_rq_unlock(rq); + +	return ret; +} + +#ifdef CONFIG_SMP +static void sched_ttwu_pending(void) +{ +	struct rq *rq = this_rq(); +	struct llist_node *llist = llist_del_all(&rq->wake_list); +	struct task_struct *p; + +	raw_spin_lock(&rq->lock); + +	while (llist) { +		p = llist_entry(llist, struct task_struct, wake_entry); +		llist = llist_next(llist); +		ttwu_do_activate(rq, p, 0); +	} + +	raw_spin_unlock(&rq->lock); +} + +void scheduler_ipi(void) +{ +	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) +		return; + +	/* +	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since +	 * traditionally all their work was done from the interrupt return +	 * path. 
Now that we actually do some work, we need to make sure +	 * we do call them. +	 * +	 * Some archs already do call them, luckily irq_enter/exit nest +	 * properly. +	 * +	 * Arguably we should visit all archs and update all handlers, +	 * however a fair share of IPIs are still resched only so this would +	 * somewhat pessimize the simple resched case. +	 */ +	irq_enter(); +	sched_ttwu_pending(); + +	/* +	 * Check if someone kicked us for doing the nohz idle load balance. +	 */ +	if (unlikely(got_nohz_idle_kick() && !need_resched())) { +		this_rq()->idle_balance = 1; +		raise_softirq_irqoff(SCHED_SOFTIRQ); +	} +	irq_exit(); +} + +static void ttwu_queue_remote(struct task_struct *p, int cpu) +{ +	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) +		smp_send_reschedule(cpu); +} + +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +static int ttwu_activate_remote(struct task_struct *p, int wake_flags) +{ +	struct rq *rq; +	int ret = 0; + +	rq = __task_rq_lock(p); +	if (p->on_cpu) { +		ttwu_activate(rq, p, ENQUEUE_WAKEUP); +		ttwu_do_wakeup(rq, p, wake_flags); +		ret = 1; +	} +	__task_rq_unlock(rq); + +	return ret; + +} +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ + +static inline int ttwu_share_cache(int this_cpu, int that_cpu) +{ +	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); +} +#endif /* CONFIG_SMP */ + +static void ttwu_queue(struct task_struct *p, int cpu) +{ +	struct rq *rq = cpu_rq(cpu); + +#if defined(CONFIG_SMP) +	if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { +		sched_clock_cpu(cpu); /* sync clocks x-cpu */ +		ttwu_queue_remote(p, cpu); +		return; +	} +#endif + +	raw_spin_lock(&rq->lock); +	ttwu_do_activate(rq, p, 0); +	raw_spin_unlock(&rq->lock); +} + +/** + * try_to_wake_up - wake up a thread + * @p: the thread to be awakened + * @state: the mask of task states that can be woken + * @wake_flags: wake modifier flags (WF_*) + * + * Put it on the run-queue if it's not already there. 
The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * Returns %true if @p was woken up, %false if it was already running + * or @state didn't match @p's state. + */ +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +{ +	unsigned long flags; +	int cpu, success = 0; + +	smp_wmb(); +	raw_spin_lock_irqsave(&p->pi_lock, flags); +	if (!(p->state & state)) +		goto out; + +	success = 1; /* we're going to change ->state */ +	cpu = task_cpu(p); + +	if (p->on_rq && ttwu_remote(p, wake_flags)) +		goto stat; + +#ifdef CONFIG_SMP +	/* +	 * If the owning (remote) cpu is still in the middle of schedule() with +	 * this task as prev, wait until its done referencing the task. +	 */ +	while (p->on_cpu) { +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +		/* +		 * In case the architecture enables interrupts in +		 * context_switch(), we cannot busy wait, since that +		 * would lead to deadlocks when an interrupt hits and +		 * tries to wake up @prev. So bail and do a complete +		 * remote wakeup. +		 */ +		if (ttwu_activate_remote(p, wake_flags)) +			goto stat; +#else +		cpu_relax(); +#endif +	} +	/* +	 * Pairs with the smp_wmb() in finish_lock_switch(). 
+	 */ +	smp_rmb(); + +	p->sched_contributes_to_load = !!task_contributes_to_load(p); +	p->state = TASK_WAKING; + +	if (p->sched_class->task_waking) +		p->sched_class->task_waking(p); + +	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); +	if (task_cpu(p) != cpu) { +		wake_flags |= WF_MIGRATED; +		set_task_cpu(p, cpu); +	} +#endif /* CONFIG_SMP */ + +	ttwu_queue(p, cpu); +stat: +	ttwu_stat(p, cpu, wake_flags); +out: +	raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +	return success; +} + +/** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not already there. The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and not + * the current task. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ +	struct rq *rq = task_rq(p); + +	BUG_ON(rq != this_rq()); +	BUG_ON(p == current); +	lockdep_assert_held(&rq->lock); + +	if (!raw_spin_trylock(&p->pi_lock)) { +		raw_spin_unlock(&rq->lock); +		raw_spin_lock(&p->pi_lock); +		raw_spin_lock(&rq->lock); +	} + +	if (!(p->state & TASK_NORMAL)) +		goto out; + +	if (!p->on_rq) +		ttwu_activate(rq, p, ENQUEUE_WAKEUP); + +	ttwu_do_wakeup(rq, p, 0); +	ttwu_stat(p, smp_processor_id(), 0); +out: +	raw_spin_unlock(&p->pi_lock); +} + +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes.  Returns 1 if the process was woken up, 0 if it was already + * running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
+ */ +int wake_up_process(struct task_struct *p) +{ +	return try_to_wake_up(p, TASK_ALL, 0); +} +EXPORT_SYMBOL(wake_up_process); + +int wake_up_state(struct task_struct *p, unsigned int state) +{ +	return try_to_wake_up(p, state, 0); +} + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + * + * __sched_fork() is basic setup used by init_idle() too: + */ +static void __sched_fork(struct task_struct *p) +{ +	p->on_rq			= 0; + +	p->se.on_rq			= 0; +	p->se.exec_start		= 0; +	p->se.sum_exec_runtime		= 0; +	p->se.prev_sum_exec_runtime	= 0; +	p->se.nr_migrations		= 0; +	p->se.vruntime			= 0; +	INIT_LIST_HEAD(&p->se.group_node); + +#ifdef CONFIG_SCHEDSTATS +	memset(&p->se.statistics, 0, sizeof(p->se.statistics)); +#endif + +	INIT_LIST_HEAD(&p->rt.run_list); + +#ifdef CONFIG_PREEMPT_NOTIFIERS +	INIT_HLIST_HEAD(&p->preempt_notifiers); +#endif +} + +/* + * fork()/clone()-time setup: + */ +void sched_fork(struct task_struct *p) +{ +	unsigned long flags; +	int cpu = get_cpu(); + +	__sched_fork(p); +	/* +	 * We mark the process as running here. This guarantees that +	 * nobody will actually run it, and a signal or other external +	 * event cannot wake it up and insert it on the runqueue either. +	 */ +	p->state = TASK_RUNNING; + +	/* +	 * Make sure we do not leak PI boosting priority to the child. +	 */ +	p->prio = current->normal_prio; + +	/* +	 * Revert to default priority/policy on fork if requested. +	 */ +	if (unlikely(p->sched_reset_on_fork)) { +		if (task_has_rt_policy(p)) { +			p->policy = SCHED_NORMAL; +			p->static_prio = NICE_TO_PRIO(0); +			p->rt_priority = 0; +		} else if (PRIO_TO_NICE(p->static_prio) < 0) +			p->static_prio = NICE_TO_PRIO(0); + +		p->prio = p->normal_prio = __normal_prio(p); +		set_load_weight(p); + +		/* +		 * We don't need the reset flag anymore after the fork. 
It has +		 * fulfilled its duty: +		 */ +		p->sched_reset_on_fork = 0; +	} + +	if (!rt_prio(p->prio)) +		p->sched_class = &fair_sched_class; + +	if (p->sched_class->task_fork) +		p->sched_class->task_fork(p); + +	/* +	 * The child is not yet in the pid-hash so no cgroup attach races, +	 * and the cgroup is pinned to this child due to cgroup_fork() +	 * is ran before sched_fork(). +	 * +	 * Silence PROVE_RCU. +	 */ +	raw_spin_lock_irqsave(&p->pi_lock, flags); +	set_task_cpu(p, cpu); +	raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +	if (likely(sched_info_on())) +		memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif +#if defined(CONFIG_SMP) +	p->on_cpu = 0; +#endif +#ifdef CONFIG_PREEMPT_COUNT +	/* Want to start with kernel preemption disabled. */ +	task_thread_info(p)->preempt_count = 1; +#endif +#ifdef CONFIG_SMP +	plist_node_init(&p->pushable_tasks, MAX_PRIO); +#endif + +	put_cpu(); +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. 
+ */ +void wake_up_new_task(struct task_struct *p) +{ +	unsigned long flags; +	struct rq *rq; + +	raw_spin_lock_irqsave(&p->pi_lock, flags); +#ifdef CONFIG_SMP +	/* +	 * Fork balancing, do it here and not earlier because: +	 *  - cpus_allowed can change in the fork path +	 *  - any previously selected cpu might disappear through hotplug +	 */ +	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); +#endif + +	rq = __task_rq_lock(p); +	activate_task(rq, p, 0); +	p->on_rq = 1; +	trace_sched_wakeup_new(p, true); +	check_preempt_curr(rq, p, WF_FORK); +#ifdef CONFIG_SMP +	if (p->sched_class->task_woken) +		p->sched_class->task_woken(rq, p); +#endif +	task_rq_unlock(rq, p, &flags); +} + +#ifdef CONFIG_PREEMPT_NOTIFIERS + +/** + * preempt_notifier_register - tell me when current is being preempted & rescheduled + * @notifier: notifier struct to register + */ +void preempt_notifier_register(struct preempt_notifier *notifier) +{ +	hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); +} +EXPORT_SYMBOL_GPL(preempt_notifier_register); + +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * @notifier: notifier struct to unregister + * + * This is safe to call from within a preemption notifier. 
+ */ +void preempt_notifier_unregister(struct preempt_notifier *notifier) +{ +	hlist_del(¬ifier->link); +} +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +	struct preempt_notifier *notifier; +	struct hlist_node *node; + +	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) +		notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, +				 struct task_struct *next) +{ +	struct preempt_notifier *notifier; +	struct hlist_node *node; + +	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) +		notifier->ops->sched_out(notifier, next); +} + +#else /* !CONFIG_PREEMPT_NOTIFIERS */ + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, +				 struct task_struct *next) +{ +} + +#endif /* CONFIG_PREEMPT_NOTIFIERS */ + +/** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @prev: the current task that is being switched out + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. + */ +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, +		    struct task_struct *next) +{ +	sched_info_switch(prev, next); +	perf_event_task_sched_out(prev, next); +	fire_sched_out_preempt_notifiers(prev, next); +	prepare_lock_switch(rq, next); +	prepare_arch_switch(next); +	trace_sched_switch(prev, next); +} + +/** + * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch + * @prev: the thread we just switched away from. 
+ * + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +static void finish_task_switch(struct rq *rq, struct task_struct *prev) +	__releases(rq->lock) +{ +	struct mm_struct *mm = rq->prev_mm; +	long prev_state; + +	rq->prev_mm = NULL; + +	/* +	 * A task struct has one reference for the use as "current". +	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls +	 * schedule one last time. The schedule call will never return, and +	 * the scheduled task must drop that reference. +	 * The test for TASK_DEAD must occur while the runqueue locks are +	 * still held, otherwise prev could be scheduled on another cpu, die +	 * there before we look at prev->state, and then the reference would +	 * be dropped twice. +	 *		Manfred Spraul <manfred@colorfullife.com> +	 */ +	prev_state = prev->state; +	finish_arch_switch(prev); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +	local_irq_disable(); +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ +	perf_event_task_sched_in(prev, current); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +	local_irq_enable(); +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ +	finish_lock_switch(rq, prev); +	trace_sched_stat_sleeptime(current, rq->clock); + +	fire_sched_in_preempt_notifiers(current); +	if (mm) +		mmdrop(mm); +	if (unlikely(prev_state == TASK_DEAD)) { +		/* +		 * Remove function-return probe instances associated with this +		 * task and put them back on the free list. 
+		 */ +		kprobe_flush_task(prev); +		put_task_struct(prev); +	} +} + +#ifdef CONFIG_SMP + +/* assumes rq->lock is held */ +static inline void pre_schedule(struct rq *rq, struct task_struct *prev) +{ +	if (prev->sched_class->pre_schedule) +		prev->sched_class->pre_schedule(rq, prev); +} + +/* rq->lock is NOT held, but preemption is disabled */ +static inline void post_schedule(struct rq *rq) +{ +	if (rq->post_schedule) { +		unsigned long flags; + +		raw_spin_lock_irqsave(&rq->lock, flags); +		if (rq->curr->sched_class->post_schedule) +			rq->curr->sched_class->post_schedule(rq); +		raw_spin_unlock_irqrestore(&rq->lock, flags); + +		rq->post_schedule = 0; +	} +} + +#else + +static inline void pre_schedule(struct rq *rq, struct task_struct *p) +{ +} + +static inline void post_schedule(struct rq *rq) +{ +} + +#endif + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +asmlinkage void schedule_tail(struct task_struct *prev) +	__releases(rq->lock) +{ +	struct rq *rq = this_rq(); + +	finish_task_switch(rq, prev); + +	/* +	 * FIXME: do we need to worry about rq being invalidated by the +	 * task_switch? +	 */ +	post_schedule(rq); + +#ifdef __ARCH_WANT_UNLOCKED_CTXSW +	/* In this case, finish_task_switch does not reenable preemption */ +	preempt_enable(); +#endif +	if (current->set_child_tid) +		put_user(task_pid_vnr(current), current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +static inline void +context_switch(struct rq *rq, struct task_struct *prev, +	       struct task_struct *next) +{ +	struct mm_struct *mm, *oldmm; + +	prepare_task_switch(rq, prev, next); + +	mm = next->mm; +	oldmm = prev->active_mm; +	/* +	 * For paravirt, this is coupled with an exit in switch_to to +	 * combine the page table reload and the switch backend into +	 * one hypercall. 
+	 */ +	arch_start_context_switch(prev); + +	if (!mm) { +		next->active_mm = oldmm; +		atomic_inc(&oldmm->mm_count); +		enter_lazy_tlb(oldmm, next); +	} else +		switch_mm(oldmm, mm, next); + +	if (!prev->mm) { +		prev->active_mm = NULL; +		rq->prev_mm = oldmm; +	} +	/* +	 * Since the runqueue lock will be released by the next +	 * task (which is an invalid locking op but in the case +	 * of the scheduler it's an obvious special-case), so we +	 * do an early lockdep release here: +	 */ +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +	spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +#endif + +	/* Here we just switch the register state and the stack. */ +	switch_to(prev, next, prev); + +	barrier(); +	/* +	 * this_rq must be evaluated again because prev may have moved +	 * CPUs since it called schedule(), thus the 'rq' on its stack +	 * frame will be invalid. +	 */ +	finish_task_switch(this_rq(), prev); +} + +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, current number of uninterruptible-sleeping threads, total + * number of context switches performed since bootup. + */ +unsigned long nr_running(void) +{ +	unsigned long i, sum = 0; + +	for_each_online_cpu(i) +		sum += cpu_rq(i)->nr_running; + +	return sum; +} + +unsigned long nr_uninterruptible(void) +{ +	unsigned long i, sum = 0; + +	for_each_possible_cpu(i) +		sum += cpu_rq(i)->nr_uninterruptible; + +	/* +	 * Since we read the counters lockless, it might be slightly +	 * inaccurate. 
Do not allow it to go below zero though: +	 */ +	if (unlikely((long)sum < 0)) +		sum = 0; + +	return sum; +} + +unsigned long long nr_context_switches(void) +{ +	int i; +	unsigned long long sum = 0; + +	for_each_possible_cpu(i) +		sum += cpu_rq(i)->nr_switches; + +	return sum; +} + +unsigned long nr_iowait(void) +{ +	unsigned long i, sum = 0; + +	for_each_possible_cpu(i) +		sum += atomic_read(&cpu_rq(i)->nr_iowait); + +	return sum; +} + +unsigned long nr_iowait_cpu(int cpu) +{ +	struct rq *this = cpu_rq(cpu); +	return atomic_read(&this->nr_iowait); +} + +unsigned long this_cpu_load(void) +{ +	struct rq *this = this_rq(); +	return this->cpu_load[0]; +} + + +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +static long calc_load_fold_active(struct rq *this_rq) +{ +	long nr_active, delta = 0; + +	nr_active = this_rq->nr_running; +	nr_active += (long) this_rq->nr_uninterruptible; + +	if (nr_active != this_rq->calc_load_active) { +		delta = nr_active - this_rq->calc_load_active; +		this_rq->calc_load_active = nr_active; +	} + +	return delta; +} + +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ +	load *= exp; +	load += active * (FIXED_1 - exp); +	load += 1UL << (FSHIFT - 1); +	return load >> FSHIFT; +} + +#ifdef CONFIG_NO_HZ +/* + * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * + * When making the ILB scale, we should try to pull this in as well. + */ +static atomic_long_t calc_load_tasks_idle; + +void calc_load_account_idle(struct rq *this_rq) +{ +	long delta; + +	delta = calc_load_fold_active(this_rq); +	if (delta) +		atomic_long_add(delta, &calc_load_tasks_idle); +} + +static long calc_load_fold_idle(void) +{ +	long delta = 0; + +	/* +	 * Its got a race, we don't care... 
+	 */ +	if (atomic_long_read(&calc_load_tasks_idle)) +		delta = atomic_long_xchg(&calc_load_tasks_idle, 0); + +	return delta; +} + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x:         base of the power + * @frac_bits: fractional bits of @x + * @n:         power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ +	unsigned long result = 1UL << frac_bits; + +	if (n) for (;;) { +		if (n & 1) { +			result *= x; +			result += 1UL << (frac_bits - 1); +			result >>= frac_bits; +		} +		n >>= 1; +		if (!n) +			break; +		x *= x; +		x += 1UL << (frac_bits - 1); +		x >>= frac_bits; +	} + +	return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + *    = (a0 * e + a * (1 - e)) * e + a * (1 - e) + *    = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + *  ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... 
+ e^n-1) [1] + *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + *    = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + *              n         1 - x^(n+1) + *     S_n := \Sum x^i = ------------- + *             i=0          1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, +	    unsigned long active, unsigned int n) +{ + +	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(unsigned long ticks) +{ +	long delta, active, n; + +	if (time_before(jiffies, calc_load_update)) +		return; + +	/* +	 * If we crossed a calc_load_update boundary, make sure to fold +	 * any pending idle changes, the respective CPUs might have +	 * missed the tick driven calc_load_account_active() update +	 * due to NO_HZ. +	 */ +	delta = calc_load_fold_idle(); +	if (delta) +		atomic_long_add(delta, &calc_load_tasks); + +	/* +	 * If we were idle for multiple load cycles, apply them. +	 */ +	if (ticks >= LOAD_FREQ) { +		n = ticks / LOAD_FREQ; + +		active = atomic_long_read(&calc_load_tasks); +		active = active > 0 ? active * FIXED_1 : 0; + +		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); +		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); +		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + +		calc_load_update += n * LOAD_FREQ; +	} + +	/* +	 * Its possible the remainder of the above division also crosses +	 * a LOAD_FREQ period, the regular check in calc_global_load() +	 * which comes after this will take care of that. 
+	 * +	 * Consider us being 11 ticks before a cycle completion, and us +	 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will +	 * age us 4 cycles, and the test in calc_global_load() will +	 * pick up the final one. +	 */ +} +#else +void calc_load_account_idle(struct rq *this_rq) +{ +} + +static inline long calc_load_fold_idle(void) +{ +	return 0; +} + +static void calc_global_nohz(unsigned long ticks) +{ +} +#endif + +/** + * get_avenrun - get the load average array + * @loads:	pointer to dest load array + * @offset:	offset to add + * @shift:	shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ +	loads[0] = (avenrun[0] + offset) << shift; +	loads[1] = (avenrun[1] + offset) << shift; +	loads[2] = (avenrun[2] + offset) << shift; +} + +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(unsigned long ticks) +{ +	long active; + +	calc_global_nohz(ticks); + +	if (time_before(jiffies, calc_load_update + 10)) +		return; + +	active = atomic_long_read(&calc_load_tasks); +	active = active > 0 ? active * FIXED_1 : 0; + +	avenrun[0] = calc_load(avenrun[0], EXP_1, active); +	avenrun[1] = calc_load(avenrun[1], EXP_5, active); +	avenrun[2] = calc_load(avenrun[2], EXP_15, active); + +	calc_load_update += LOAD_FREQ; +} + +/* + * Called from update_cpu_load() to periodically update this CPU's + * active count. 
+ */ +static void calc_load_account_active(struct rq *this_rq) +{ +	long delta; + +	if (time_before(jiffies, this_rq->calc_load_update)) +		return; + +	delta  = calc_load_fold_active(this_rq); +	delta += calc_load_fold_idle(); +	if (delta) +		atomic_long_add(delta, &calc_load_tasks); + +	this_rq->calc_load_update += LOAD_FREQ; +} + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. 
+ */ +#define DEGRADE_SHIFT		7 +static const unsigned char +		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char +		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { +					{0, 0, 0, 0, 0, 0, 0, 0}, +					{64, 32, 8, 0, 0, 0, 0, 0}, +					{96, 72, 40, 12, 1, 0, 0}, +					{112, 98, 75, 43, 15, 1, 0}, +					{120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ +	int j = 0; + +	if (!missed_updates) +		return load; + +	if (missed_updates >= degrade_zero_ticks[idx]) +		return 0; + +	if (idx == 1) +		return load >> missed_updates; + +	while (missed_updates) { +		if (missed_updates % 2) +			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + +		missed_updates >>= 1; +		j++; +	} +	return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. 
+ */ +void update_cpu_load(struct rq *this_rq) +{ +	unsigned long this_load = this_rq->load.weight; +	unsigned long curr_jiffies = jiffies; +	unsigned long pending_updates; +	int i, scale; + +	this_rq->nr_load_updates++; + +	/* Avoid repeated calls on same jiffy, when moving in and out of idle */ +	if (curr_jiffies == this_rq->last_load_update_tick) +		return; + +	pending_updates = curr_jiffies - this_rq->last_load_update_tick; +	this_rq->last_load_update_tick = curr_jiffies; + +	/* Update our load: */ +	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ +	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { +		unsigned long old_load, new_load; + +		/* scale is effectively 1 << i now, and >> i divides by scale */ + +		old_load = this_rq->cpu_load[i]; +		old_load = decay_load_missed(old_load, pending_updates - 1, i); +		new_load = this_load; +		/* +		 * Round up the averaging division if load is increasing. This +		 * prevents us from getting stuck on 9 if the load is 10, for +		 * example. +		 */ +		if (new_load > old_load) +			new_load += scale - 1; + +		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; +	} + +	sched_avg_update(this_rq); +} + +static void update_cpu_load_active(struct rq *this_rq) +{ +	update_cpu_load(this_rq); + +	calc_load_account_active(this_rq); +} + +#ifdef CONFIG_SMP + +/* + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. 
+ */ +void sched_exec(void) +{ +	struct task_struct *p = current; +	unsigned long flags; +	int dest_cpu; + +	raw_spin_lock_irqsave(&p->pi_lock, flags); +	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); +	if (dest_cpu == smp_processor_id()) +		goto unlock; + +	if (likely(cpu_active(dest_cpu))) { +		struct migration_arg arg = { p, dest_cpu }; + +		raw_spin_unlock_irqrestore(&p->pi_lock, flags); +		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); +		return; +	} +unlock: +	raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} + +#endif + +DEFINE_PER_CPU(struct kernel_stat, kstat); +DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); + +EXPORT_PER_CPU_SYMBOL(kstat); +EXPORT_PER_CPU_SYMBOL(kernel_cpustat); + +/* + * Return any ns on the sched_clock that have not yet been accounted in + * @p in case that task is currently running. + * + * Called with task_rq_lock() held on @rq. + */ +static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) +{ +	u64 ns = 0; + +	if (task_current(rq, p)) { +		update_rq_clock(rq); +		ns = rq->clock_task - p->se.exec_start; +		if ((s64)ns < 0) +			ns = 0; +	} + +	return ns; +} + +unsigned long long task_delta_exec(struct task_struct *p) +{ +	unsigned long flags; +	struct rq *rq; +	u64 ns = 0; + +	rq = task_rq_lock(p, &flags); +	ns = do_task_delta_exec(p, rq); +	task_rq_unlock(rq, p, &flags); + +	return ns; +} + +/* + * Return accounted runtime for the task. + * In case the task is currently running, return the runtime plus current's + * pending runtime that have not been accounted yet. 
+ */ +unsigned long long task_sched_runtime(struct task_struct *p) +{ +	unsigned long flags; +	struct rq *rq; +	u64 ns = 0; + +	rq = task_rq_lock(p, &flags); +	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); +	task_rq_unlock(rq, p, &flags); + +	return ns; +} + +#ifdef CONFIG_CGROUP_CPUACCT +struct cgroup_subsys cpuacct_subsys; +struct cpuacct root_cpuacct; +#endif + +static inline void task_group_account_field(struct task_struct *p, int index, +					    u64 tmp) +{ +#ifdef CONFIG_CGROUP_CPUACCT +	struct kernel_cpustat *kcpustat; +	struct cpuacct *ca; +#endif +	/* +	 * Since all updates are sure to touch the root cgroup, we +	 * get ourselves ahead and touch it first. If the root cgroup +	 * is the only cgroup, then nothing else should be necessary. +	 * +	 * With cpuacct active, every non-root group from the task's +	 * group upwards is charged as well. +	 */ +	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp; + +#ifdef CONFIG_CGROUP_CPUACCT +	if (unlikely(!cpuacct_subsys.active)) +		return; + +	rcu_read_lock(); +	ca = task_ca(p); +	while (ca && (ca != &root_cpuacct)) { +		kcpustat = this_cpu_ptr(ca->cpustat); +		kcpustat->cpustat[index] += tmp; +		ca = parent_ca(ca); +	} +	rcu_read_unlock(); +#endif +} + + +/* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_user_time(struct task_struct *p, cputime_t cputime, +		       cputime_t cputime_scaled) +{ +	int index; + +	/* Add user time to process. */ +	p->utime += cputime; +	p->utimescaled += cputime_scaled; +	account_group_user_time(p, cputime); + +	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + +	/* Add user time to cpustat (CPUTIME_NICE for tasks with a positive nice value). */ +	task_group_account_field(p, index, (__force u64) cputime); + +	/* Account for user time used */ +	acct_update_integrals(p); +} + +/* + * Account guest cpu time to a process. 
+ * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +static void account_guest_time(struct task_struct *p, cputime_t cputime, +			       cputime_t cputime_scaled) +{ +	u64 *cpustat = kcpustat_this_cpu->cpustat; + +	/* Add guest time to process; it is counted as user time and as guest time. */ +	p->utime += cputime; +	p->utimescaled += cputime_scaled; +	account_group_user_time(p, cputime); +	p->gtime += cputime; + +	/* Add guest time to cpustat: the user/nice field and its guest counterpart. */ +	if (TASK_NICE(p) > 0) { +		cpustat[CPUTIME_NICE] += (__force u64) cputime; +		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; +	} else { +		cpustat[CPUTIME_USER] += (__force u64) cputime; +		cpustat[CPUTIME_GUEST] += (__force u64) cputime; +	} +} + +/* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @index: index of the cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, +			cputime_t cputime_scaled, int index) +{ +	/* Add system time to process. */ +	p->stime += cputime; +	p->stimescaled += cputime_scaled; +	account_group_system_time(p, cputime); + +	/* Add system time to cpustat. */ +	task_group_account_field(p, index, (__force u64) cputime); + +	/* Account for system time used */ +	acct_update_integrals(p); +} + +/* + * Account system cpu time to a process. 
+ * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_system_time(struct task_struct *p, int hardirq_offset, +			 cputime_t cputime, cputime_t cputime_scaled) +{ +	int index; + +	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { +		account_guest_time(p, cputime, cputime_scaled); +		return; +	} + +	if (hardirq_count() - hardirq_offset) +		index = CPUTIME_IRQ; +	else if (in_serving_softirq()) +		index = CPUTIME_SOFTIRQ; +	else +		index = CPUTIME_SYSTEM; + +	__account_system_time(p, cputime, cputime_scaled, index); +} + +/* + * Account for involuntary wait time (charged to this cpu's CPUTIME_STEAL). + * @cputime: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ +	u64 *cpustat = kcpustat_this_cpu->cpustat; + +	cpustat[CPUTIME_STEAL] += (__force u64) cputime; +} + +/* + * Account for idle time. 
+ * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ +	u64 *cpustat = kcpustat_this_cpu->cpustat; +	struct rq *rq = this_rq(); + +	if (atomic_read(&rq->nr_iowait) > 0) +		cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; +	else +		cpustat[CPUTIME_IDLE] += (__force u64) cputime; +} + +/* + * Account the whole ticks of paravirt steal time that accumulated since + * the previous call on this runqueue. + * + * Returns true if at least one tick was accounted as stolen, false + * otherwise (always false without CONFIG_PARAVIRT). + */ +static __always_inline bool steal_account_process_tick(void) +{ +#ifdef CONFIG_PARAVIRT +	if (static_branch(&paravirt_steal_enabled)) { +		u64 steal, st = 0; + +		steal = paravirt_steal_clock(smp_processor_id()); +		steal -= this_rq()->prev_steal_time; + +		st = steal_ticks(steal); +		this_rq()->prev_steal_time += st * TICK_NSEC; + +		account_steal_time(st); +		return st; +	} +#endif +	return false; +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update + * - user_time + * - idle_time + * - system time + *   - check for guest_time + *   - else account as system_time + * + * Check for hardirq is done both for system and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq + * softirq as those do not count in task exec_runtime any more. 
+ */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, +						struct rq *rq) +{ +	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); +	u64 *cpustat = kcpustat_this_cpu->cpustat; + +	if (steal_account_process_tick()) +		return; + +	if (irqtime_account_hi_update()) { +		cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; +	} else if (irqtime_account_si_update()) { +		cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; +	} else if (this_cpu_ksoftirqd() == p) { +		/* +		 * ksoftirqd time does not get accounted in cpu_softirq_time. +		 * So, we have to handle it separately here. +		 * Also, p->stime needs to be updated for ksoftirqd. +		 */ +		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, +					CPUTIME_SOFTIRQ); +	} else if (user_tick) { +		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); +	} else if (p == rq->idle) { +		account_idle_time(cputime_one_jiffy); +	} else if (p->flags & PF_VCPU) { /* Guest time for PF_VCPU tasks, system time otherwise */ +		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); +	} else { +		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, +					CPUTIME_SYSTEM); +	} +} + +static void irqtime_account_idle_ticks(int ticks) +{ +	int i; +	struct rq *rq = this_rq(); + +	for (i = 0; i < ticks; i++) +		irqtime_account_process_tick(current, 0, rq); +} +#else /* !CONFIG_IRQ_TIME_ACCOUNTING */ +static void irqtime_account_idle_ticks(int ticks) {} +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, +						struct rq *rq) {} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +/* + * Account a single tick of cpu time. 
+ * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ +	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); +	struct rq *rq = this_rq(); + +	if (sched_clock_irqtime) { +		irqtime_account_process_tick(p, user_tick, rq); +		return; +	} + +	if (steal_account_process_tick()) +		return; + +	if (user_tick) +		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); +	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) +		account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, +				    one_jiffy_scaled); +	else +		account_idle_time(cputime_one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ +	account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. 
+ * @ticks: number of idle ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + +	if (sched_clock_irqtime) { +		irqtime_account_idle_ticks(ticks); +		return; +	} + +	account_idle_time(jiffies_to_cputime(ticks)); +} + +#endif + +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ +	*ut = p->utime; +	*st = p->stime; +} + +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ +	struct task_cputime cputime; + +	thread_group_cputime(p, &cputime); + +	*ut = cputime.utime; +	*st = cputime.stime; +} +#else + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs) +#endif + +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ +	cputime_t rtime, utime = p->utime, total = utime + p->stime; + +	/* +	 * Use CFS's precise accounting: rtime is the total runtime, which +	 * is split into user and system time in the ratio of the sampled +	 * utime and stime. +	 * NOTE(review): the divisor below is truncated to 32 bits - confirm +	 * that total cannot exceed that range. +	 */ +	rtime = nsecs_to_cputime(p->se.sum_exec_runtime); + +	if (total) { +		u64 temp = (__force u64) rtime; + +		temp *= (__force u64) utime; +		do_div(temp, (__force u32) total); +		utime = (__force cputime_t) temp; +	} else +		utime = rtime; + +	/* +	 * Compare with previous values, to keep monotonicity: +	 */ +	p->prev_utime = max(p->prev_utime, utime); +	p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + +	*ut = p->prev_utime; +	*st = p->prev_stime; +} + +/* + * Must be called with siglock held. 
+ */ +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ +	struct signal_struct *sig = p->signal; +	struct task_cputime cputime; +	cputime_t rtime, utime, total; + +	thread_group_cputime(p, &cputime); + +	total = cputime.utime + cputime.stime; +	rtime = nsecs_to_cputime(cputime.sum_exec_runtime); + +	if (total) { +		u64 temp = (__force u64) rtime; + +		temp *= (__force u64) cputime.utime; +		do_div(temp, (__force u32) total); +		utime = (__force cputime_t) temp; +	} else +		utime = rtime; + +	sig->prev_utime = max(sig->prev_utime, utime); +	sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); + +	*ut = sig->prev_utime; +	*st = sig->prev_stime; +} +#endif + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * The runqueue clock, the cpu load and the current task's class tick + * handler are all updated under rq->lock. + */ +void scheduler_tick(void) +{ +	int cpu = smp_processor_id(); +	struct rq *rq = cpu_rq(cpu); +	struct task_struct *curr = rq->curr; + +	sched_clock_tick(); + +	raw_spin_lock(&rq->lock); +	update_rq_clock(rq); +	update_cpu_load_active(rq); +	curr->sched_class->task_tick(rq, curr, 0); +	raw_spin_unlock(&rq->lock); + +	perf_event_task_tick(); + +#ifdef CONFIG_SMP +	rq->idle_balance = idle_cpu(cpu); +	trigger_load_balance(rq, cpu); +#endif +} + +notrace unsigned long get_parent_ip(unsigned long addr) +{ +	if (in_lock_functions(addr)) { +		addr = CALLER_ADDR2; +		if (in_lock_functions(addr)) +			addr = CALLER_ADDR3; +	} +	return addr; +} + +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ +				defined(CONFIG_PREEMPT_TRACER)) + +void __kprobes add_preempt_count(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT +	/* +	 * Underflow? +	 */ +	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) +		return; +#endif +	preempt_count() += val; +#ifdef CONFIG_DEBUG_PREEMPT +	/* +	 * Spinlock count overflowing soon? 
+	 */ +	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= +				PREEMPT_MASK - 10); +#endif +	if (preempt_count() == val) +		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); +} +EXPORT_SYMBOL(add_preempt_count); + +void __kprobes sub_preempt_count(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT +	/* +	 * Underflow? +	 */ +	if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) +		return; +	/* +	 * Is the spinlock portion underflowing? +	 */ +	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && +			!(preempt_count() & PREEMPT_MASK))) +		return; +#endif + +	if (preempt_count() == val) +		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); +	preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + +/* + * Print scheduling while atomic bug (suppressed while an oops is in + * progress): + */ +static noinline void __schedule_bug(struct task_struct *prev) +{ +	struct pt_regs *regs = get_irq_regs(); + +	if (oops_in_progress) +		return; + +	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", +		prev->comm, prev->pid, preempt_count()); + +	debug_show_held_locks(prev); +	print_modules(); +	if (irqs_disabled()) +		print_irqtrace_events(prev); + +	if (regs) +		show_regs(regs); +	else +		dump_stack(); +} + +/* + * Various schedule()-time debugging checks and statistics: + */ +static inline void schedule_debug(struct task_struct *prev) +{ +	/* +	 * Test if we are atomic. Since do_exit() needs to call into +	 * schedule() atomically, we ignore that path for now. +	 * Otherwise, whine if we are scheduling when we should not be. 
+	 */ +	if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) +		__schedule_bug(prev); +	rcu_sleep_check(); + +	profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + +	schedstat_inc(this_rq(), sched_count); +} + +static void put_prev_task(struct rq *rq, struct task_struct *prev) +{ +	if (prev->on_rq || rq->skip_clock_update < 0) +		update_rq_clock(rq); +	prev->sched_class->put_prev_task(rq, prev); +} + +/* + * Pick up the highest-prio task, asking the classes in for_each_class() + * order: + */ +static inline struct task_struct * +pick_next_task(struct rq *rq) +{ +	const struct sched_class *class; +	struct task_struct *p; + +	/* +	 * Optimization: we know that if all tasks are in +	 * the fair class we can call that function directly: +	 */ +	if (likely(rq->nr_running == rq->cfs.h_nr_running)) { +		p = fair_sched_class.pick_next_task(rq); +		if (likely(p)) +			return p; +	} + +	for_each_class(class) { +		p = class->pick_next_task(rq); +		if (p) +			return p; +	} + +	BUG(); /* the idle class will always have a runnable task */ +} + +/* + * __schedule() is the main scheduler function. + * + * A switch made with non-zero prev->state and without PREEMPT_ACTIVE is + * counted in prev->nvcsw, any other switch in prev->nivcsw. + */ +static void __sched __schedule(void) +{ +	struct task_struct *prev, *next; +	unsigned long *switch_count; +	struct rq *rq; +	int cpu; + +need_resched: +	preempt_disable(); +	cpu = smp_processor_id(); +	rq = cpu_rq(cpu); +	rcu_note_context_switch(cpu); +	prev = rq->curr; + +	schedule_debug(prev); + +	if (sched_feat(HRTICK)) +		hrtick_clear(rq); + +	raw_spin_lock_irq(&rq->lock); + +	switch_count = &prev->nivcsw; +	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { +		if (unlikely(signal_pending_state(prev->state, prev))) { +			prev->state = TASK_RUNNING; +		} else { +			deactivate_task(rq, prev, DEQUEUE_SLEEP); +			prev->on_rq = 0; + +			/* +			 * If a worker went to sleep, notify and ask workqueue +			 * whether it wants to wake up a task to maintain +			 * concurrency. 
+			 */ +			if (prev->flags & PF_WQ_WORKER) { +				struct task_struct *to_wakeup; + +				to_wakeup = wq_worker_sleeping(prev, cpu); +				if (to_wakeup) +					try_to_wake_up_local(to_wakeup); +			} +		} +		switch_count = &prev->nvcsw; +	} + +	pre_schedule(rq, prev); + +	if (unlikely(!rq->nr_running)) +		idle_balance(cpu, rq); + +	put_prev_task(rq, prev); +	next = pick_next_task(rq); +	clear_tsk_need_resched(prev); +	rq->skip_clock_update = 0; + +	if (likely(prev != next)) { +		rq->nr_switches++; +		rq->curr = next; +		++*switch_count; + +		context_switch(rq, prev, next); /* unlocks the rq */ +		/* +		 * The context switch has flipped the stack from under us +		 * and restored the local variables which were saved when +		 * this task called schedule() in the past. prev == current +		 * is still correct, but it can be moved to another cpu/rq. +		 */ +		cpu = smp_processor_id(); +		rq = cpu_rq(cpu); +	} else +		raw_spin_unlock_irq(&rq->lock); + +	post_schedule(rq); + +	preempt_enable_no_resched(); +	if (need_resched()) +		goto need_resched; +} + +static inline void sched_submit_work(struct task_struct *tsk) +{ +	if (!tsk->state) +		return; +	/* +	 * If we are going to sleep and we have plugged IO queued, +	 * make sure to submit it to avoid deadlocks. +	 */ +	if (blk_needs_flush_plug(tsk)) +		blk_schedule_flush_plug(tsk); +} + +asmlinkage void __sched schedule(void) +{ +	struct task_struct *tsk = current; + +	sched_submit_work(tsk); +	__schedule(); +} +EXPORT_SYMBOL(schedule); + +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ +	if (lock->owner != owner) +		return false; + +	/* +	 * Ensure we emit the owner->on_cpu dereference _after_ checking +	 * that lock->owner still matches owner. If that fails, owner might +	 * point to free()d memory; if it still matches, the rcu_read_lock() +	 * ensures the memory stays valid. +	 */ +	barrier(); + +	return owner->on_cpu; +} + +/* + * Look out! 
"owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ +	if (!sched_feat(OWNER_SPIN)) +		return 0; + +	rcu_read_lock(); +	while (owner_running(lock, owner)) { +		if (need_resched()) +			break; + +		arch_mutex_cpu_relax(); +	} +	rcu_read_unlock(); + +	/* +	 * We break out of the loop above on need_resched() and when the +	 * owner changed, which is a sign of heavy contention. Return +	 * success only when lock->owner is NULL. +	 */ +	return lock->owner == NULL; +} +#endif + +#ifdef CONFIG_PREEMPT +/* + * this is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage void __sched notrace preempt_schedule(void) +{ +	struct thread_info *ti = current_thread_info(); + +	/* +	 * If there is a non-zero preempt_count or interrupts are disabled, +	 * we do not want to preempt the current task. Just return. +	 */ +	if (likely(ti->preempt_count || irqs_disabled())) +		return; + +	do { +		add_preempt_count_notrace(PREEMPT_ACTIVE); +		__schedule(); +		sub_preempt_count_notrace(PREEMPT_ACTIVE); + +		/* +		 * Check again in case we missed a preemption opportunity +		 * between schedule and now. +		 */ +		barrier(); +	} while (need_resched()); +} +EXPORT_SYMBOL(preempt_schedule); + +/* + * this is the entry point to schedule() from kernel preemption + * off of irq context. + * Note, that this is called and return with irqs disabled. This will + * protect us against recursive calling from irq. 
+ */ +asmlinkage void __sched preempt_schedule_irq(void) +{ +	struct thread_info *ti = current_thread_info(); + +	/* Catch callers which need to be fixed: preempt_count must be 0 and irqs disabled */ +	BUG_ON(ti->preempt_count || !irqs_disabled()); + +	do { +		add_preempt_count(PREEMPT_ACTIVE); +		local_irq_enable(); +		__schedule(); +		local_irq_disable(); +		sub_preempt_count(PREEMPT_ACTIVE); + +		/* +		 * Check again in case we missed a preemption opportunity +		 * between schedule and now. +		 */ +		barrier(); +	} while (need_resched()); +} + +#endif /* CONFIG_PREEMPT */ + +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, +			  void *key) +{ +	return try_to_wake_up(curr->private, mode, wake_flags); +} +EXPORT_SYMBOL(default_wake_function); + +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + * + * The scan stops once nr_exclusive exclusive waiters have been woken + * successfully. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +			int nr_exclusive, int wake_flags, void *key) +{ +	wait_queue_t *curr, *next; + +	list_for_each_entry_safe(curr, next, &q->task_list, task_list) { +		unsigned flags = curr->flags; + +		if (curr->func(curr, mode, wake_flags, key) && +				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) +			break; +	} +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
+ */ +void __wake_up(wait_queue_head_t *q, unsigned int mode, +			int nr_exclusive, void *key) +{ +	unsigned long flags; + +	spin_lock_irqsave(&q->lock, flags); +	__wake_up_common(q, mode, nr_exclusive, 0, key); +	spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + * Wakes at most one exclusive waiter and passes a NULL key. + */ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +{ +	__wake_up_common(q, mode, 1, 0, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked); + +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ +	__wake_up_common(q, mode, 1, 0, key); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_key); + +/** + * __wake_up_sync_key - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs in that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
+ */ +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, +			int nr_exclusive, void *key) +{ +	unsigned long flags; +	int wake_flags = WF_SYNC; + +	if (unlikely(!q)) +		return; + +	if (unlikely(!nr_exclusive)) +		wake_flags = 0; + +	spin_lock_irqsave(&q->lock, flags); +	__wake_up_common(q, mode, nr_exclusive, wake_flags, key); +	spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - __wake_up_sync_key() with a NULL key + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ +	__wake_up_sync_key(q, mode, nr_exclusive, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */ + +/** + * complete: - signals a single thread waiting on this completion + * @x:  holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete(struct completion *x) +{ +	unsigned long flags; + +	spin_lock_irqsave(&x->wait.lock, flags); +	x->done++; +	__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); +	spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +/** + * complete_all: - signals all threads waiting on this completion + * @x:  holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
+ */ +void complete_all(struct completion *x) +{ +	unsigned long flags; + +	spin_lock_irqsave(&x->wait.lock, flags); +	x->done += UINT_MAX/2; +	__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); +	spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +static inline long __sched +do_wait_for_common(struct completion *x, long timeout, int state) +{ +	if (!x->done) { +		DECLARE_WAITQUEUE(wait, current); + +		__add_wait_queue_tail_exclusive(&x->wait, &wait); +		do { +			if (signal_pending_state(state, current)) { +				timeout = -ERESTARTSYS; +				break; +			} +			__set_current_state(state); +			spin_unlock_irq(&x->wait.lock); +			timeout = schedule_timeout(timeout); +			spin_lock_irq(&x->wait.lock); +		} while (!x->done && timeout); +		__remove_wait_queue(&x->wait, &wait); +		if (!x->done) +			return timeout; +	} +	x->done--; +	return timeout ?: 1; +} + +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ +	might_sleep(); + +	spin_lock_irq(&x->wait.lock); +	timeout = do_wait_for_common(x, timeout, state); +	spin_unlock_irq(&x->wait.lock); +	return timeout; +} + +/** + * wait_for_completion: - waits for completion of a task + * @x:  holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (e.g. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ +void __sched wait_for_completion(struct completion *x) +{ +	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion); + +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. 
The timeout is in jiffies. It is not + * interruptible. + * + * The return value is 0 if timed out, and positive (at least 1, or number of + * jiffies left till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ +	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_timeout); + +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x:  holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible and waits with MAX_SCHEDULE_TIMEOUT. + * + * The return value is -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_interruptible(struct completion *x) +{ +	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); +	if (t == -ERESTARTSYS) +		return t; +	return 0; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_interruptible_timeout(struct completion *x, +					  unsigned long timeout) +{ +	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x:  holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. 
+ * + * The return value is -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_killable(struct completion *x) +{ +	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); +	if (t == -ERESTARTSYS) +		return t; +	return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + +/** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + * The wait is performed in state TASK_KILLABLE. + * + * The return value is -ERESTARTSYS if interrupted, 0 if timed out, + * positive (at least 1, or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_killable_timeout(struct completion *x, +				     unsigned long timeout) +{ +	return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** + *	try_wait_for_completion - try to decrement a completion without blocking + *	@x:	completion structure + * + *	Returns: 0 if a decrement cannot be done without blocking + *		 1 if a decrement succeeded. + * + *	If a completion is being used as a counting completion, + *	attempt to decrement the counter without blocking. This + *	enables us to avoid waiting if the resource the completion + *	is protecting is not available. 
+ */ +bool try_wait_for_completion(struct completion *x) +{ +	unsigned long flags; +	int ret = 1; + +	spin_lock_irqsave(&x->wait.lock, flags); +	if (!x->done) +		ret = 0; +	else +		x->done--; +	spin_unlock_irqrestore(&x->wait.lock, flags); +	return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + *	completion_done - Test to see if a completion has any waiters + *	@x:	completion structure + * + *	Returns: 0 if there are waiters (wait_for_completion() in progress) + *		 1 if there are no waiters. + * + *	The test is made on x->done under x->wait.lock; the wait queue + *	itself is not examined. + */ +bool completion_done(struct completion *x) +{ +	unsigned long flags; +	int ret = 1; + +	spin_lock_irqsave(&x->wait.lock, flags); +	if (!x->done) +		ret = 0; +	spin_unlock_irqrestore(&x->wait.lock, flags); +	return ret; +} +EXPORT_SYMBOL(completion_done); + +static long __sched +sleep_on_common(wait_queue_head_t *q, int state, long timeout) +{ +	unsigned long flags; +	wait_queue_t wait; + +	init_waitqueue_entry(&wait, current); + +	__set_current_state(state); + +	spin_lock_irqsave(&q->lock, flags); +	__add_wait_queue(q, &wait); +	spin_unlock(&q->lock); +	timeout = schedule_timeout(timeout); +	spin_lock_irq(&q->lock); +	__remove_wait_queue(q, &wait); +	spin_unlock_irqrestore(&q->lock, flags); + +	return timeout; +} + +void __sched interruptible_sleep_on(wait_queue_head_t *q) +{ +	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} +EXPORT_SYMBOL(interruptible_sleep_on); + +long __sched +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ +	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); +} +EXPORT_SYMBOL(interruptible_sleep_on_timeout); + +void __sched sleep_on(wait_queue_head_t *q) +{ +	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} +EXPORT_SYMBOL(sleep_on); + +long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ +	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); +} +EXPORT_SYMBOL(sleep_on_timeout); + +#ifdef CONFIG_RT_MUTEXES + +/* + * rt_mutex_setprio - set the current 
priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ *
+ * The task is dequeued (and, if it is the one currently running on its
+ * runqueue, put_prev_task()'d) before ->sched_class and ->prio change and
+ * is put back afterwards; everything happens between __task_rq_lock()
+ * and __task_rq_unlock().
+ */
+void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+	int oldprio, on_rq, running;
+	struct rq *rq;
+	const struct sched_class *prev_class;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = __task_rq_lock(p);
+
+	trace_sched_pi_setprio(p, prio);
+	oldprio = p->prio;
+	prev_class = p->sched_class;
+	on_rq = p->on_rq;
+	running = task_current(rq, p);
+	if (on_rq)
+		dequeue_task(rq, p, 0);
+	if (running)
+		p->sched_class->put_prev_task(rq, p);
+
+	if (rt_prio(prio))
+		p->sched_class = &rt_sched_class;
+	else
+		p->sched_class = &fair_sched_class;
+
+	p->prio = prio;
+
+	if (running)
+		p->sched_class->set_curr_task(rq);
+	if (on_rq)
+		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); /* ENQUEUE_HEAD when the prio value grew */
+
+	check_class_changed(rq, p, prev_class, oldprio);
+	__task_rq_unlock(rq);
+}
+
+#endif /* CONFIG_RT_MUTEXES */
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+	int old_prio, delta, on_rq;
+	unsigned long flags;
+	struct rq *rq;
+
+	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+		return;	/* unchanged, or outside the accepted [-20, 19] range */
+	/*
+	 * We have to be careful, if called from sys_setpriority(),
+	 * the task might be in the middle of scheduling on another CPU.
+	 */
+	rq = task_rq_lock(p, &flags);
+	/*
+	 * The RT priorities are set via sched_setscheduler(), but we still
+	 * allow the 'normal' nice value to be set - but as expected
+	 * it won't have any effect on scheduling while the task has an
+	 * RT policy: only ->static_prio is recorded here, ->prio and the
+	 * load weight are left alone.
+	 */
+	if (task_has_rt_policy(p)) {
+		p->static_prio = NICE_TO_PRIO(nice);
+		goto out_unlock;
+	}
+	on_rq = p->on_rq;
+	if (on_rq)
+		dequeue_task(rq, p, 0);
+
+	p->static_prio = NICE_TO_PRIO(nice);
+	set_load_weight(p);
+	old_prio = p->prio;
+	p->prio = effective_prio(p);
+	delta = p->prio - old_prio;
+
+	if (on_rq) {
+		enqueue_task(rq, p, 0);
+		/*
+		 * If the task increased its priority (delta < 0) or is
+		 * running and lowered its priority (delta > 0), then
+		 * reschedule its CPU:
+		 */
+		if (delta < 0 || (delta > 0 && task_running(rq, p)))
+			resched_task(rq->curr);
+	}
+out_unlock:
+	task_rq_unlock(rq, p, &flags);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ *
+ * Returns non-zero if (20 - @nice) does not exceed @p's RLIMIT_NICE, or
+ * if capable(CAP_SYS_NICE) holds; zero otherwise.
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+	/* convert nice value [19,-20] to rlimit style value [1,40] */
+	int nice_rlim = 20 - nice;
+
+	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
+		capable(CAP_SYS_NICE));
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ *
+ * @increment is clamped to [-40, 40] and the resulting nice value to
+ * [-20, 19]. Returns 0 on success, -EPERM if a negative increment is
+ * refused by can_nice(), or the error from security_task_setnice().
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+	long nice, retval;
+
+	/*
+	 * Setpriority might change our priority at the same moment.
+	 * We don't have to worry. Conceptually one call occurs first
+	 * and we have a single winner.
+	 */
+	if (increment < -40)
+		increment = -40;
+	if (increment > 40)
+		increment = 40;
+
+	nice = TASK_NICE(current) + increment;
+	if (nice < -20)
+		nice = -20;
+	if (nice > 19)
+		nice = 19;
+
+	if (increment < 0 && !can_nice(current, nice))
+		return -EPERM;
+
+	retval = security_task_setnice(current, nice);
+	if (retval)
+		return retval;
+
+	set_user_nice(current, nice);
+	return 0;
+}
+
+#endif /* __ARCH_WANT_SYS_NICE */
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * This is the priority value as seen by users in /proc, computed as
+ * p->prio - MAX_RT_PRIO.
+ *
+ * NOTE(review): the historical wording here ("RT tasks are offset by
+ * -200. Normal tasks are centered around 0, value goes from -16 to
+ * +15.") does not obviously follow from that expression - confirm
+ * against the MAX_RT_PRIO/MAX_PRIO definitions before relying on it.
+ */
+int task_prio(const struct task_struct *p)
+{
+	return p->prio - MAX_RT_PRIO;
+}
+
+/**
+ * task_nice - return the nice value of a given task.
+ * @p: the task in question.
+ */
+int task_nice(const struct task_struct *p)
+{
+	return TASK_NICE(p);
+}
+EXPORT_SYMBOL(task_nice);
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ *
+ * Returns 1 only if the runqueue's current task is its idle task,
+ * ->nr_running is zero and (with CONFIG_SMP) the wake_list is empty;
+ * 0 otherwise. No lock is taken here.
+ */
+int idle_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (rq->curr != rq->idle)
+		return 0;
+
+	if (rq->nr_running)
+		return 0;
+
+#ifdef CONFIG_SMP
+	if (!llist_empty(&rq->wake_list))
+		return 0;
+#endif
+
+	return 1;
+}
+
+/**
+ * idle_task - return the idle task for a given cpu.
+ * @cpu: the processor in question.
+ */
+struct task_struct *idle_task(int cpu)
+{
+	return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ *
+ * A @pid of 0 selects the calling task (current); any other value is
+ * looked up with find_task_by_vpid().
+ */
+static struct task_struct *find_process_by_pid(pid_t pid)
+{
+	return pid ? find_task_by_vpid(pid) : current;
+}
+
+/* Actually do priority change: must hold rq lock.
*/
+static void
+__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+{
+	p->policy = policy;
+	p->rt_priority = prio;
+	p->normal_prio = normal_prio(p);
+	/* we are holding p->pi_lock already */
+	p->prio = rt_mutex_getprio(p);
+	if (rt_prio(p->prio))
+		p->sched_class = &rt_sched_class;
+	else
+		p->sched_class = &fair_sched_class;
+	set_load_weight(p);
+}
+
+/*
+ * check the target process has a UID that matches the current process's
+ *
+ * Returns true only if both credentials live in the same user namespace
+ * and the caller's euid equals the target's euid or uid. The target's
+ * credentials are read under rcu_read_lock().
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+	const struct cred *cred = current_cred(), *pcred;
+	bool match;
+
+	rcu_read_lock();
+	pcred = __task_cred(p);
+	if (cred->user->user_ns == pcred->user->user_ns)
+		match = (cred->euid == pcred->euid ||
+			 cred->euid == pcred->uid);
+	else
+		match = false;
+	rcu_read_unlock();
+	return match;
+}
+
+static int __sched_setscheduler(struct task_struct *p, int policy,
+				const struct sched_param *param, bool user)
+{
+	int retval, oldprio, oldpolicy = -1, on_rq, running;
+	unsigned long flags;
+	const struct sched_class *prev_class;
+	struct rq *rq;
+	int reset_on_fork;
+
+	/* may grab non-irq protected spin_locks */
+	BUG_ON(in_interrupt());
+recheck:
+	/*
+	 * double check policy once rq lock held
+	 *
+	 * A negative @policy means "keep the task's current policy"; it is
+	 * sampled here without the lock and remembered in oldpolicy so it
+	 * can be re-validated below.
+	 */
+	if (policy < 0) {
+		reset_on_fork = p->sched_reset_on_fork;
+		policy = oldpolicy = p->policy;
+	} else {
+		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+		policy &= ~SCHED_RESET_ON_FORK;
+
+		if (policy != SCHED_FIFO && policy != SCHED_RR &&
+				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+				policy != SCHED_IDLE)
+			return -EINVAL;
+	}
+
+	/*
+	 * Valid priorities for SCHED_FIFO and SCHED_RR are
+	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
+	 * SCHED_BATCH and SCHED_IDLE is 0.
+	 *
+	 * Tasks without an mm are checked against MAX_RT_PRIO-1 instead.
+	 */
+	if (param->sched_priority < 0 ||
+	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
+	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+		return -EINVAL;
+	if (rt_policy(policy) != (param->sched_priority != 0))
+		return -EINVAL;
+
+	/*
+	 * Allow unprivileged RT tasks to decrease priority:
+	 */
+	if (user && !capable(CAP_SYS_NICE)) {
+		if (rt_policy(policy)) {
+			unsigned long rlim_rtprio =
+					task_rlimit(p, RLIMIT_RTPRIO);
+
+			/* can't set/change the rt policy */
+			if (policy != p->policy && !rlim_rtprio)
+				return -EPERM;
+
+			/* can't increase priority */
+			if (param->sched_priority > p->rt_priority &&
+			    param->sched_priority > rlim_rtprio)
+				return -EPERM;
+		}
+
+		/*
+		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
+		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+		 */
+		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+			if (!can_nice(p, TASK_NICE(p)))
+				return -EPERM;
+		}
+
+		/* can't change other user's priorities */
+		if (!check_same_owner(p))
+			return -EPERM;
+
+		/* Normal users shall not reset the sched_reset_on_fork flag */
+		if (p->sched_reset_on_fork && !reset_on_fork)
+			return -EPERM;
+	}
+
+	if (user) {
+		retval = security_task_setscheduler(p);
+		if (retval)
+			return retval;
+	}
+
+	/*
+	 * make sure no PI-waiters arrive (or leave) while we are
+	 * changing the priority of the task:
+	 *
+	 * To be able to change p->policy safely, the appropriate
+	 * runqueue lock must be held.
+	 */
+	rq = task_rq_lock(p, &flags);
+
+	/*
+	 * Changing the policy of the stop thread is a very bad idea
+	 */
+	if (p == rq->stop) {
+		task_rq_unlock(rq, p, &flags);
+		return -EINVAL;
+	}
+
+	/*
+	 * If not changing anything there's no need to proceed further:
+	 */
+	if (unlikely(policy == p->policy && (!rt_policy(policy) ||
+			param->sched_priority == p->rt_priority))) {
+
+		__task_rq_unlock(rq);
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		return 0;
+	}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (user) {
+		/*
+		 * Do not allow realtime tasks into groups that have no runtime
+		 * assigned.
+		 */
+		if (rt_bandwidth_enabled() && rt_policy(policy) &&
+				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+				!task_group_is_autogroup(task_group(p))) {
+			task_rq_unlock(rq, p, &flags);
+			return -EPERM;
+		}
+	}
+#endif
+
+	/*
+	 * recheck policy now with rq lock held: if the policy sampled
+	 * without the lock is no longer the task's policy, drop the lock
+	 * and redo all the checks from the top.
+	 */
+	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+		policy = oldpolicy = -1;
+		task_rq_unlock(rq, p, &flags);
+		goto recheck;
+	}
+	on_rq = p->on_rq;
+	running = task_current(rq, p);
+	if (on_rq)
+		deactivate_task(rq, p, 0);
+	if (running)
+		p->sched_class->put_prev_task(rq, p);
+
+	p->sched_reset_on_fork = reset_on_fork;
+
+	oldprio = p->prio;
+	prev_class = p->sched_class;
+	__setscheduler(rq, p, policy, param->sched_priority);
+
+	if (running)
+		p->sched_class->set_curr_task(rq);
+	if (on_rq)
+		activate_task(rq, p, 0);
+
+	check_class_changed(rq, p, prev_class, oldprio);
+	task_rq_unlock(rq, p, &flags);
+
+	rt_mutex_adjust_pi(p);	/* called only after the rq lock is dropped */
+
+	return 0;
+}
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * NOTE that the task may be already dead.
+ */ +int sched_setscheduler(struct task_struct *p, int policy, +		       const struct sched_param *param) +{ +	return __sched_setscheduler(p, policy, param, true); +} +EXPORT_SYMBOL_GPL(sched_setscheduler); + +/** + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Just like sched_setscheduler, only don't bother checking if the + * current context has permission.  For example, this is needed in + * stop_machine(): we create temporary high priority worker threads, + * but our caller might not have that capability. + */ +int sched_setscheduler_nocheck(struct task_struct *p, int policy, +			       const struct sched_param *param) +{ +	return __sched_setscheduler(p, policy, param, false); +} + +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +{ +	struct sched_param lparam; +	struct task_struct *p; +	int retval; + +	if (!param || pid < 0) +		return -EINVAL; +	if (copy_from_user(&lparam, param, sizeof(struct sched_param))) +		return -EFAULT; + +	rcu_read_lock(); +	retval = -ESRCH; +	p = find_process_by_pid(pid); +	if (p != NULL) +		retval = sched_setscheduler(p, policy, &lparam); +	rcu_read_unlock(); + +	return retval; +} + +/** + * sys_sched_setscheduler - set/change the scheduler policy and RT priority + * @pid: the pid in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + */ +SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, +		struct sched_param __user *, param) +{ +	/* negative values for policy are not valid */ +	if (policy < 0) +		return -EINVAL; + +	return do_sched_setscheduler(pid, policy, param); +} + +/** + * sys_sched_setparam - set/change the RT priority of a thread + * @pid: the pid in question. + * @param: structure containing the new RT priority. 
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+	return do_sched_setscheduler(pid, -1, param); /* -1: negative policy, passed through unchanged */
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * On success the return value is the task's policy, OR-ed with
+ * SCHED_RESET_ON_FORK if the task has ->sched_reset_on_fork set.
+ * Returns -EINVAL for a negative @pid, -ESRCH if no task is found, or
+ * the error from security_task_getscheduler().
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+	struct task_struct *p;
+	int retval;
+
+	if (pid < 0)
+		return -EINVAL;
+
+	retval = -ESRCH;
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	if (p) {
+		retval = security_task_getscheduler(p);
+		if (!retval)
+			retval = p->policy
+				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+	}
+	rcu_read_unlock();
+	return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ *
+ * Returns 0 on success, -EINVAL for a NULL @param or negative @pid,
+ * -ESRCH if no task is found, the error from
+ * security_task_getscheduler(), or -EFAULT if the copy to user space
+ * fails.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+	struct sched_param lp;
+	struct task_struct *p;
+	int retval;
+
+	if (!param || pid < 0)
+		return -EINVAL;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	lp.sched_priority = p->rt_priority;
+	rcu_read_unlock();
+
+	/*
+	 * This one might sleep, we cannot do it with a spinlock held ...
+	 * (the RCU read lock has been dropped just above)
+	 */
+	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+	cpumask_var_t cpus_allowed, new_mask;
+	struct task_struct *p;
+	int retval;
+
+	get_online_cpus();
+	rcu_read_lock();
+
+	p = find_process_by_pid(pid);
+	if (!p) {
+		rcu_read_unlock();
+		put_online_cpus();
+		return -ESRCH;
+	}
+
+	/* Prevent p going away */
+	get_task_struct(p);
+	rcu_read_unlock();
+
+	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_put_task;
+	}
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_free_cpus_allowed;
+	}
+	retval = -EPERM;
+	if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
+		goto out_unlock;
+
+	retval = security_task_setscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	cpuset_cpus_allowed(p, cpus_allowed);
+	cpumask_and(new_mask, in_mask, cpus_allowed);
+again:
+	retval = set_cpus_allowed_ptr(p, new_mask);
+
+	if (!retval) {
+		cpuset_cpus_allowed(p, cpus_allowed);
+		if (!cpumask_subset(new_mask, cpus_allowed)) {
+			/*
+			 * We must have raced with a concurrent cpuset
+			 * update. Just reset the cpus_allowed to the
+			 * cpuset's cpus_allowed
+			 */
+			cpumask_copy(new_mask, cpus_allowed);
+			goto again;
+		}
+	}
+out_unlock:	/* labels unwind in reverse order of acquisition */
+	free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+	free_cpumask_var(cpus_allowed);
+out_put_task:
+	put_task_struct(p);
+	put_online_cpus();
+	return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+			     struct cpumask *new_mask)
+{
+	if (len < cpumask_size())
+		cpumask_clear(new_mask);	/* short user buffer: bits not copied below stay clear */
+	else if (len > cpumask_size())
+		len = cpumask_size();	/* never copy more than cpumask_size() bytes */
+
+	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ *
+ * Returns -ENOMEM if the temporary mask cannot be allocated, -EFAULT if
+ * the user mask cannot be copied in, otherwise the result of
+ * sched_setaffinity().
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	cpumask_var_t new_mask;
+	int retval;
+
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+	if (retval == 0)
+		retval = sched_setaffinity(pid, new_mask);
+	free_cpumask_var(new_mask);
+	return retval;
+}
+
+long sched_getaffinity(pid_t pid, struct cpumask *mask)
+{
+	struct task_struct *p;
+	unsigned long flags;
+	int retval;
+
+	get_online_cpus();
+	rcu_read_lock();
+
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); /* report only CPUs that are also online */
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+out_unlock:
+	rcu_read_unlock();
+	put_online_cpus();
+
+	return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * @len must cover at least nr_cpu_ids bits and be a multiple of
+ * sizeof(unsigned long), otherwise -EINVAL is returned. On success the
+ * return value is the number of bytes copied to user space, i.e.
+ * min(@len, cpumask_size()).
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	int ret;
+	cpumask_var_t mask;
+
+	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+		return -EINVAL;
+	if (len & (sizeof(unsigned long)-1))
+		return -EINVAL;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = sched_getaffinity(pid, mask);
+	if (ret == 0) {
+		size_t retlen = min_t(size_t, len, cpumask_size());
+
+		if (copy_to_user(user_mask_ptr, mask, retlen))
+			ret = -EFAULT;
+		else
+			ret = retlen;
+	}
+	free_cpumask_var(mask);
+
+	return ret;
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. If there are no
+ * other threads running on this CPU then this function will return.
+ *
+ * Always returns 0.
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+	struct rq *rq = this_rq_lock();
+
+	schedstat_inc(rq, yld_count);
+	current->sched_class->yield_task(rq);
+
+	/*
+	 * Since we are going to call schedule() anyway, there's
+	 * no need to preempt or enable interrupts:
+	 */
+	__release(rq->lock);
+	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+	do_raw_spin_unlock(&rq->lock);
+	preempt_enable_no_resched();
+
+	schedule();
+
+	return 0;
+}
+
+static inline int should_resched(void)
+{
+	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+}
+
+static void __cond_resched(void)
+{
+	add_preempt_count(PREEMPT_ACTIVE);	/* PREEMPT_ACTIVE is held across __schedule() */
+	__schedule();
+	sub_preempt_count(PREEMPT_ACTIVE);
+}
+
+int __sched _cond_resched(void)
+{
+	if (should_resched()) {
+		__cond_resched();
+		return 1;	/* we did reschedule */
+	}
+	return 0;
+}
+EXPORT_SYMBOL(_cond_resched);
+
+/*
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */ +int __cond_resched_lock(spinlock_t *lock) +{ +	int resched = should_resched(); +	int ret = 0; + +	lockdep_assert_held(lock); + +	if (spin_needbreak(lock) || resched) { +		spin_unlock(lock); +		if (resched) +			__cond_resched(); +		else +			cpu_relax(); +		ret = 1; +		spin_lock(lock); +	} +	return ret; +} +EXPORT_SYMBOL(__cond_resched_lock); + +int __sched __cond_resched_softirq(void) +{ +	BUG_ON(!in_softirq()); + +	if (should_resched()) { +		local_bh_enable(); +		__cond_resched(); +		local_bh_disable(); +		return 1; +	} +	return 0; +} +EXPORT_SYMBOL(__cond_resched_softirq); + +/** + * yield - yield the current processor to other threads. + * + * This is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). + */ +void __sched yield(void) +{ +	set_current_state(TASK_RUNNING); +	sys_sched_yield(); +} +EXPORT_SYMBOL(yield); + +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Returns true if we indeed boosted the target task. 
+ */ +bool __sched yield_to(struct task_struct *p, bool preempt) +{ +	struct task_struct *curr = current; +	struct rq *rq, *p_rq; +	unsigned long flags; +	bool yielded = 0; + +	local_irq_save(flags); +	rq = this_rq(); + +again: +	p_rq = task_rq(p); +	double_rq_lock(rq, p_rq); +	while (task_rq(p) != p_rq) { +		double_rq_unlock(rq, p_rq); +		goto again; +	} + +	if (!curr->sched_class->yield_to_task) +		goto out; + +	if (curr->sched_class != p->sched_class) +		goto out; + +	if (task_running(p_rq, p) || p->state) +		goto out; + +	yielded = curr->sched_class->yield_to_task(rq, p, preempt); +	if (yielded) { +		schedstat_inc(rq, yld_count); +		/* +		 * Make p's CPU reschedule; pick_next_entity takes care of +		 * fairness. +		 */ +		if (preempt && rq != p_rq) +			resched_task(p_rq->curr); +	} else { +		/* +		 * We might have set it in task_yield_fair(), but are +		 * not going to schedule(), so don't want to skip +		 * the next update. +		 */ +		rq->skip_clock_update = 0; +	} + +out: +	double_rq_unlock(rq, p_rq); +	local_irq_restore(flags); + +	if (yielded) +		schedule(); + +	return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. + */ +void __sched io_schedule(void) +{ +	struct rq *rq = raw_rq(); + +	delayacct_blkio_start(); +	atomic_inc(&rq->nr_iowait); +	blk_flush_plug(current); +	current->in_iowait = 1; +	schedule(); +	current->in_iowait = 0; +	atomic_dec(&rq->nr_iowait); +	delayacct_blkio_end(); +} +EXPORT_SYMBOL(io_schedule); + +long __sched io_schedule_timeout(long timeout) +{ +	struct rq *rq = raw_rq(); +	long ret; + +	delayacct_blkio_start(); +	atomic_inc(&rq->nr_iowait); +	blk_flush_plug(current); +	current->in_iowait = 1; +	ret = schedule_timeout(timeout); +	current->in_iowait = 0; +	atomic_dec(&rq->nr_iowait); +	delayacct_blkio_end(); +	return ret; +} + +/** + * sys_sched_get_priority_max - return maximum RT priority. 
+ * @policy: scheduling class.
+ *
+ * this syscall returns the maximum rt_priority that can be used
+ * by a given scheduling class: MAX_USER_RT_PRIO-1 for SCHED_FIFO and
+ * SCHED_RR, 0 for SCHED_NORMAL, SCHED_BATCH and SCHED_IDLE, and
+ * -EINVAL for any other @policy.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = MAX_USER_RT_PRIO-1;
+		break;
+	case SCHED_NORMAL:
+	case SCHED_BATCH:
+	case SCHED_IDLE:
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the minimum rt_priority that can be used
+ * by a given scheduling class: 1 for SCHED_FIFO and SCHED_RR, 0 for
+ * SCHED_NORMAL, SCHED_BATCH and SCHED_IDLE, and -EINVAL for any other
+ * @policy.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = 1;
+		break;
+	case SCHED_NORMAL:
+	case SCHED_BATCH:
+	case SCHED_IDLE:
+		ret = 0;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * The timeslice is obtained from the task's scheduling class
+ * (->get_rr_interval()) with the task's runqueue locked, and converted
+ * from jiffies. Returns 0 on success, -EINVAL for a negative @pid,
+ * -ESRCH if no task is found, the error from
+ * security_task_getscheduler(), or -EFAULT if the copy to user space
+ * fails.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+		struct timespec __user *, interval)
+{
+	struct task_struct *p;
+	unsigned int time_slice;
+	unsigned long flags;
+	struct rq *rq;
+	int retval;
+	struct timespec t;
+
+	if (pid < 0)
+		return -EINVAL;
+
+	retval = -ESRCH;
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	rq = task_rq_lock(p, &flags);
+	time_slice = p->sched_class->get_rr_interval(rq, p);
+	task_rq_unlock(rq, p, &flags);
+
+	rcu_read_unlock();	/* dropped before the user-space copy below */
+	jiffies_to_timespec(time_slice, &t);
+	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
+void sched_show_task(struct task_struct *p)
+{
+	unsigned long free = 0;
+	unsigned state;
+
+	state = p->state ? __ffs(p->state) + 1 : 0; /* index into stat_nam: 0 if ->state is 0, else lowest set bit + 1 */
+	printk(KERN_INFO "%-15.15s %c", p->comm,
+		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
+#if BITS_PER_LONG == 32
+	if (state == TASK_RUNNING)
+		printk(KERN_CONT " running  ");
+	else
+		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
+#else
+	if (state == TASK_RUNNING)
+		printk(KERN_CONT "  running task    ");
+	else
+		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
+#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+	free = stack_not_used(p);	/* otherwise reported as 0 */
+#endif
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
+		(unsigned long)task_thread_info(p)->flags);
+
+	show_stack(p, NULL);
+}
+
+void show_state_filter(unsigned long state_filter)
+{
+	struct task_struct *g, *p;
+
+#if BITS_PER_LONG == 32
+	printk(KERN_INFO
+		"  task                PC stack   pid father\n");
+#else
+	printk(KERN_INFO
+		"  task                        PC stack   pid father\n");
+#endif
+	rcu_read_lock();
+	do_each_thread(g, p) {
+		/*
+		 * reset the NMI-timeout, listing all tasks on a slow
+		 * console might take a lot of time:
+		 */
+		touch_nmi_watchdog();
+		if (!state_filter || (p->state & state_filter))
+			sched_show_task(p);
+	} while_each_thread(g, p);
+
+	touch_all_softlockup_watchdogs();
+
+#ifdef CONFIG_SCHED_DEBUG
+	sysrq_sched_debug_show();
+#endif
+	rcu_read_unlock();
+	/*
+	 * Only show locks if all tasks are dumped:
+	 */
+	if (!state_filter)
+		debug_show_all_locks();
+}
+
+void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+{
+	idle->sched_class = &idle_sched_class;
+}
+
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this
function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ *
+ * Installs @idle as both ->curr and ->idle of @cpu's runqueue while
+ * holding that runqueue's lock.
+ */
+void __cpuinit init_idle(struct task_struct *idle, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	__sched_fork(idle);
+	idle->state = TASK_RUNNING;
+	idle->se.exec_start = sched_clock();
+
+	do_set_cpus_allowed(idle, cpumask_of(cpu));
+	/*
+	 * We're having a chicken and egg problem, even though we are
+	 * holding rq->lock, the cpu isn't yet set to this cpu so the
+	 * lockdep check in task_group() will fail.
+	 *
+	 * Similar case to sched_fork(). / Alternatively we could
+	 * use task_rq_lock() here and obtain the other rq->lock.
+	 *
+	 * Silence PROVE_RCU
+	 */
+	rcu_read_lock();
+	__set_task_cpu(idle, cpu);
+	rcu_read_unlock();
+
+	rq->curr = rq->idle = idle;
+#if defined(CONFIG_SMP)
+	idle->on_cpu = 1;
+#endif
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	/* Set the preempt count _outside_ the spinlocks! */
+	task_thread_info(idle)->preempt_count = 0;
+
+	/*
+	 * The idle tasks have their own, simple scheduling class:
+	 */
+	idle->sched_class = &idle_sched_class;
+	ftrace_graph_init_idle_task(idle, cpu);
+#if defined(CONFIG_SMP)
+	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); /* NOTE(review): unbounded sprintf - assumes the result fits ->comm; confirm */
+#endif
+}
+
+#ifdef CONFIG_SMP
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	if (p->sched_class && p->sched_class->set_cpus_allowed)
+		p->sched_class->set_cpus_allowed(p, new_mask); /* class hook runs before ->cpus_allowed is overwritten */
+
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ *    stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ *    off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ *    it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ *    is done.
+ */
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ *
+ * Returns 0 on success (including when the mask is unchanged), or
+ * -EINVAL if @new_mask contains no active CPU or if @p is a
+ * PF_THREAD_BOUND task other than current.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	unsigned long flags;
+	struct rq *rq;
+	unsigned int dest_cpu;
+	int ret = 0;
+
+	rq = task_rq_lock(p, &flags);
+
+	if (cpumask_equal(&p->cpus_allowed, new_mask))
+		goto out;
+
+	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	do_set_cpus_allowed(p, new_mask);
+
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), new_mask))
+		goto out;
+
+	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+	if (p->on_rq) {
+		struct migration_arg arg = { p, dest_cpu };
+		/* Need help from migration thread: drop lock and wait. */
+		task_rq_unlock(rq, p, &flags);
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+		tlb_migrate_finish(p->mm);
+		return 0;	/* lock already dropped above, so bypass "out" */
+	}
+out:
+	task_rq_unlock(rq, p, &flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ *
+ * Returns non-zero if task was successfully migrated (this includes
+ * the case where the task had already left @src_cpu). Returns 0 if
+ * @dest_cpu is not active or is no longer in the task's allowed mask.
+ */
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+{
+	struct rq *rq_dest, *rq_src;
+	int ret = 0;
+
+	if (unlikely(!cpu_active(dest_cpu)))
+		return ret;
+
+	rq_src = cpu_rq(src_cpu);
+	rq_dest = cpu_rq(dest_cpu);
+
+	raw_spin_lock(&p->pi_lock);	/* ->pi_lock is taken first, then both rq locks */
+	double_rq_lock(rq_src, rq_dest);
+	/* Already moved. */
+	if (task_cpu(p) != src_cpu)
+		goto done;
+	/* Affinity changed (again). */
+	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+		goto fail;
+
+	/*
+	 * If we're not on a rq, the next wake-up will ensure we're
+	 * placed properly.
+	 */
+	if (p->on_rq) {
+		deactivate_task(rq_src, p, 0);
+		set_task_cpu(p, dest_cpu);
+		activate_task(rq_dest, p, 0);
+		check_preempt_curr(rq_dest, p, 0);
+	}
+done:
+	ret = 1;
+fail:
+	double_rq_unlock(rq_src, rq_dest);
+	raw_spin_unlock(&p->pi_lock);
+	return ret;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ *
+ * @data points to a struct migration_arg. Always returns 0; the result
+ * of __migrate_task() is not propagated.
+ */
+static int migration_cpu_stop(void *data)
+{
+	struct migration_arg *arg = data;
+
+	/*
+	 * The original target cpu might have gone down and we might
+	 * be on another cpu but it doesn't matter.
+	 */
+	local_irq_disable();
+	__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
+	local_irq_enable();
+	return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+	struct mm_struct *mm = current->active_mm;
+
+	BUG_ON(cpu_online(smp_processor_id()));
+
+	if (mm != &init_mm)
+		switch_mm(mm, &init_mm, current);
+	mmdrop(mm);
+}
+
+/*
+ * While a dead CPU has no uninterruptible tasks queued at this point,
+ * it might still have a nonzero ->nr_uninterruptible counter, because
+ * for performance reasons the counter is not strictly tracking tasks to
+ * their home CPUs. So we just add the counter to another CPU's counter,
+ * to keep the global sum constant after CPU-down:
+ */
+static void migrate_nr_uninterruptible(struct rq *rq_src)
+{
+	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
+
+	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
+	rq_src->nr_uninterruptible = 0;
+}
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+	rq->calc_load_active = 0;
+}
+
+/*
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we're in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
+ *
+ * rq->lock is dropped around each __migrate_task() call and re-taken
+ * afterwards, so it is held again on return.
+ */
+static void migrate_tasks(unsigned int dead_cpu)
+{
+	struct rq *rq = cpu_rq(dead_cpu);
+	struct task_struct *next, *stop = rq->stop;
+	int dest_cpu;
+
+	/*
+	 * Fudge the rq selection such that the below task selection loop
+	 * doesn't get stuck on the currently eligible stop task.
+	 *
+	 * We're currently inside stop_machine() and the rq is either stuck
+	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
+	 * either way we should never end up calling schedule() until we're
+	 * done here.
+	 */
+	rq->stop = NULL;
+
+	/* Ensure any throttled groups are reachable by pick_next_task */
+	unthrottle_offline_cfs_rqs(rq);
+
+	for ( ; ; ) {
+		/*
+		 * There's this thread running, bail when that's the only
+		 * remaining thread.
+		 */
+		if (rq->nr_running == 1)
+			break;
+
+		next = pick_next_task(rq);
+		BUG_ON(!next);
+		next->sched_class->put_prev_task(rq, next);
+
+		/* Find suitable destination for @next, with force if needed. */
+		dest_cpu = select_fallback_rq(dead_cpu, next);
+		raw_spin_unlock(&rq->lock);
+
+		__migrate_task(next, dead_cpu, dest_cpu);
+
+		raw_spin_lock(&rq->lock);
+	}
+
+	rq->stop = stop;	/* restore the stop task hidden above */
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+static struct ctl_table sd_ctl_dir[] = {
+	{
+		.procname	= "sched_domain",
+		.mode		= 0555,
+	},
+	{}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+	{
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= sd_ctl_dir,
+	},
+	{}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+	struct ctl_table *entry =
+		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); /* zeroed; NULL on allocation failure */
+
+	return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+	struct ctl_table *entry;
+
+	/*
+	 * In the intermediate directories, both the child directory and
+	 * procname are dynamically allocated and could fail but the mode
+	 * will always be set. In the lowest directory the names are
+	 * static strings and all have proc handlers.
+	 *
+	 * Hence: walk until an entry with a zero mode, recurse into
+	 * children, and only kfree() the procname of entries that have
+	 * no proc_handler.
+	 */
+	for (entry = *tablep; entry->mode; entry++) {
+		if (entry->child)
+			sd_free_ctl_entry(&entry->child);
+		if (entry->proc_handler == NULL)
+			kfree(entry->procname);
+	}
+
+	kfree(*tablep);
+	*tablep = NULL;	/* the caller's pointer is cleared after freeing */
+}
+
+static void
+set_table_entry(struct ctl_table *entry,
+		const char *procname, void *data, int maxlen,
+		umode_t mode, proc_handler *proc_handler)
+{
+	entry->procname = procname;
+	entry->data = data;
+	entry->maxlen = maxlen;
+	entry->mode = mode;
+	entry->proc_handler = proc_handler;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+	struct ctl_table *table = sd_alloc_ctl_entry(13);
+
+	if (table == NULL)
+		return NULL;
+
+	set_table_entry(&table[0], "min_interval", &sd->min_interval,
+		sizeof(long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[1], "max_interval", &sd->max_interval,
+		sizeof(long), 0644, proc_doulongvec_minmax);
+	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[9], "cache_nice_tries",
+		&sd->cache_nice_tries,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[10], "flags", &sd->flags,
+		sizeof(int), 0644, proc_dointvec_minmax);
+	set_table_entry(&table[11], "name", sd->name,
+		CORENAME_MAX_SIZE, 0444, proc_dostring);
+	/* &table[12] is terminator */
+
+	
return table; +} + +static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ +	struct ctl_table *entry, *table; +	struct sched_domain *sd; +	int domain_num = 0, i; +	char buf[32]; + +	for_each_domain(cpu, sd) +		domain_num++; +	entry = table = sd_alloc_ctl_entry(domain_num + 1); +	if (table == NULL) +		return NULL; + +	i = 0; +	for_each_domain(cpu, sd) { +		snprintf(buf, 32, "domain%d", i); +		entry->procname = kstrdup(buf, GFP_KERNEL); +		entry->mode = 0555; +		entry->child = sd_alloc_ctl_domain_table(sd); +		entry++; +		i++; +	} +	return table; +} + +static struct ctl_table_header *sd_sysctl_header; +static void register_sched_domain_sysctl(void) +{ +	int i, cpu_num = num_possible_cpus(); +	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); +	char buf[32]; + +	WARN_ON(sd_ctl_dir[0].child); +	sd_ctl_dir[0].child = entry; + +	if (entry == NULL) +		return; + +	for_each_possible_cpu(i) { +		snprintf(buf, 32, "cpu%d", i); +		entry->procname = kstrdup(buf, GFP_KERNEL); +		entry->mode = 0555; +		entry->child = sd_alloc_ctl_cpu_table(i); +		entry++; +	} + +	WARN_ON(sd_sysctl_header); +	sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple times per register */ +static void unregister_sched_domain_sysctl(void) +{ +	if (sd_sysctl_header) +		unregister_sysctl_table(sd_sysctl_header); +	sd_sysctl_header = NULL; +	if (sd_ctl_dir[0].child) +		sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#else +static void register_sched_domain_sysctl(void) +{ +} +static void unregister_sched_domain_sysctl(void) +{ +} +#endif + +static void set_rq_online(struct rq *rq) +{ +	if (!rq->online) { +		const struct sched_class *class; + +		cpumask_set_cpu(rq->cpu, rq->rd->online); +		rq->online = 1; + +		for_each_class(class) { +			if (class->rq_online) +				class->rq_online(rq); +		} +	} +} + +static void set_rq_offline(struct rq *rq) +{ +	if (rq->online) { +		const struct sched_class *class; + +		for_each_class(class) { +			if (class->rq_offline) +				
class->rq_offline(rq); +		} + +		cpumask_clear_cpu(rq->cpu, rq->rd->online); +		rq->online = 0; +	} +} + +/* + * migration_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int __cpuinit +migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ +	int cpu = (long)hcpu; +	unsigned long flags; +	struct rq *rq = cpu_rq(cpu); + +	switch (action & ~CPU_TASKS_FROZEN) { + +	case CPU_UP_PREPARE: +		rq->calc_load_update = calc_load_update; +		break; + +	case CPU_ONLINE: +		/* Update our root-domain */ +		raw_spin_lock_irqsave(&rq->lock, flags); +		if (rq->rd) { +			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + +			set_rq_online(rq); +		} +		raw_spin_unlock_irqrestore(&rq->lock, flags); +		break; + +#ifdef CONFIG_HOTPLUG_CPU +	case CPU_DYING: +		sched_ttwu_pending(); +		/* Update our root-domain */ +		raw_spin_lock_irqsave(&rq->lock, flags); +		if (rq->rd) { +			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); +			set_rq_offline(rq); +		} +		migrate_tasks(cpu); +		BUG_ON(rq->nr_running != 1); /* the migration thread */ +		raw_spin_unlock_irqrestore(&rq->lock, flags); + +		migrate_nr_uninterruptible(rq); +		calc_global_load_remove(rq); +		break; +#endif +	} + +	update_max_interval(); + +	return NOTIFY_OK; +} + +/* + * Register at high priority so that task migration (migrate_all_tasks) + * happens before everything else.  This has to be lower priority than + * the notifier in the perf_event subsystem, though. 
+ */ +static struct notifier_block __cpuinitdata migration_notifier = { +	.notifier_call = migration_call, +	.priority = CPU_PRI_MIGRATION, +}; + +static int __cpuinit sched_cpu_active(struct notifier_block *nfb, +				      unsigned long action, void *hcpu) +{ +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_ONLINE: +	case CPU_DOWN_FAILED: +		set_cpu_active((long)hcpu, true); +		return NOTIFY_OK; +	default: +		return NOTIFY_DONE; +	} +} + +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, +					unsigned long action, void *hcpu) +{ +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_DOWN_PREPARE: +		set_cpu_active((long)hcpu, false); +		return NOTIFY_OK; +	default: +		return NOTIFY_DONE; +	} +} + +static int __init migration_init(void) +{ +	void *cpu = (void *)(long)smp_processor_id(); +	int err; + +	/* Initialize migration for the boot CPU */ +	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); +	BUG_ON(err == NOTIFY_BAD); +	migration_call(&migration_notifier, CPU_ONLINE, cpu); +	register_cpu_notifier(&migration_notifier); + +	/* Register cpu active notifiers */ +	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); +	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); + +	return 0; +} +early_initcall(migration_init); +#endif + +#ifdef CONFIG_SMP + +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ + +#ifdef CONFIG_SCHED_DEBUG + +static __read_mostly int sched_domain_debug_enabled; + +static int __init sched_domain_debug_setup(char *str) +{ +	sched_domain_debug_enabled = 1; + +	return 0; +} +early_param("sched_debug", sched_domain_debug_setup); + +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, +				  struct cpumask *groupmask) +{ +	struct sched_group *group = sd->groups; +	char str[256]; + +	cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); +	cpumask_clear(groupmask); + +	printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + +	if (!(sd->flags & SD_LOAD_BALANCE)) 
{ +		printk("does not load-balance\n"); +		if (sd->parent) +			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" +					" has parent"); +		return -1; +	} + +	printk(KERN_CONT "span %s level %s\n", str, sd->name); + +	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { +		printk(KERN_ERR "ERROR: domain->span does not contain " +				"CPU%d\n", cpu); +	} +	if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { +		printk(KERN_ERR "ERROR: domain->groups does not contain" +				" CPU%d\n", cpu); +	} + +	printk(KERN_DEBUG "%*s groups:", level + 1, ""); +	do { +		if (!group) { +			printk("\n"); +			printk(KERN_ERR "ERROR: group is NULL\n"); +			break; +		} + +		if (!group->sgp->power) { +			printk(KERN_CONT "\n"); +			printk(KERN_ERR "ERROR: domain->cpu_power not " +					"set\n"); +			break; +		} + +		if (!cpumask_weight(sched_group_cpus(group))) { +			printk(KERN_CONT "\n"); +			printk(KERN_ERR "ERROR: empty group\n"); +			break; +		} + +		if (cpumask_intersects(groupmask, sched_group_cpus(group))) { +			printk(KERN_CONT "\n"); +			printk(KERN_ERR "ERROR: repeated CPUs\n"); +			break; +		} + +		cpumask_or(groupmask, groupmask, sched_group_cpus(group)); + +		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); + +		printk(KERN_CONT " %s", str); +		if (group->sgp->power != SCHED_POWER_SCALE) { +			printk(KERN_CONT " (cpu_power = %d)", +				group->sgp->power); +		} + +		group = group->next; +	} while (group != sd->groups); +	printk(KERN_CONT "\n"); + +	if (!cpumask_equal(sched_domain_span(sd), groupmask)) +		printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + +	if (sd->parent && +	    !cpumask_subset(groupmask, sched_domain_span(sd->parent))) +		printk(KERN_ERR "ERROR: parent span is not a superset " +			"of domain->span\n"); +	return 0; +} + +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ +	int level = 0; + +	if (!sched_domain_debug_enabled) +		return; + +	if (!sd) { +		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); +		
return; +	} + +	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + +	for (;;) { +		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) +			break; +		level++; +		sd = sd->parent; +		if (!sd) +			break; +	} +} +#else /* !CONFIG_SCHED_DEBUG */ +# define sched_domain_debug(sd, cpu) do { } while (0) +#endif /* CONFIG_SCHED_DEBUG */ + +static int sd_degenerate(struct sched_domain *sd) +{ +	if (cpumask_weight(sched_domain_span(sd)) == 1) +		return 1; + +	/* Following flags need at least 2 groups */ +	if (sd->flags & (SD_LOAD_BALANCE | +			 SD_BALANCE_NEWIDLE | +			 SD_BALANCE_FORK | +			 SD_BALANCE_EXEC | +			 SD_SHARE_CPUPOWER | +			 SD_SHARE_PKG_RESOURCES)) { +		if (sd->groups != sd->groups->next) +			return 0; +	} + +	/* Following flags don't use groups */ +	if (sd->flags & (SD_WAKE_AFFINE)) +		return 0; + +	return 1; +} + +static int +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) +{ +	unsigned long cflags = sd->flags, pflags = parent->flags; + +	if (sd_degenerate(parent)) +		return 1; + +	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) +		return 0; + +	/* Flags needing groups don't count if only 1 group in parent */ +	if (parent->groups == parent->groups->next) { +		pflags &= ~(SD_LOAD_BALANCE | +				SD_BALANCE_NEWIDLE | +				SD_BALANCE_FORK | +				SD_BALANCE_EXEC | +				SD_SHARE_CPUPOWER | +				SD_SHARE_PKG_RESOURCES); +		if (nr_node_ids == 1) +			pflags &= ~SD_SERIALIZE; +	} +	if (~cflags & pflags) +		return 0; + +	return 1; +} + +static void free_rootdomain(struct rcu_head *rcu) +{ +	struct root_domain *rd = container_of(rcu, struct root_domain, rcu); + +	cpupri_cleanup(&rd->cpupri); +	free_cpumask_var(rd->rto_mask); +	free_cpumask_var(rd->online); +	free_cpumask_var(rd->span); +	kfree(rd); +} + +static void rq_attach_root(struct rq *rq, struct root_domain *rd) +{ +	struct root_domain *old_rd = NULL; +	unsigned long flags; + +	raw_spin_lock_irqsave(&rq->lock, flags); + +	if (rq->rd) { +		
old_rd = rq->rd; + +		if (cpumask_test_cpu(rq->cpu, old_rd->online)) +			set_rq_offline(rq); + +		cpumask_clear_cpu(rq->cpu, old_rd->span); + +		/* +		 * If we dont want to free the old_rt yet then +		 * set old_rd to NULL to skip the freeing later +		 * in this function: +		 */ +		if (!atomic_dec_and_test(&old_rd->refcount)) +			old_rd = NULL; +	} + +	atomic_inc(&rd->refcount); +	rq->rd = rd; + +	cpumask_set_cpu(rq->cpu, rd->span); +	if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) +		set_rq_online(rq); + +	raw_spin_unlock_irqrestore(&rq->lock, flags); + +	if (old_rd) +		call_rcu_sched(&old_rd->rcu, free_rootdomain); +} + +static int init_rootdomain(struct root_domain *rd) +{ +	memset(rd, 0, sizeof(*rd)); + +	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) +		goto out; +	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) +		goto free_span; +	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) +		goto free_online; + +	if (cpupri_init(&rd->cpupri) != 0) +		goto free_rto_mask; +	return 0; + +free_rto_mask: +	free_cpumask_var(rd->rto_mask); +free_online: +	free_cpumask_var(rd->online); +free_span: +	free_cpumask_var(rd->span); +out: +	return -ENOMEM; +} + +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). 
+ */ +struct root_domain def_root_domain; + +static void init_defrootdomain(void) +{ +	init_rootdomain(&def_root_domain); + +	atomic_set(&def_root_domain.refcount, 1); +} + +static struct root_domain *alloc_rootdomain(void) +{ +	struct root_domain *rd; + +	rd = kmalloc(sizeof(*rd), GFP_KERNEL); +	if (!rd) +		return NULL; + +	if (init_rootdomain(rd) != 0) { +		kfree(rd); +		return NULL; +	} + +	return rd; +} + +static void free_sched_groups(struct sched_group *sg, int free_sgp) +{ +	struct sched_group *tmp, *first; + +	if (!sg) +		return; + +	first = sg; +	do { +		tmp = sg->next; + +		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) +			kfree(sg->sgp); + +		kfree(sg); +		sg = tmp; +	} while (sg != first); +} + +static void free_sched_domain(struct rcu_head *rcu) +{ +	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + +	/* +	 * If its an overlapping domain it has private groups, iterate and +	 * nuke them all. +	 */ +	if (sd->flags & SD_OVERLAP) { +		free_sched_groups(sd->groups, 1); +	} else if (atomic_dec_and_test(&sd->groups->ref)) { +		kfree(sd->groups->sgp); +		kfree(sd->groups); +	} +	kfree(sd); +} + +static void destroy_sched_domain(struct sched_domain *sd, int cpu) +{ +	call_rcu(&sd->rcu, free_sched_domain); +} + +static void destroy_sched_domains(struct sched_domain *sd, int cpu) +{ +	for (; sd; sd = sd->parent) +		destroy_sched_domain(sd, cpu); +} + +/* + * Keep a special pointer to the highest sched_domain that has + * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this + * allows us to avoid some pointer chasing select_idle_sibling(). + * + * Also keep a unique ID per domain (we use the first cpu number in + * the cpumask of the domain), this allows us to quickly tell if + * two cpus are in the same cache domain, see ttwu_share_cache(). 
+ */
+DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_id);
+
+/*
+ * Recompute sd_llc and sd_llc_id for @cpu. When no domain carries
+ * SD_SHARE_PKG_RESOURCES, sd_llc becomes NULL and the id is @cpu itself.
+ */
+static void update_top_cache_domain(int cpu)
+{
+	struct sched_domain *sd;
+	int id = cpu;
+
+	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+	if (sd)
+		id = cpumask_first(sched_domain_span(sd));
+
+	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+	per_cpu(sd_llc_id, cpu) = id;
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct sched_domain *tmp;
+
+	/* Remove the sched domains which do not contribute to scheduling. */
+	for (tmp = sd; tmp; ) {
+		struct sched_domain *parent = tmp->parent;
+		if (!parent)
+			break;
+
+		if (sd_parent_degenerate(tmp, parent)) {
+			/* Unlink @parent; stay on @tmp to test its new parent. */
+			tmp->parent = parent->parent;
+			if (parent->parent)
+				parent->parent->child = tmp;
+			destroy_sched_domain(parent, cpu);
+		} else
+			tmp = tmp->parent;
+	}
+
+	/* The base domain itself may be degenerate: drop it too. */
+	if (sd && sd_degenerate(sd)) {
+		tmp = sd;
+		sd = sd->parent;
+		destroy_sched_domain(tmp, cpu);
+		if (sd)
+			sd->child = NULL;
+	}
+
+	sched_domain_debug(sd, cpu);
+
+	rq_attach_root(rq, rd);
+	/* Publish the new hierarchy, then queue the old one for freeing. */
+	tmp = rq->sd;
+	rcu_assign_pointer(rq->sd, sd);
+	destroy_sched_domains(tmp, cpu);
+
+	update_top_cache_domain(cpu);
+}
+
+/* cpus with isolated domains */
+static cpumask_var_t cpu_isolated_map;
+
+/* Setup the mask of cpus configured for isolated domains */
+static int __init isolated_cpu_setup(char *str)
+{
+	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+	/* NOTE(review): the result of cpulist_parse() is not checked here. */
+	cpulist_parse(str, cpu_isolated_map);
+	return 1;
+}
+
+__setup("isolcpus=", isolated_cpu_setup);
+
+#ifdef CONFIG_NUMA
+
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.
Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, nodemask_t *used_nodes)
+{
+	int best_node = -1;
+	int min_val = INT_MAX;
+	int i;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		/* Walk the node ids circularly, beginning at @node */
+		int n = (node + i) % nr_node_ids;
+		int val;
+
+		/* Ignore nodes without cpus and nodes that were taken already */
+		if (!nr_cpus_node(n) || node_isset(n, *used_nodes))
+			continue;
+
+		/* Keep the first candidate with the strictly smallest distance */
+		val = node_distance(node, n);
+		if (val >= min_val)
+			continue;
+
+		min_val = val;
+		best_node = n;
+	}
+
+	/* Record the winner so that the next call skips it */
+	if (best_node != -1)
+		node_set(best_node, *used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @span: resulting cpumask
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span. It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static void sched_domain_node_span(int node, struct cpumask *span)
+{
+	nodemask_t used_nodes;
+	int i;
+
+	cpumask_clear(span);
+	nodes_clear(used_nodes);
+
+	/* Start with the cpus of @node itself ... */
+	cpumask_or(span, span, cpumask_of_node(node));
+	node_set(node, used_nodes);
+
+	/* ... then add up to SD_NODES_PER_DOMAIN - 1 closest other nodes. */
+	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+		int next_node = find_next_best_node(node, &used_nodes);
+		if (next_node < 0)
+			break;
+		cpumask_or(span, span, cpumask_of_node(next_node));
+	}
+}
+
+/*
+ * Topology mask callback: node-level span of @cpu. The result lives in
+ * the shared sched_domains_tmpmask, hence the sched_domains_mutex assert.
+ */
+static const struct cpumask *cpu_node_mask(int cpu)
+{
+	lockdep_assert_held(&sched_domains_mutex);
+
+	sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
+
+	return sched_domains_tmpmask;
+}
+
+/* Topology mask callback: every possible cpu, regardless of @cpu. */
+static const struct cpumask *cpu_allnodes_mask(int cpu)
+{
+	return cpu_possible_mask;
+}
+#endif /* CONFIG_NUMA */
+
+/* Topology mask callback: the cpus of the node @cpu belongs to. */
+static const struct cpumask *cpu_cpu_mask(int cpu)
+{
+	return cpumask_of_node(cpu_to_node(cpu));
+}
+
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+
+/* Per-topology-level storage: per-cpu pointers to domain, group and power. */
+struct sd_data {
+	struct sched_domain **__percpu sd;
+	struct sched_group **__percpu sg;
+	struct sched_group_power **__percpu sgp;
+};
+
+/* Scratch state for one build_sched_domains() invocation. */
+struct s_data {
+	struct sched_domain ** __percpu sd;
+	struct root_domain	*rd;
+};
+
+/*
+ * How far allocation got; __free_domain_allocs() starts undoing from the
+ * given state and falls through the later ones.
+ */
+enum s_alloc {
+	sa_rootdomain,
+	sa_sd,
+	sa_sd_storage,
+	sa_none,
+};
+
+struct sched_domain_topology_level;
+
+typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
+typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
+
+#define SDTL_OVERLAP	0x01
+
+/* One level of the topology list: initialiser, span callback, flags, storage. */
+struct sched_domain_topology_level {
+	sched_domain_init_f init;
+	sched_domain_mask_f mask;
+	int		    flags;
+	struct sd_data      data;
+};
+
+/*
+ * Build the private, circular group list of an overlapping domain @sd for
+ * @cpu. Uses sched_domains_tmpmask as the "covered" scratch mask. Returns
+ * 0 on success or -ENOMEM, in which case the groups allocated so far are
+ * freed again.
+ */
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+	const struct cpumask *span = sched_domain_span(sd);
+	struct cpumask *covered = sched_domains_tmpmask;
+	struct sd_data *sdd = sd->private;
+	struct sched_domain *child;
+	int i;
+
+	cpumask_clear(covered);
+
+	for_each_cpu(i, span) {
+		struct cpumask *sg_span;
+
+		if (cpumask_test_cpu(i, covered))
+			continue;
+
+		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+				GFP_KERNEL, cpu_to_node(cpu));
+
+		if (!sg)
+			goto fail;
+
+		sg_span = sched_group_cpus(sg);
+
+		/*
+		 * The group spans the child domain of cpu i's domain at this
+		 * level, or just cpu i when there is no child.
+		 */
+		child = *per_cpu_ptr(sdd->sd, i);
+		if (child->child) {
+			child = child->child;
+			cpumask_copy(sg_span, sched_domain_span(child));
+		} else
+			cpumask_set_cpu(i, sg_span);
+
+		cpumask_or(covered, covered, sg_span);
+
+		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+		atomic_inc(&sg->sgp->ref);
+
+		/* The group containing @cpu becomes sd->groups. */
+		if (cpumask_test_cpu(cpu, sg_span))
+			groups = sg;
+
+		/* Append and keep the list circular after every insertion. */
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+		last->next = first;
+	}
+	sd->groups = groups;
+
+	return 0;
+
+fail:
+	free_sched_groups(first, 0);
+
+	return -ENOMEM;
+}
+
+/*
+ * Return the cpu that represents @cpu's group at this level: the first cpu
+ * of the child domain's span, or @cpu itself without a child. When @sg is
+ * non-NULL, also hand back that cpu's group, wired to its
+ * sched_group_power with the power refcount set to 1.
+ */
+static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
+{
+	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+	struct sched_domain *child = sd->child;
+
+	if (child)
+		cpu = cpumask_first(sched_domain_span(child));
+
+	if (sg) {
+		*sg = *per_cpu_ptr(sdd->sg, cpu);
+		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+	}
+
+	return cpu;
+}
+
+/*
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
+ */
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
+{
+	struct sched_group *first = NULL, *last = NULL;
+	struct sd_data *sdd = sd->private;
+	const struct cpumask *span = sched_domain_span(sd);
+	struct cpumask *covered;
+	int i;
+
+	/* Every cpu points sd->groups at its own group and takes a reference. */
+	get_group(cpu, sdd, &sd->groups);
+	atomic_inc(&sd->groups->ref);
+
+	/* Only the first cpu of the span builds the shared list. */
+	if (cpu != cpumask_first(sched_domain_span(sd)))
+		return 0;
+
+	lockdep_assert_held(&sched_domains_mutex);
+	covered = sched_domains_tmpmask;
+
+	cpumask_clear(covered);
+
+	for_each_cpu(i, span) {
+		struct sched_group *sg;
+		int group = get_group(i, sdd, &sg);
+		int j;
+
+		if (cpumask_test_cpu(i, covered))
+			continue;
+
+		cpumask_clear(sched_group_cpus(sg));
+		sg->sgp->power = 0;
+
+		/* Collect every cpu of the span that maps to the same group. */
+		for_each_cpu(j, span) {
+			if (get_group(j, sdd, NULL) != group)
+				continue;
+
+			cpumask_set_cpu(j, covered);
+			cpumask_set_cpu(j, sched_group_cpus(sg));
+		}
+
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+	}
+	/* Close the ring. */
+	last->next = first;
+
+	return 0;
+}
+
+/*
+ * Initialize sched groups cpu_power.
+ *
+ * cpu_power indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_power for all the groups in a sched domain will be same unless
+ * there are asymmetries in the topology. If there are asymmetries, group
+ * having more cpu_power will pickup more load compared to the group having
+ * less cpu_power.
+ */ +static void init_sched_groups_power(int cpu, struct sched_domain *sd) +{ +	struct sched_group *sg = sd->groups; + +	WARN_ON(!sd || !sg); + +	do { +		sg->group_weight = cpumask_weight(sched_group_cpus(sg)); +		sg = sg->next; +	} while (sg != sd->groups); + +	if (cpu != group_first_cpu(sg)) +		return; + +	update_group_power(sd, cpu); +	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); +} + +int __weak arch_sd_sibling_asym_packing(void) +{ +       return 0*SD_ASYM_PACKING; +} + +/* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(sd, type)		sd->name = #type +#else +# define SD_INIT_NAME(sd, type)		do { } while (0) +#endif + +#define SD_INIT_FUNC(type)						\ +static noinline struct sched_domain *					\ +sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\ +{									\ +	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);	\ +	*sd = SD_##type##_INIT;						\ +	SD_INIT_NAME(sd, type);						\ +	sd->private = &tl->data;					\ +	return sd;							\ +} + +SD_INIT_FUNC(CPU) +#ifdef CONFIG_NUMA + SD_INIT_FUNC(ALLNODES) + SD_INIT_FUNC(NODE) +#endif +#ifdef CONFIG_SCHED_SMT + SD_INIT_FUNC(SIBLING) +#endif +#ifdef CONFIG_SCHED_MC + SD_INIT_FUNC(MC) +#endif +#ifdef CONFIG_SCHED_BOOK + SD_INIT_FUNC(BOOK) +#endif + +static int default_relax_domain_level = -1; +int sched_domain_level_max; + +static int __init setup_relax_domain_level(char *str) +{ +	unsigned long val; + +	val = simple_strtoul(str, NULL, 0); +	if (val < sched_domain_level_max) +		default_relax_domain_level = val; + +	return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, +				 struct sched_domain_attr *attr) +{ +	int request; + +	if (!attr || attr->relax_domain_level < 0) { +		if (default_relax_domain_level < 0) +			return; +		else +			request = default_relax_domain_level; +	} else +		request 
= attr->relax_domain_level; +	if (request < sd->level) { +		/* turn off idle balance on this domain */ +		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); +	} else { +		/* turn on idle balance on this domain */ +		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); +	} +} + +static void __sdt_free(const struct cpumask *cpu_map); +static int __sdt_alloc(const struct cpumask *cpu_map); + +static void __free_domain_allocs(struct s_data *d, enum s_alloc what, +				 const struct cpumask *cpu_map) +{ +	switch (what) { +	case sa_rootdomain: +		if (!atomic_read(&d->rd->refcount)) +			free_rootdomain(&d->rd->rcu); /* fall through */ +	case sa_sd: +		free_percpu(d->sd); /* fall through */ +	case sa_sd_storage: +		__sdt_free(cpu_map); /* fall through */ +	case sa_none: +		break; +	} +} + +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, +						   const struct cpumask *cpu_map) +{ +	memset(d, 0, sizeof(*d)); + +	if (__sdt_alloc(cpu_map)) +		return sa_sd_storage; +	d->sd = alloc_percpu(struct sched_domain *); +	if (!d->sd) +		return sa_sd_storage; +	d->rd = alloc_rootdomain(); +	if (!d->rd) +		return sa_sd; +	return sa_rootdomain; +} + +/* + * NULL the sd_data elements we've used to build the sched_domain and + * sched_group structure so that the subsequent __free_domain_allocs() + * will not free the data we're using. + */ +static void claim_allocations(int cpu, struct sched_domain *sd) +{ +	struct sd_data *sdd = sd->private; + +	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); +	*per_cpu_ptr(sdd->sd, cpu) = NULL; + +	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) +		*per_cpu_ptr(sdd->sg, cpu) = NULL; + +	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) +		*per_cpu_ptr(sdd->sgp, cpu) = NULL; +} + +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *cpu_smt_mask(int cpu) +{ +	return topology_thread_cpumask(cpu); +} +#endif + +/* + * Topology list, bottom-up. 
+ */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT +	{ sd_init_SIBLING, cpu_smt_mask, }, +#endif +#ifdef CONFIG_SCHED_MC +	{ sd_init_MC, cpu_coregroup_mask, }, +#endif +#ifdef CONFIG_SCHED_BOOK +	{ sd_init_BOOK, cpu_book_mask, }, +#endif +	{ sd_init_CPU, cpu_cpu_mask, }, +#ifdef CONFIG_NUMA +	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, +	{ sd_init_ALLNODES, cpu_allnodes_mask, }, +#endif +	{ NULL, }, +}; + +static struct sched_domain_topology_level *sched_domain_topology = default_topology; + +static int __sdt_alloc(const struct cpumask *cpu_map) +{ +	struct sched_domain_topology_level *tl; +	int j; + +	for (tl = sched_domain_topology; tl->init; tl++) { +		struct sd_data *sdd = &tl->data; + +		sdd->sd = alloc_percpu(struct sched_domain *); +		if (!sdd->sd) +			return -ENOMEM; + +		sdd->sg = alloc_percpu(struct sched_group *); +		if (!sdd->sg) +			return -ENOMEM; + +		sdd->sgp = alloc_percpu(struct sched_group_power *); +		if (!sdd->sgp) +			return -ENOMEM; + +		for_each_cpu(j, cpu_map) { +			struct sched_domain *sd; +			struct sched_group *sg; +			struct sched_group_power *sgp; + +		       	sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), +					GFP_KERNEL, cpu_to_node(j)); +			if (!sd) +				return -ENOMEM; + +			*per_cpu_ptr(sdd->sd, j) = sd; + +			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), +					GFP_KERNEL, cpu_to_node(j)); +			if (!sg) +				return -ENOMEM; + +			*per_cpu_ptr(sdd->sg, j) = sg; + +			sgp = kzalloc_node(sizeof(struct sched_group_power), +					GFP_KERNEL, cpu_to_node(j)); +			if (!sgp) +				return -ENOMEM; + +			*per_cpu_ptr(sdd->sgp, j) = sgp; +		} +	} + +	return 0; +} + +static void __sdt_free(const struct cpumask *cpu_map) +{ +	struct sched_domain_topology_level *tl; +	int j; + +	for (tl = sched_domain_topology; tl->init; tl++) { +		struct sd_data *sdd = &tl->data; + +		for_each_cpu(j, cpu_map) { +			struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); +			if (sd && 
(sd->flags & SD_OVERLAP)) +				free_sched_groups(sd->groups, 0); +			kfree(*per_cpu_ptr(sdd->sd, j)); +			kfree(*per_cpu_ptr(sdd->sg, j)); +			kfree(*per_cpu_ptr(sdd->sgp, j)); +		} +		free_percpu(sdd->sd); +		free_percpu(sdd->sg); +		free_percpu(sdd->sgp); +	} +} + +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, +		struct s_data *d, const struct cpumask *cpu_map, +		struct sched_domain_attr *attr, struct sched_domain *child, +		int cpu) +{ +	struct sched_domain *sd = tl->init(tl, cpu); +	if (!sd) +		return child; + +	set_domain_attribute(sd, attr); +	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); +	if (child) { +		sd->level = child->level + 1; +		sched_domain_level_max = max(sched_domain_level_max, sd->level); +		child->parent = sd; +	} +	sd->child = child; + +	return sd; +} + +/* + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus + */ +static int build_sched_domains(const struct cpumask *cpu_map, +			       struct sched_domain_attr *attr) +{ +	enum s_alloc alloc_state = sa_none; +	struct sched_domain *sd; +	struct s_data d; +	int i, ret = -ENOMEM; + +	alloc_state = __visit_domain_allocation_hell(&d, cpu_map); +	if (alloc_state != sa_rootdomain) +		goto error; + +	/* Set up domains for cpus specified by the cpu_map. 
*/ +	for_each_cpu(i, cpu_map) { +		struct sched_domain_topology_level *tl; + +		sd = NULL; +		for (tl = sched_domain_topology; tl->init; tl++) { +			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); +			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) +				sd->flags |= SD_OVERLAP; +			if (cpumask_equal(cpu_map, sched_domain_span(sd))) +				break; +		} + +		while (sd->child) +			sd = sd->child; + +		*per_cpu_ptr(d.sd, i) = sd; +	} + +	/* Build the groups for the domains */ +	for_each_cpu(i, cpu_map) { +		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { +			sd->span_weight = cpumask_weight(sched_domain_span(sd)); +			if (sd->flags & SD_OVERLAP) { +				if (build_overlap_sched_groups(sd, i)) +					goto error; +			} else { +				if (build_sched_groups(sd, i)) +					goto error; +			} +		} +	} + +	/* Calculate CPU power for physical packages and nodes */ +	for (i = nr_cpumask_bits-1; i >= 0; i--) { +		if (!cpumask_test_cpu(i, cpu_map)) +			continue; + +		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { +			claim_allocations(i, sd); +			init_sched_groups_power(i, sd); +		} +	} + +	/* Attach the domains */ +	rcu_read_lock(); +	for_each_cpu(i, cpu_map) { +		sd = *per_cpu_ptr(d.sd, i); +		cpu_attach_domain(sd, d.rd, i); +	} +	rcu_read_unlock(); + +	ret = 0; +error: +	__free_domain_allocs(&d, alloc_state, cpu_map); +	return ret; +} + +static cpumask_var_t *doms_cur;	/* current sched domains */ +static int ndoms_cur;		/* number of sched domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; +				/* attribues of custom domains in 'doms_cur' */ + +/* + * Special case: If a kmalloc of a doms_cur partition (array of + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. + */ +static cpumask_var_t fallback_doms; + +/* + * arch_update_cpu_topology lets virtualized architectures update the + * cpu core maps. 
It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
+{
+	return 0;
+}
+
+/*
+ * Allocate an array of @ndoms cpumasks. Returns NULL on failure, after
+ * freeing any masks that were already allocated.
+ */
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+	int i;
+	cpumask_var_t *doms;
+
+	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+	if (!doms)
+		return NULL;
+	for (i = 0; i < ndoms; i++) {
+		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+			/* Only the first i masks exist at this point. */
+			free_sched_domains(doms, i);
+			return NULL;
+		}
+	}
+	return doms;
+}
+
+/* Free the first @ndoms masks of @doms and then the array itself. */
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+	unsigned int i;
+	for (i = 0; i < ndoms; i++)
+		free_cpumask_var(doms[i]);
+	kfree(doms);
+}
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+static int init_sched_domains(const struct cpumask *cpu_map)
+{
+	int err;
+
+	arch_update_cpu_topology();
+	ndoms_cur = 1;
+	doms_cur = alloc_sched_domains(ndoms_cur);
+	/* Allocation failure: fall back to the single static mask. */
+	if (!doms_cur)
+		doms_cur = &fallback_doms;
+	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+	dattr_cur = NULL;
+	err = build_sched_domains(doms_cur[0], NULL);
+	register_sched_domain_sysctl();
+
+	return err;
+}
+
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static void detach_destroy_domains(const struct cpumask *cpu_map)
+{
+	int i;
+
+	rcu_read_lock();
+	for_each_cpu(i, cpu_map)
+		cpu_attach_domain(NULL, &def_root_domain, i);
+	rcu_read_unlock();
+}
+
+/* handle null as "default" */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+			struct sched_domain_attr *new, int idx_new)
+{
+	struct sched_domain_attr tmp;
+
+	/* fast path */
+	if (!new && !cur)
+		return 1;
+
+	/* A missing array compares as an entry holding SD_ATTR_INIT. */
+	tmp = SD_ATTR_INIT;
+	return !memcmp(cur ? (cur + idx_cur) : &tmp,
+			new ? (new + idx_new) : &tmp,
+			sizeof(struct sched_domain_attr));
+}
+
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap.) We should setup one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains.  This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms', it also forces the domains to be rebuilt.
+ *
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+			     struct sched_domain_attr *dattr_new)
+{
+	int i, j, n;
+	int new_topology;
+
+	mutex_lock(&sched_domains_mutex);
+
+	/* always unregister in case we don't destroy any domains */
+	unregister_sched_domain_sysctl();
+
+	/* Let architecture update cpu core mappings. */
+	new_topology = arch_update_cpu_topology();
+
+	/* With no new array there is nothing to match against. */
+	n = doms_new ? ndoms_new : 0;
+
+	/* Destroy deleted domains */
+	for (i = 0; i < ndoms_cur; i++) {
+		/* A topology change disables matching: everything is rebuilt. */
+		for (j = 0; j < n && !new_topology; j++) {
+			if (cpumask_equal(doms_cur[i], doms_new[j])
+			    && dattrs_equal(dattr_cur, i, dattr_new, j))
+				goto match1;
+		}
+		/* no match - a current sched domain not in new doms_new[] */
+		detach_destroy_domains(doms_cur[i]);
+match1:
+		;
+	}
+
+	if (doms_new == NULL) {
+		/* ndoms_cur = 0 makes the build loop below match nothing. */
+		ndoms_cur = 0;
+		doms_new = &fallback_doms;
+		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+		WARN_ON_ONCE(dattr_new);
+	}
+
+	/* Build new domains */
+	for (i = 0; i < ndoms_new; i++) {
+		for (j = 0; j < ndoms_cur && !new_topology; j++) {
+			if (cpumask_equal(doms_new[i], doms_cur[j])
+			    && dattrs_equal(dattr_new, i, dattr_cur, j))
+				goto match2;
+		}
+		/* no match - add a new doms_new */
+		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+match2:
+		;
+	}
+
+	/* Remember the new sched domains */
+	if (doms_cur != &fallback_doms)
+		free_sched_domains(doms_cur, ndoms_cur);
+	kfree(dattr_cur);	/* kfree(NULL) is safe */
+	doms_cur = doms_new;
+	dattr_cur = dattr_new;
+	ndoms_cur = ndoms_new;
+
+	register_sched_domain_sysctl();
+
+	mutex_unlock(&sched_domains_mutex);
+}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/* Tear down all sched domains and rebuild them, with cpu hotplug held off. */
+static void reinit_sched_domains(void)
+{
+	get_online_cpus();
+
+	/* Destroy domains first to force the rebuild */
+	partition_sched_domains(0, NULL, NULL);
+
+	rebuild_sched_domains();
+	put_online_cpus();
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{
+	unsigned int level = 0;
+
+	if (sscanf(buf, "%u", &level) != 1)
+		return -EINVAL;
+
+	/*
+	 * level is always be positive so don't check for
+	 * level < POWERSAVINGS_BALANCE_NONE which is 0
+	 * What happens on 0 or 1 byte write,
+	 * need to check for count as well?
+	 */ + +	if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) +		return -EINVAL; + +	if (smt) +		sched_smt_power_savings = level; +	else +		sched_mc_power_savings = level; + +	reinit_sched_domains(); + +	return count; +} + +#ifdef CONFIG_SCHED_MC +static ssize_t sched_mc_power_savings_show(struct device *dev, +					   struct device_attribute *attr, +					   char *buf) +{ +	return sprintf(buf, "%u\n", sched_mc_power_savings); +} +static ssize_t sched_mc_power_savings_store(struct device *dev, +					    struct device_attribute *attr, +					    const char *buf, size_t count) +{ +	return sched_power_savings_store(buf, count, 0); +} +static DEVICE_ATTR(sched_mc_power_savings, 0644, +		   sched_mc_power_savings_show, +		   sched_mc_power_savings_store); +#endif + +#ifdef CONFIG_SCHED_SMT +static ssize_t sched_smt_power_savings_show(struct device *dev, +					    struct device_attribute *attr, +					    char *buf) +{ +	return sprintf(buf, "%u\n", sched_smt_power_savings); +} +static ssize_t sched_smt_power_savings_store(struct device *dev, +					    struct device_attribute *attr, +					     const char *buf, size_t count) +{ +	return sched_power_savings_store(buf, count, 1); +} +static DEVICE_ATTR(sched_smt_power_savings, 0644, +		   sched_smt_power_savings_show, +		   sched_smt_power_savings_store); +#endif + +int __init sched_create_sysfs_power_savings_entries(struct device *dev) +{ +	int err = 0; + +#ifdef CONFIG_SCHED_SMT +	if (smt_capable()) +		err = device_create_file(dev, &dev_attr_sched_smt_power_savings); +#endif +#ifdef CONFIG_SCHED_MC +	if (!err && mc_capable()) +		err = device_create_file(dev, &dev_attr_sched_mc_power_savings); +#endif +	return err; +} +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ + +/* + * Update cpusets according to cpu_active mask.  If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). 
+ */ +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, +			     void *hcpu) +{ +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_ONLINE: +	case CPU_DOWN_FAILED: +		cpuset_update_active_cpus(); +		return NOTIFY_OK; +	default: +		return NOTIFY_DONE; +	} +} + +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, +			       void *hcpu) +{ +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_DOWN_PREPARE: +		cpuset_update_active_cpus(); +		return NOTIFY_OK; +	default: +		return NOTIFY_DONE; +	} +} + +void __init sched_init_smp(void) +{ +	cpumask_var_t non_isolated_cpus; + +	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); +	alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + +	get_online_cpus(); +	mutex_lock(&sched_domains_mutex); +	init_sched_domains(cpu_active_mask); +	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); +	if (cpumask_empty(non_isolated_cpus)) +		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); +	mutex_unlock(&sched_domains_mutex); +	put_online_cpus(); + +	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); +	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); + +	/* RT runtime code needs to handle some hotplug events */ +	hotcpu_notifier(update_runtime, 0); + +	init_hrtick(); + +	/* Move init over to a non-isolated CPU */ +	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) +		BUG(); +	sched_init_granularity(); +	free_cpumask_var(non_isolated_cpus); + +	init_sched_rt_class(); +} +#else +void __init sched_init_smp(void) +{ +	sched_init_granularity(); +} +#endif /* CONFIG_SMP */ + +const_debug unsigned int sysctl_timer_migration = 1; + +int in_sched_functions(unsigned long addr) +{ +	return in_lock_functions(addr) || +		(addr >= (unsigned long)__sched_text_start +		&& addr < (unsigned long)__sched_text_end); +} + +#ifdef CONFIG_CGROUP_SCHED +struct task_group root_task_group; +#endif + +DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); + +void 
__init sched_init(void) +{ +	int i, j; +	unsigned long alloc_size = 0, ptr; + +#ifdef CONFIG_FAIR_GROUP_SCHED +	alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_RT_GROUP_SCHED +	alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_CPUMASK_OFFSTACK +	alloc_size += num_possible_cpus() * cpumask_size(); +#endif +	if (alloc_size) { +		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); + +#ifdef CONFIG_FAIR_GROUP_SCHED +		root_task_group.se = (struct sched_entity **)ptr; +		ptr += nr_cpu_ids * sizeof(void **); + +		root_task_group.cfs_rq = (struct cfs_rq **)ptr; +		ptr += nr_cpu_ids * sizeof(void **); + +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED +		root_task_group.rt_se = (struct sched_rt_entity **)ptr; +		ptr += nr_cpu_ids * sizeof(void **); + +		root_task_group.rt_rq = (struct rt_rq **)ptr; +		ptr += nr_cpu_ids * sizeof(void **); + +#endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_CPUMASK_OFFSTACK +		for_each_possible_cpu(i) { +			per_cpu(load_balance_tmpmask, i) = (void *)ptr; +			ptr += cpumask_size(); +		} +#endif /* CONFIG_CPUMASK_OFFSTACK */ +	} + +#ifdef CONFIG_SMP +	init_defrootdomain(); +#endif + +	init_rt_bandwidth(&def_rt_bandwidth, +			global_rt_period(), global_rt_runtime()); + +#ifdef CONFIG_RT_GROUP_SCHED +	init_rt_bandwidth(&root_task_group.rt_bandwidth, +			global_rt_period(), global_rt_runtime()); +#endif /* CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_CGROUP_SCHED +	list_add(&root_task_group.list, &task_groups); +	INIT_LIST_HEAD(&root_task_group.children); +	INIT_LIST_HEAD(&root_task_group.siblings); +	autogroup_init(&init_task); + +#endif /* CONFIG_CGROUP_SCHED */ + +#ifdef CONFIG_CGROUP_CPUACCT +	root_cpuacct.cpustat = &kernel_cpustat; +	root_cpuacct.cpuusage = alloc_percpu(u64); +	/* Too early, not expected to fail */ +	BUG_ON(!root_cpuacct.cpuusage); +#endif +	for_each_possible_cpu(i) { +		struct rq *rq; + +		rq = cpu_rq(i); +		raw_spin_lock_init(&rq->lock); +		rq->nr_running = 0; 
+		rq->calc_load_active = 0; +		rq->calc_load_update = jiffies + LOAD_FREQ; +		init_cfs_rq(&rq->cfs); +		init_rt_rq(&rq->rt, rq); +#ifdef CONFIG_FAIR_GROUP_SCHED +		root_task_group.shares = ROOT_TASK_GROUP_LOAD; +		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); +		/* +		 * How much cpu bandwidth does root_task_group get? +		 * +		 * In case of task-groups formed thr' the cgroup filesystem, it +		 * gets 100% of the cpu resources in the system. This overall +		 * system cpu resource is divided among the tasks of +		 * root_task_group and its child task-groups in a fair manner, +		 * based on each entity's (task or task-group's) weight +		 * (se->load.weight). +		 * +		 * In other words, if root_task_group has 10 tasks of weight +		 * 1024) and two child groups A0 and A1 (of weight 1024 each), +		 * then A0's share of the cpu resource is: +		 * +		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% +		 * +		 * We achieve this by letting root_task_group's tasks sit +		 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 
+		 */ +		init_cfs_bandwidth(&root_task_group.cfs_bandwidth); +		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; +#ifdef CONFIG_RT_GROUP_SCHED +		INIT_LIST_HEAD(&rq->leaf_rt_rq_list); +		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); +#endif + +		for (j = 0; j < CPU_LOAD_IDX_MAX; j++) +			rq->cpu_load[j] = 0; + +		rq->last_load_update_tick = jiffies; + +#ifdef CONFIG_SMP +		rq->sd = NULL; +		rq->rd = NULL; +		rq->cpu_power = SCHED_POWER_SCALE; +		rq->post_schedule = 0; +		rq->active_balance = 0; +		rq->next_balance = jiffies; +		rq->push_cpu = 0; +		rq->cpu = i; +		rq->online = 0; +		rq->idle_stamp = 0; +		rq->avg_idle = 2*sysctl_sched_migration_cost; +		rq_attach_root(rq, &def_root_domain); +#ifdef CONFIG_NO_HZ +		rq->nohz_flags = 0; +#endif +#endif +		init_rq_hrtick(rq); +		atomic_set(&rq->nr_iowait, 0); +	} + +	set_load_weight(&init_task); + +#ifdef CONFIG_PREEMPT_NOTIFIERS +	INIT_HLIST_HEAD(&init_task.preempt_notifiers); +#endif + +#ifdef CONFIG_RT_MUTEXES +	plist_head_init(&init_task.pi_waiters); +#endif + +	/* +	 * The boot idle thread does lazy MMU switching as well: +	 */ +	atomic_inc(&init_mm.mm_count); +	enter_lazy_tlb(&init_mm, current); + +	/* +	 * Make us the idle thread. Technically, schedule() should not be +	 * called from this thread, however somewhere below it might be, +	 * but because we are the idle thread, we just pick up running again +	 * when this runqueue becomes "idle". 
+	 */ +	init_idle(current, smp_processor_id()); + +	calc_load_update = jiffies + LOAD_FREQ; + +	/* +	 * During early bootup we pretend to be a normal task: +	 */ +	current->sched_class = &fair_sched_class; + +#ifdef CONFIG_SMP +	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); +	/* May be allocated at isolcpus cmdline parse time */ +	if (cpu_isolated_map == NULL) +		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); +#endif +	init_sched_fair_class(); + +	scheduler_running = 1; +} + +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +static inline int preempt_count_equals(int preempt_offset) +{ +	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + +	return (nested == preempt_offset); +} + +void __might_sleep(const char *file, int line, int preempt_offset) +{ +	static unsigned long prev_jiffy;	/* ratelimiting */ + +	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ +	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || +	    system_state != SYSTEM_RUNNING || oops_in_progress) +		return; +	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) +		return; +	prev_jiffy = jiffies; + +	printk(KERN_ERR +		"BUG: sleeping function called from invalid context at %s:%d\n", +			file, line); +	printk(KERN_ERR +		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", +			in_atomic(), irqs_disabled(), +			current->pid, current->comm); + +	debug_show_held_locks(current); +	if (irqs_disabled()) +		print_irqtrace_events(current); +	dump_stack(); +} +EXPORT_SYMBOL(__might_sleep); +#endif + +#ifdef CONFIG_MAGIC_SYSRQ +static void normalize_task(struct rq *rq, struct task_struct *p) +{ +	const struct sched_class *prev_class = p->sched_class; +	int old_prio = p->prio; +	int on_rq; + +	on_rq = p->on_rq; +	if (on_rq) +		deactivate_task(rq, p, 0); +	__setscheduler(rq, p, SCHED_NORMAL, 0); +	if (on_rq) { +		activate_task(rq, p, 0); +		resched_task(rq->curr); +	} + +	check_class_changed(rq, p, prev_class, old_prio); +} + +void 
normalize_rt_tasks(void) +{ +	struct task_struct *g, *p; +	unsigned long flags; +	struct rq *rq; + +	read_lock_irqsave(&tasklist_lock, flags); +	do_each_thread(g, p) { +		/* +		 * Only normalize user tasks: +		 */ +		if (!p->mm) +			continue; + +		p->se.exec_start		= 0; +#ifdef CONFIG_SCHEDSTATS +		p->se.statistics.wait_start	= 0; +		p->se.statistics.sleep_start	= 0; +		p->se.statistics.block_start	= 0; +#endif + +		if (!rt_task(p)) { +			/* +			 * Renice negative nice level userspace +			 * tasks back to 0: +			 */ +			if (TASK_NICE(p) < 0 && p->mm) +				set_user_nice(p, 0); +			continue; +		} + +		raw_spin_lock(&p->pi_lock); +		rq = __task_rq_lock(p); + +		normalize_task(rq, p); + +		__task_rq_unlock(rq); +		raw_spin_unlock(&p->pi_lock); +	} while_each_thread(g, p); + +	read_unlock_irqrestore(&tasklist_lock, flags); +} + +#endif /* CONFIG_MAGIC_SYSRQ */ + +#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) +/* + * These functions are only useful for the IA64 MCA handling, or kdb. + * + * They can only be called when the whole system has been + * stopped - every CPU needs to be quiescent, and no scheduling + * activity can take place. Using them for anything else would + * be a serious bug, and as a result, they aren't even visible + * under any other configuration. + */ + +/** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +struct task_struct *curr_task(int cpu) +{ +	return cpu_curr(cpu); +} + +#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ + +#ifdef CONFIG_IA64 +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. + * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. 
This function + * must be called with all CPU's synchronized, and interrupts disabled, the + * and caller must save the original value of the current task (see + * curr_task() above) and restore that value before reenabling interrupts and + * re-starting the system. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +void set_curr_task(int cpu, struct task_struct *p) +{ +	cpu_curr(cpu) = p; +} + +#endif + +#ifdef CONFIG_CGROUP_SCHED +/* task_group_lock serializes the addition/removal of task groups */ +static DEFINE_SPINLOCK(task_group_lock); + +static void free_sched_group(struct task_group *tg) +{ +	free_fair_sched_group(tg); +	free_rt_sched_group(tg); +	autogroup_free(tg); +	kfree(tg); +} + +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(struct task_group *parent) +{ +	struct task_group *tg; +	unsigned long flags; + +	tg = kzalloc(sizeof(*tg), GFP_KERNEL); +	if (!tg) +		return ERR_PTR(-ENOMEM); + +	if (!alloc_fair_sched_group(tg, parent)) +		goto err; + +	if (!alloc_rt_sched_group(tg, parent)) +		goto err; + +	spin_lock_irqsave(&task_group_lock, flags); +	list_add_rcu(&tg->list, &task_groups); + +	WARN_ON(!parent); /* root should already exist */ + +	tg->parent = parent; +	INIT_LIST_HEAD(&tg->children); +	list_add_rcu(&tg->siblings, &parent->children); +	spin_unlock_irqrestore(&task_group_lock, flags); + +	return tg; + +err: +	free_sched_group(tg); +	return ERR_PTR(-ENOMEM); +} + +/* rcu callback to free various structures associated with a task group */ +static void free_sched_group_rcu(struct rcu_head *rhp) +{ +	/* now it should be safe to free those cfs_rqs */ +	free_sched_group(container_of(rhp, struct task_group, rcu)); +} + +/* Destroy runqueue etc associated with a task group */ +void sched_destroy_group(struct task_group *tg) +{ +	unsigned long flags; +	int i; + +	/* end participation in shares distribution */ +	for_each_possible_cpu(i) +		unregister_fair_sched_group(tg, i); + +	
spin_lock_irqsave(&task_group_lock, flags); +	list_del_rcu(&tg->list); +	list_del_rcu(&tg->siblings); +	spin_unlock_irqrestore(&task_group_lock, flags); + +	/* wait for possible concurrent references to cfs_rqs complete */ +	call_rcu(&tg->rcu, free_sched_group_rcu); +} + +/* change task's runqueue when it moves between groups. + *	The caller of this function should have put the task in its new group + *	by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to + *	reflect its new group. + */ +void sched_move_task(struct task_struct *tsk) +{ +	int on_rq, running; +	unsigned long flags; +	struct rq *rq; + +	rq = task_rq_lock(tsk, &flags); + +	running = task_current(rq, tsk); +	on_rq = tsk->on_rq; + +	if (on_rq) +		dequeue_task(rq, tsk, 0); +	if (unlikely(running)) +		tsk->sched_class->put_prev_task(rq, tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +	if (tsk->sched_class->task_move_group) +		tsk->sched_class->task_move_group(tsk, on_rq); +	else +#endif +		set_task_rq(tsk, task_cpu(tsk)); + +	if (unlikely(running)) +		tsk->sched_class->set_curr_task(rq); +	if (on_rq) +		enqueue_task(rq, tsk, 0); + +	task_rq_unlock(rq, tsk, &flags); +} +#endif /* CONFIG_CGROUP_SCHED */ + +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) +static unsigned long to_ratio(u64 period, u64 runtime) +{ +	if (runtime == RUNTIME_INF) +		return 1ULL << 20; + +	return div64_u64(runtime << 20, period); +} +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. 
+ */ +static DEFINE_MUTEX(rt_constraints_mutex); + +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) +{ +	struct task_struct *g, *p; + +	do_each_thread(g, p) { +		if (rt_task(p) && task_rq(p)->rt.tg == tg) +			return 1; +	} while_each_thread(g, p); + +	return 0; +} + +struct rt_schedulable_data { +	struct task_group *tg; +	u64 rt_period; +	u64 rt_runtime; +}; + +static int tg_rt_schedulable(struct task_group *tg, void *data) +{ +	struct rt_schedulable_data *d = data; +	struct task_group *child; +	unsigned long total, sum = 0; +	u64 period, runtime; + +	period = ktime_to_ns(tg->rt_bandwidth.rt_period); +	runtime = tg->rt_bandwidth.rt_runtime; + +	if (tg == d->tg) { +		period = d->rt_period; +		runtime = d->rt_runtime; +	} + +	/* +	 * Cannot have more runtime than the period. +	 */ +	if (runtime > period && runtime != RUNTIME_INF) +		return -EINVAL; + +	/* +	 * Ensure we don't starve existing RT tasks. +	 */ +	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) +		return -EBUSY; + +	total = to_ratio(period, runtime); + +	/* +	 * Nobody can have more than the global setting allows. +	 */ +	if (total > to_ratio(global_rt_period(), global_rt_runtime())) +		return -EINVAL; + +	/* +	 * The sum of our children's runtime should not exceed our own. 
+	 */ +	list_for_each_entry_rcu(child, &tg->children, siblings) { +		period = ktime_to_ns(child->rt_bandwidth.rt_period); +		runtime = child->rt_bandwidth.rt_runtime; + +		if (child == d->tg) { +			period = d->rt_period; +			runtime = d->rt_runtime; +		} + +		sum += to_ratio(period, runtime); +	} + +	if (sum > total) +		return -EINVAL; + +	return 0; +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ +	int ret; + +	struct rt_schedulable_data data = { +		.tg = tg, +		.rt_period = period, +		.rt_runtime = runtime, +	}; + +	rcu_read_lock(); +	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); +	rcu_read_unlock(); + +	return ret; +} + +static int tg_set_rt_bandwidth(struct task_group *tg, +		u64 rt_period, u64 rt_runtime) +{ +	int i, err = 0; + +	mutex_lock(&rt_constraints_mutex); +	read_lock(&tasklist_lock); +	err = __rt_schedulable(tg, rt_period, rt_runtime); +	if (err) +		goto unlock; + +	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); +	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); +	tg->rt_bandwidth.rt_runtime = rt_runtime; + +	for_each_possible_cpu(i) { +		struct rt_rq *rt_rq = tg->rt_rq[i]; + +		raw_spin_lock(&rt_rq->rt_runtime_lock); +		rt_rq->rt_runtime = rt_runtime; +		raw_spin_unlock(&rt_rq->rt_runtime_lock); +	} +	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); +unlock: +	read_unlock(&tasklist_lock); +	mutex_unlock(&rt_constraints_mutex); + +	return err; +} + +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ +	u64 rt_runtime, rt_period; + +	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); +	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; +	if (rt_runtime_us < 0) +		rt_runtime = RUNTIME_INF; + +	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_runtime(struct task_group *tg) +{ +	u64 rt_runtime_us; + +	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) +		return -1; + +	rt_runtime_us = tg->rt_bandwidth.rt_runtime; +	do_div(rt_runtime_us, 
NSEC_PER_USEC); +	return rt_runtime_us; +} + +int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +{ +	u64 rt_runtime, rt_period; + +	rt_period = (u64)rt_period_us * NSEC_PER_USEC; +	rt_runtime = tg->rt_bandwidth.rt_runtime; + +	if (rt_period == 0) +		return -EINVAL; + +	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ +	u64 rt_period_us; + +	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); +	do_div(rt_period_us, NSEC_PER_USEC); +	return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ +	u64 runtime, period; +	int ret = 0; + +	if (sysctl_sched_rt_period <= 0) +		return -EINVAL; + +	runtime = global_rt_runtime(); +	period = global_rt_period(); + +	/* +	 * Sanity check on the sysctl variables. +	 */ +	if (runtime > period && runtime != RUNTIME_INF) +		return -EINVAL; + +	mutex_lock(&rt_constraints_mutex); +	read_lock(&tasklist_lock); +	ret = __rt_schedulable(NULL, 0, 0); +	read_unlock(&tasklist_lock); +	mutex_unlock(&rt_constraints_mutex); + +	return ret; +} + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ +	/* Don't accept realtime tasks when there is no way for them to run */ +	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) +		return 0; + +	return 1; +} + +#else /* !CONFIG_RT_GROUP_SCHED */ +static int sched_rt_global_constraints(void) +{ +	unsigned long flags; +	int i; + +	if (sysctl_sched_rt_period <= 0) +		return -EINVAL; + +	/* +	 * There's always some RT tasks in the root group +	 * -- migration, kstopmachine etc.. 
+	 */ +	if (sysctl_sched_rt_runtime == 0) +		return -EBUSY; + +	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); +	for_each_possible_cpu(i) { +		struct rt_rq *rt_rq = &cpu_rq(i)->rt; + +		raw_spin_lock(&rt_rq->rt_runtime_lock); +		rt_rq->rt_runtime = global_rt_runtime(); +		raw_spin_unlock(&rt_rq->rt_runtime_lock); +	} +	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + +	return 0; +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +int sched_rt_handler(struct ctl_table *table, int write, +		void __user *buffer, size_t *lenp, +		loff_t *ppos) +{ +	int ret; +	int old_period, old_runtime; +	static DEFINE_MUTEX(mutex); + +	mutex_lock(&mutex); +	old_period = sysctl_sched_rt_period; +	old_runtime = sysctl_sched_rt_runtime; + +	ret = proc_dointvec(table, write, buffer, lenp, ppos); + +	if (!ret && write) { +		ret = sched_rt_global_constraints(); +		if (ret) { +			sysctl_sched_rt_period = old_period; +			sysctl_sched_rt_runtime = old_runtime; +		} else { +			def_rt_bandwidth.rt_runtime = global_rt_runtime(); +			def_rt_bandwidth.rt_period = +				ns_to_ktime(global_rt_period()); +		} +	} +	mutex_unlock(&mutex); + +	return ret; +} + +#ifdef CONFIG_CGROUP_SCHED + +/* return corresponding task_group object of a cgroup */ +static inline struct task_group *cgroup_tg(struct cgroup *cgrp) +{ +	return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), +			    struct task_group, css); +} + +static struct cgroup_subsys_state * +cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ +	struct task_group *tg, *parent; + +	if (!cgrp->parent) { +		/* This is early initialization for the top cgroup */ +		return &root_task_group.css; +	} + +	parent = cgroup_tg(cgrp->parent); +	tg = sched_create_group(parent); +	if (IS_ERR(tg)) +		return ERR_PTR(-ENOMEM); + +	return &tg->css; +} + +static void +cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ +	struct task_group *tg = cgroup_tg(cgrp); + +	sched_destroy_group(tg); +} + 
+static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +				 struct cgroup_taskset *tset) +{ +	struct task_struct *task; + +	cgroup_taskset_for_each(task, cgrp, tset) { +#ifdef CONFIG_RT_GROUP_SCHED +		if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) +			return -EINVAL; +#else +		/* We don't support RT-tasks being in separate groups */ +		if (task->sched_class != &fair_sched_class) +			return -EINVAL; +#endif +	} +	return 0; +} + +static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +			      struct cgroup_taskset *tset) +{ +	struct task_struct *task; + +	cgroup_taskset_for_each(task, cgrp, tset) +		sched_move_task(task); +} + +static void +cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, +		struct cgroup *old_cgrp, struct task_struct *task) +{ +	/* +	 * cgroup_exit() is called in the copy_process() failure path. +	 * Ignore this case since the task hasn't ran yet, this avoids +	 * trying to poke a half freed task state from generic code. 
+	 */ +	if (!(task->flags & PF_EXITING)) +		return; + +	sched_move_task(task); +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, +				u64 shareval) +{ +	return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); +} + +static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ +	struct task_group *tg = cgroup_tg(cgrp); + +	return (u64) scale_load_down(tg->shares); +} + +#ifdef CONFIG_CFS_BANDWIDTH +static DEFINE_MUTEX(cfs_constraints_mutex); + +const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ +const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ + +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); + +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) +{ +	int i, ret = 0, runtime_enabled, runtime_was_enabled; +	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + +	if (tg == &root_task_group) +		return -EINVAL; + +	/* +	 * Ensure we have at some amount of bandwidth every period.  This is +	 * to prevent reaching a state of large arrears when throttled via +	 * entity_tick() resulting in prolonged exit starvation. +	 */ +	if (quota < min_cfs_quota_period || period < min_cfs_quota_period) +		return -EINVAL; + +	/* +	 * Likewise, bound things on the otherside by preventing insane quota +	 * periods.  This also allows us to normalize in computing quota +	 * feasibility. 
+	 */ +	if (period > max_cfs_quota_period) +		return -EINVAL; + +	mutex_lock(&cfs_constraints_mutex); +	ret = __cfs_schedulable(tg, period, quota); +	if (ret) +		goto out_unlock; + +	runtime_enabled = quota != RUNTIME_INF; +	runtime_was_enabled = cfs_b->quota != RUNTIME_INF; +	account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); +	raw_spin_lock_irq(&cfs_b->lock); +	cfs_b->period = ns_to_ktime(period); +	cfs_b->quota = quota; + +	__refill_cfs_bandwidth_runtime(cfs_b); +	/* restart the period timer (if active) to handle new period expiry */ +	if (runtime_enabled && cfs_b->timer_active) { +		/* force a reprogram */ +		cfs_b->timer_active = 0; +		__start_cfs_bandwidth(cfs_b); +	} +	raw_spin_unlock_irq(&cfs_b->lock); + +	for_each_possible_cpu(i) { +		struct cfs_rq *cfs_rq = tg->cfs_rq[i]; +		struct rq *rq = cfs_rq->rq; + +		raw_spin_lock_irq(&rq->lock); +		cfs_rq->runtime_enabled = runtime_enabled; +		cfs_rq->runtime_remaining = 0; + +		if (cfs_rq->throttled) +			unthrottle_cfs_rq(cfs_rq); +		raw_spin_unlock_irq(&rq->lock); +	} +out_unlock: +	mutex_unlock(&cfs_constraints_mutex); + +	return ret; +} + +int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +{ +	u64 quota, period; + +	period = ktime_to_ns(tg->cfs_bandwidth.period); +	if (cfs_quota_us < 0) +		quota = RUNTIME_INF; +	else +		quota = (u64)cfs_quota_us * NSEC_PER_USEC; + +	return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_quota(struct task_group *tg) +{ +	u64 quota_us; + +	if (tg->cfs_bandwidth.quota == RUNTIME_INF) +		return -1; + +	quota_us = tg->cfs_bandwidth.quota; +	do_div(quota_us, NSEC_PER_USEC); + +	return quota_us; +} + +int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +{ +	u64 quota, period; + +	period = (u64)cfs_period_us * NSEC_PER_USEC; +	quota = tg->cfs_bandwidth.quota; + +	return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_period(struct task_group *tg) +{ +	u64 cfs_period_us; + +	cfs_period_us = 
ktime_to_ns(tg->cfs_bandwidth.period); +	do_div(cfs_period_us, NSEC_PER_USEC); + +	return cfs_period_us; +} + +static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) +{ +	return tg_get_cfs_quota(cgroup_tg(cgrp)); +} + +static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, +				s64 cfs_quota_us) +{ +	return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); +} + +static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ +	return tg_get_cfs_period(cgroup_tg(cgrp)); +} + +static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, +				u64 cfs_period_us) +{ +	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); +} + +struct cfs_schedulable_data { +	struct task_group *tg; +	u64 period, quota; +}; + +/* + * normalize group quota/period to be quota/max_period + * note: units are usecs + */ +static u64 normalize_cfs_quota(struct task_group *tg, +			       struct cfs_schedulable_data *d) +{ +	u64 quota, period; + +	if (tg == d->tg) { +		period = d->period; +		quota = d->quota; +	} else { +		period = tg_get_cfs_period(tg); +		quota = tg_get_cfs_quota(tg); +	} + +	/* note: these should typically be equivalent */ +	if (quota == RUNTIME_INF || quota == -1) +		return RUNTIME_INF; + +	return to_ratio(period, quota); +} + +static int tg_cfs_schedulable_down(struct task_group *tg, void *data) +{ +	struct cfs_schedulable_data *d = data; +	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; +	s64 quota = 0, parent_quota = -1; + +	if (!tg->parent) { +		quota = RUNTIME_INF; +	} else { +		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; + +		quota = normalize_cfs_quota(tg, d); +		parent_quota = parent_b->hierarchal_quota; + +		/* +		 * ensure max(child_quota) <= parent_quota, inherit when no +		 * limit is set +		 */ +		if (quota == RUNTIME_INF) +			quota = parent_quota; +		else if (parent_quota != RUNTIME_INF && quota > parent_quota) +			return -EINVAL; +	} +	cfs_b->hierarchal_quota = 
quota; + +	return 0; +} + +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) +{ +	int ret; +	struct cfs_schedulable_data data = { +		.tg = tg, +		.period = period, +		.quota = quota, +	}; + +	if (quota != RUNTIME_INF) { +		do_div(data.period, NSEC_PER_USEC); +		do_div(data.quota, NSEC_PER_USEC); +	} + +	rcu_read_lock(); +	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); +	rcu_read_unlock(); + +	return ret; +} + +static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, +		struct cgroup_map_cb *cb) +{ +	struct task_group *tg = cgroup_tg(cgrp); +	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + +	cb->fill(cb, "nr_periods", cfs_b->nr_periods); +	cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); +	cb->fill(cb, "throttled_time", cfs_b->throttled_time); + +	return 0; +} +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, +				s64 val) +{ +	return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); +} + +static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) +{ +	return sched_group_rt_runtime(cgroup_tg(cgrp)); +} + +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, +		u64 rt_period_us) +{ +	return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); +} + +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ +	return sched_group_rt_period(cgroup_tg(cgrp)); +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +static struct cftype cpu_files[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED +	{ +		.name = "shares", +		.read_u64 = cpu_shares_read_u64, +		.write_u64 = cpu_shares_write_u64, +	}, +#endif +#ifdef CONFIG_CFS_BANDWIDTH +	{ +		.name = "cfs_quota_us", +		.read_s64 = cpu_cfs_quota_read_s64, +		.write_s64 = cpu_cfs_quota_write_s64, +	}, +	{ +		.name = "cfs_period_us", +		.read_u64 = cpu_cfs_period_read_u64, +		.write_u64 = 
cpu_cfs_period_write_u64, +	}, +	{ +		.name = "stat", +		.read_map = cpu_stats_show, +	}, +#endif +#ifdef CONFIG_RT_GROUP_SCHED +	{ +		.name = "rt_runtime_us", +		.read_s64 = cpu_rt_runtime_read, +		.write_s64 = cpu_rt_runtime_write, +	}, +	{ +		.name = "rt_period_us", +		.read_u64 = cpu_rt_period_read_uint, +		.write_u64 = cpu_rt_period_write_uint, +	}, +#endif +}; + +static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ +	return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); +} + +struct cgroup_subsys cpu_cgroup_subsys = { +	.name		= "cpu", +	.create		= cpu_cgroup_create, +	.destroy	= cpu_cgroup_destroy, +	.can_attach	= cpu_cgroup_can_attach, +	.attach		= cpu_cgroup_attach, +	.exit		= cpu_cgroup_exit, +	.populate	= cpu_cgroup_populate, +	.subsys_id	= cpu_cgroup_subsys_id, +	.early_init	= 1, +}; + +#endif	/* CONFIG_CGROUP_SCHED */ + +#ifdef CONFIG_CGROUP_CPUACCT + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). 
+ */ + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_create( +	struct cgroup_subsys *ss, struct cgroup *cgrp) +{ +	struct cpuacct *ca; + +	if (!cgrp->parent) +		return &root_cpuacct.css; + +	ca = kzalloc(sizeof(*ca), GFP_KERNEL); +	if (!ca) +		goto out; + +	ca->cpuusage = alloc_percpu(u64); +	if (!ca->cpuusage) +		goto out_free_ca; + +	ca->cpustat = alloc_percpu(struct kernel_cpustat); +	if (!ca->cpustat) +		goto out_free_cpuusage; + +	return &ca->css; + +out_free_cpuusage: +	free_percpu(ca->cpuusage); +out_free_ca: +	kfree(ca); +out: +	return ERR_PTR(-ENOMEM); +} + +/* destroy an existing cpu accounting group */ +static void +cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ +	struct cpuacct *ca = cgroup_ca(cgrp); + +	free_percpu(ca->cpustat); +	free_percpu(ca->cpuusage); +	kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ +	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); +	u64 data; + +#ifndef CONFIG_64BIT +	/* +	 * Take rq->lock to make 64-bit read safe on 32-bit platforms. +	 */ +	raw_spin_lock_irq(&cpu_rq(cpu)->lock); +	data = *cpuusage; +	raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else +	data = *cpuusage; +#endif + +	return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ +	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT +	/* +	 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 
+	 */ +	raw_spin_lock_irq(&cpu_rq(cpu)->lock); +	*cpuusage = val; +	raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else +	*cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +{ +	struct cpuacct *ca = cgroup_ca(cgrp); +	u64 totalcpuusage = 0; +	int i; + +	for_each_present_cpu(i) +		totalcpuusage += cpuacct_cpuusage_read(ca, i); + +	return totalcpuusage; +} + +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, +								u64 reset) +{ +	struct cpuacct *ca = cgroup_ca(cgrp); +	int err = 0; +	int i; + +	if (reset) { +		err = -EINVAL; +		goto out; +	} + +	for_each_present_cpu(i) +		cpuacct_cpuusage_write(ca, i, 0); + +out: +	return err; +} + +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, +				   struct seq_file *m) +{ +	struct cpuacct *ca = cgroup_ca(cgroup); +	u64 percpu; +	int i; + +	for_each_present_cpu(i) { +		percpu = cpuacct_cpuusage_read(ca, i); +		seq_printf(m, "%llu ", (unsigned long long) percpu); +	} +	seq_printf(m, "\n"); +	return 0; +} + +static const char *cpuacct_stat_desc[] = { +	[CPUACCT_STAT_USER] = "user", +	[CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, +			      struct cgroup_map_cb *cb) +{ +	struct cpuacct *ca = cgroup_ca(cgrp); +	int cpu; +	s64 val = 0; + +	for_each_online_cpu(cpu) { +		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); +		val += kcpustat->cpustat[CPUTIME_USER]; +		val += kcpustat->cpustat[CPUTIME_NICE]; +	} +	val = cputime64_to_clock_t(val); +	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + +	val = 0; +	for_each_online_cpu(cpu) { +		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); +		val += kcpustat->cpustat[CPUTIME_SYSTEM]; +		val += kcpustat->cpustat[CPUTIME_IRQ]; +		val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; +	} + +	val = cputime64_to_clock_t(val); +	cb->fill(cb, 
cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + +	return 0; +} + +static struct cftype files[] = { +	{ +		.name = "usage", +		.read_u64 = cpuusage_read, +		.write_u64 = cpuusage_write, +	}, +	{ +		.name = "usage_percpu", +		.read_seq_string = cpuacct_percpu_seq_read, +	}, +	{ +		.name = "stat", +		.read_map = cpuacct_stats_show, +	}, +}; + +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ +	return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); +} + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. + */ +void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ +	struct cpuacct *ca; +	int cpu; + +	if (unlikely(!cpuacct_subsys.active)) +		return; + +	cpu = task_cpu(tsk); + +	rcu_read_lock(); + +	ca = task_ca(tsk); + +	for (; ca; ca = parent_ca(ca)) { +		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); +		*cpuusage += cputime; +	} + +	rcu_read_unlock(); +} + +struct cgroup_subsys cpuacct_subsys = { +	.name = "cpuacct", +	.create = cpuacct_create, +	.destroy = cpuacct_destroy, +	.populate = cpuacct_populate, +	.subsys_id = cpuacct_subsys_id, +}; +#endif	/* CONFIG_CGROUP_CPUACCT */ | 
