Diffstat (limited to 'kernel')
34 files changed, 767 insertions(+), 253 deletions(-)
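A large share of the per-cpu changes below replace address-based `__get_cpu_var()` accesses with the `this_cpu_*()` operations (`__this_cpu_read()`, `__this_cpu_write()`, `__this_cpu_inc()`, ...). As a minimal illustrative sketch of that pattern only — using a hypothetical per-cpu counter, not code taken from this diff:

```c
#include <linux/percpu.h>

/* hypothetical per-cpu counter, for illustration only */
static DEFINE_PER_CPU(unsigned long, demo_count);

static void demo_old_style(void)
{
	/* old style: take the per-cpu lvalue and operate on it */
	__get_cpu_var(demo_count)++;
}

static void demo_new_style(void)
{
	/* new style: a single per-cpu increment operation */
	__this_cpu_inc(demo_count);
}

static unsigned long demo_read(void)
{
	/* replaces reads done via __get_cpu_var(demo_count) */
	return __this_cpu_read(demo_count);
}
```

The `this_cpu_*()` forms let architectures emit a single instruction for the access where possible, which is why the conversions in `kernel/softirq.c`, `kernel/watchdog.c`, `kernel/kprobes.c` and others in this diff are mostly mechanical substitutions.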
diff --git a/kernel/Makefile b/kernel/Makefile index 0b5ff083fa2..353d3fe8ba3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o  obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o  obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o +obj-$(CONFIG_SMP) += smp.o  ifneq ($(CONFIG_SMP),y)  obj-y += up.o  endif @@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/  obj-$(CONFIG_TRACING) += trace/  obj-$(CONFIG_X86_DS) += trace/  obj-$(CONFIG_RING_BUFFER) += trace/ +obj-$(CONFIG_TRACEPOINTS) += trace/  obj-$(CONFIG_SMP) += sched_cpupri.o  obj-$(CONFIG_IRQ_WORK) += irq_work.o  obj-$(CONFIG_PERF_EVENTS) += perf_event.o @@ -121,7 +122,7 @@ $(obj)/configs.o: $(obj)/config_data.h  # config_data.h contains the same information as ikconfig.h but gzipped.  # Info from config_data can be extracted from /proc/config*  targets += config_data.gz -$(obj)/config_data.gz: .config FORCE +$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE  	$(call if_changed,gzip)  quiet_cmd_ikconfiggz = IKCFG   $@ diff --git a/kernel/exit.c b/kernel/exit.c index 676149a4ac5..f9a45ebcc7b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -69,7 +69,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)  		list_del_rcu(&p->tasks);  		list_del_init(&p->sibling); -		__get_cpu_var(process_counts)--; +		__this_cpu_dec(process_counts);  	}  	list_del_rcu(&p->thread_group);  } @@ -994,6 +994,15 @@ NORET_TYPE void do_exit(long code)  	exit_fs(tsk);  	check_stack_usage();  	exit_thread(); + +	/* +	 * Flush inherited counters to the parent - before the parent +	 * gets woken up by child-exit notifications. +	 * +	 * because of cgroup mode, must be called before cgroup_exit() +	 */ +	perf_event_exit_task(tsk); +  	cgroup_exit(tsk, 1);  	if (group_dead) @@ -1007,11 +1016,6 @@ NORET_TYPE void do_exit(long code)  	 * FIXME: do that only when needed, using sched_exit tracepoint  	 */  	flush_ptrace_hw_breakpoint(tsk); -	/* -	 * Flush inherited counters to the parent - before the parent -	 * gets woken up by child-exit notifications. 
-	 */ -	perf_event_exit_task(tsk);  	exit_notify(tsk, group_dead);  #ifdef CONFIG_NUMA diff --git a/kernel/fork.c b/kernel/fork.c index 7d164e25b0f..d9b44f20b6b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -169,15 +169,14 @@ EXPORT_SYMBOL(free_task);  static inline void free_signal_struct(struct signal_struct *sig)  {  	taskstats_tgid_free(sig); +	sched_autogroup_exit(sig);  	kmem_cache_free(signal_cachep, sig);  }  static inline void put_signal_struct(struct signal_struct *sig)  { -	if (atomic_dec_and_test(&sig->sigcnt)) { -		sched_autogroup_exit(sig); +	if (atomic_dec_and_test(&sig->sigcnt))  		free_signal_struct(sig); -	}  }  void __put_task_struct(struct task_struct *tsk) @@ -1286,7 +1285,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  			attach_pid(p, PIDTYPE_SID, task_session(current));  			list_add_tail(&p->sibling, &p->real_parent->children);  			list_add_tail_rcu(&p->tasks, &init_task.tasks); -			__get_cpu_var(process_counts)++; +			__this_cpu_inc(process_counts);  		}  		attach_pid(p, PIDTYPE_PID, pid);  		nr_threads++; @@ -1318,7 +1317,7 @@ bad_fork_cleanup_mm:  	}  bad_fork_cleanup_signal:  	if (!(clone_flags & CLONE_THREAD)) -		put_signal_struct(p->signal); +		free_signal_struct(p->signal);  bad_fork_cleanup_sighand:  	__cleanup_sighand(p->sighand);  bad_fork_cleanup_fs: diff --git a/kernel/freezer.c b/kernel/freezer.c index bd1d42b17cb..66ecd2ead21 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)  	}  	if (should_send_signal(p)) { -		if (!signal_pending(p)) -			fake_signal_wake_up(p); +		fake_signal_wake_up(p); +		/* +		 * fake_signal_wake_up() goes through p's scheduler +		 * lock and guarantees that TASK_STOPPED/TRACED -> +		 * TASK_RUNNING transition can't race with task state +		 * testing in try_to_freeze_tasks(). +		 */  	} else if (sig_only) {  		return false;  	} else { diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f2429fc3438..45da2b6920a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -497,7 +497,7 @@ static inline int hrtimer_is_hres_enabled(void)   */  static inline int hrtimer_hres_active(void)  { -	return __get_cpu_var(hrtimer_bases).hres_active; +	return __this_cpu_read(hrtimer_bases.hres_active);  }  /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 91a5fa25054..0caa59f747d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -577,7 +577,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }   */  static int irq_thread(void *data)  { -	static struct sched_param param = { +	static const struct sched_param param = {  		.sched_priority = MAX_USER_RT_PRIO/2,  	};  	struct irqaction *action = data; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 90f881904bb..c58fa7da8ae 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -77,21 +77,21 @@ void __weak arch_irq_work_raise(void)   */  static void __irq_work_queue(struct irq_work *entry)  { -	struct irq_work **head, *next; +	struct irq_work *next; -	head = &get_cpu_var(irq_work_list); +	preempt_disable();  	do { -		next = *head; +		next = __this_cpu_read(irq_work_list);  		/* Can assign non-atomic because we keep the flags set. */  		entry->next = next_flags(next, IRQ_WORK_FLAGS); -	} while (cmpxchg(head, next, entry) != next); +	} while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);  	/* The list was empty, raise self-interrupt to start processing. 
*/  	if (!irq_work_next(entry))  		arch_irq_work_raise(); -	put_cpu_var(irq_work_list); +	preempt_enable();  }  /* @@ -120,16 +120,16 @@ EXPORT_SYMBOL_GPL(irq_work_queue);   */  void irq_work_run(void)  { -	struct irq_work *list, **head; +	struct irq_work *list; -	head = &__get_cpu_var(irq_work_list); -	if (*head == NULL) +	if (this_cpu_read(irq_work_list) == NULL)  		return;  	BUG_ON(!in_irq());  	BUG_ON(!irqs_disabled()); -	list = xchg(head, NULL); +	list = this_cpu_xchg(irq_work_list, NULL); +  	while (list != NULL) {  		struct irq_work *entry = list; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7663e5df0e6..77981813a1e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -317,12 +317,12 @@ void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty)  /* We have preemption disabled.. so it is safe to use __ versions */  static inline void set_kprobe_instance(struct kprobe *kp)  { -	__get_cpu_var(kprobe_instance) = kp; +	__this_cpu_write(kprobe_instance, kp);  }  static inline void reset_kprobe_instance(void)  { -	__get_cpu_var(kprobe_instance) = NULL; +	__this_cpu_write(kprobe_instance, NULL);  }  /* @@ -965,7 +965,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,  static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,  					int trapnr)  { -	struct kprobe *cur = __get_cpu_var(kprobe_instance); +	struct kprobe *cur = __this_cpu_read(kprobe_instance);  	/*  	 * if we faulted "during" the execution of a user specified @@ -980,7 +980,7 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,  static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)  { -	struct kprobe *cur = __get_cpu_var(kprobe_instance); +	struct kprobe *cur = __this_cpu_read(kprobe_instance);  	int ret = 0;  	if (cur && cur->break_handler) { diff --git a/kernel/kthread.c b/kernel/kthread.c index 5355cfd44a3..c55afba990a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),  	wait_for_completion(&create.done);  	if (!IS_ERR(create.result)) { -		static struct sched_param param = { .sched_priority = 0 }; +		static const struct sched_param param = { .sched_priority = 0 };  		va_list args;  		va_start(args, namefmt); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 17110a4a4fc..ee74b35e528 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -241,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v)  	seq_puts(m, "Latency Top version : v0.1\n");  	for (i = 0; i < MAXLR; i++) { -		if (latency_record[i].backtrace[0]) { +		struct latency_record *lr = &latency_record[i]; + +		if (lr->backtrace[0]) {  			int q; -			seq_printf(m, "%i %lu %lu ", -				latency_record[i].count, -				latency_record[i].time, -				latency_record[i].max); +			seq_printf(m, "%i %lu %lu", +				   lr->count, lr->time, lr->max);  			for (q = 0; q < LT_BACKTRACEDEPTH; q++) { -				char sym[KSYM_SYMBOL_LEN]; -				char *c; -				if (!latency_record[i].backtrace[q]) +				unsigned long bt = lr->backtrace[q]; +				if (!bt)  					break; -				if (latency_record[i].backtrace[q] == ULONG_MAX) +				if (bt == ULONG_MAX)  					break; -				sprint_symbol(sym, latency_record[i].backtrace[q]); -				c = strchr(sym, '+'); -				if (c) -					*c = 0; -				seq_printf(m, "%s ", sym); +				seq_printf(m, " %ps", (void *)bt);  			}  			seq_printf(m, "\n");  		} diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 11847bf1e8c..b782b7a79f0 
100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -38,6 +38,12 @@  #include <asm/irq_regs.h> +enum event_type_t { +	EVENT_FLEXIBLE = 0x1, +	EVENT_PINNED = 0x2, +	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, +}; +  atomic_t perf_task_events __read_mostly;  static atomic_t nr_mmap_events __read_mostly;  static atomic_t nr_comm_events __read_mostly; @@ -65,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;  static atomic64_t perf_event_id; +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, +			      enum event_type_t event_type); + +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, +			     enum event_type_t event_type); +  void __weak perf_event_print_debug(void)	{ }  extern __weak const char *perf_pmu_name(void) @@ -72,6 +84,11 @@ extern __weak const char *perf_pmu_name(void)  	return "pmu";  } +static inline u64 perf_clock(void) +{ +	return local_clock(); +} +  void perf_pmu_disable(struct pmu *pmu)  {  	int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -240,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)  	put_ctx(ctx);  } -static inline u64 perf_clock(void) -{ -	return local_clock(); -} -  /*   * Update the record of the current time in a context.   */ @@ -256,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx)  	ctx->timestamp = now;  } +static u64 perf_event_time(struct perf_event *event) +{ +	struct perf_event_context *ctx = event->ctx; +	return ctx ? ctx->time : 0; +} +  /*   * Update the total_time_enabled and total_time_running fields for a event.   */ @@ -269,7 +287,7 @@ static void update_event_times(struct perf_event *event)  		return;  	if (ctx->is_active) -		run_end = ctx->time; +		run_end = perf_event_time(event);  	else  		run_end = event->tstamp_stopped; @@ -278,7 +296,7 @@ static void update_event_times(struct perf_event *event)  	if (event->state == PERF_EVENT_STATE_INACTIVE)  		run_end = event->tstamp_stopped;  	else -		run_end = ctx->time; +		run_end = perf_event_time(event);  	event->total_time_running = run_end - event->tstamp_running;  } @@ -534,6 +552,7 @@ event_sched_out(struct perf_event *event,  		  struct perf_cpu_context *cpuctx,  		  struct perf_event_context *ctx)  { +	u64 tstamp = perf_event_time(event);  	u64 delta;  	/*  	 * An event which could not be activated because of @@ -545,7 +564,7 @@ event_sched_out(struct perf_event *event,  	    && !event_filter_match(event)) {  		delta = ctx->time - event->tstamp_stopped;  		event->tstamp_running += delta; -		event->tstamp_stopped = ctx->time; +		event->tstamp_stopped = tstamp;  	}  	if (event->state != PERF_EVENT_STATE_ACTIVE) @@ -556,7 +575,7 @@ event_sched_out(struct perf_event *event,  		event->pending_disable = 0;  		event->state = PERF_EVENT_STATE_OFF;  	} -	event->tstamp_stopped = ctx->time; +	event->tstamp_stopped = tstamp;  	event->pmu->del(event, 0);  	event->oncpu = -1; @@ -768,6 +787,8 @@ event_sched_in(struct perf_event *event,  		 struct perf_cpu_context *cpuctx,  		 struct perf_event_context *ctx)  { +	u64 tstamp = perf_event_time(event); +  	if (event->state <= PERF_EVENT_STATE_OFF)  		return 0; @@ -784,9 +805,9 @@ event_sched_in(struct perf_event *event,  		return -EAGAIN;  	} -	event->tstamp_running += ctx->time - event->tstamp_stopped; +	event->tstamp_running += tstamp - event->tstamp_stopped; -	event->shadow_ctx_time = ctx->time - ctx->timestamp; +	event->shadow_ctx_time = tstamp - ctx->timestamp;  	if (!is_software_event(event))  		cpuctx->active_oncpu++; @@ -898,11 +919,13 @@ 
static int group_can_go_on(struct perf_event *event,  static void add_event_to_ctx(struct perf_event *event,  			       struct perf_event_context *ctx)  { +	u64 tstamp = perf_event_time(event); +  	list_add_event(event, ctx);  	perf_group_attach(event); -	event->tstamp_enabled = ctx->time; -	event->tstamp_running = ctx->time; -	event->tstamp_stopped = ctx->time; +	event->tstamp_enabled = tstamp; +	event->tstamp_running = tstamp; +	event->tstamp_stopped = tstamp;  }  /* @@ -937,7 +960,7 @@ static void __perf_install_in_context(void *info)  	add_event_to_ctx(event, ctx); -	if (event->cpu != -1 && event->cpu != smp_processor_id()) +	if (!event_filter_match(event))  		goto unlock;  	/* @@ -1042,14 +1065,13 @@ static void __perf_event_mark_enabled(struct perf_event *event,  					struct perf_event_context *ctx)  {  	struct perf_event *sub; +	u64 tstamp = perf_event_time(event);  	event->state = PERF_EVENT_STATE_INACTIVE; -	event->tstamp_enabled = ctx->time - event->total_time_enabled; +	event->tstamp_enabled = tstamp - event->total_time_enabled;  	list_for_each_entry(sub, &event->sibling_list, group_entry) { -		if (sub->state >= PERF_EVENT_STATE_INACTIVE) { -			sub->tstamp_enabled = -				ctx->time - sub->total_time_enabled; -		} +		if (sub->state >= PERF_EVENT_STATE_INACTIVE) +			sub->tstamp_enabled = tstamp - sub->total_time_enabled;  	}  } @@ -1082,7 +1104,7 @@ static void __perf_event_enable(void *info)  		goto unlock;  	__perf_event_mark_enabled(event, ctx); -	if (event->cpu != -1 && event->cpu != smp_processor_id()) +	if (!event_filter_match(event))  		goto unlock;  	/* @@ -1193,12 +1215,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)  	return 0;  } -enum event_type_t { -	EVENT_FLEXIBLE = 0x1, -	EVENT_PINNED = 0x2, -	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, -}; -  static void ctx_sched_out(struct perf_event_context *ctx,  			  struct perf_cpu_context *cpuctx,  			  enum event_type_t event_type) @@ -1435,7 +1451,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,  	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {  		if (event->state <= PERF_EVENT_STATE_OFF)  			continue; -		if (event->cpu != -1 && event->cpu != smp_processor_id()) +		if (!event_filter_match(event))  			continue;  		if (group_can_go_on(event, cpuctx, 1)) @@ -1467,7 +1483,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,  		 * Listen to the 'cpu' scheduling filter constraint  		 * of events:  		 */ -		if (event->cpu != -1 && event->cpu != smp_processor_id()) +		if (!event_filter_match(event))  			continue;  		if (group_can_go_on(event, cpuctx, can_add_hw)) { @@ -1694,7 +1710,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)  		if (event->state != PERF_EVENT_STATE_ACTIVE)  			continue; -		if (event->cpu != -1 && event->cpu != smp_processor_id()) +		if (!event_filter_match(event))  			continue;  		hwc = &event->hw; @@ -3893,7 +3909,7 @@ static int perf_event_task_match(struct perf_event *event)  	if (event->state < PERF_EVENT_STATE_INACTIVE)  		return 0; -	if (event->cpu != -1 && event->cpu != smp_processor_id()) +	if (!event_filter_match(event))  		return 0;  	if (event->attr.comm || event->attr.mmap || @@ -4030,7 +4046,7 @@ static int perf_event_comm_match(struct perf_event *event)  	if (event->state < PERF_EVENT_STATE_INACTIVE)  		return 0; -	if (event->cpu != -1 && event->cpu != smp_processor_id()) +	if (!event_filter_match(event))  		return 0;  	if (event->attr.comm) @@ -4178,7 +4194,7 @@ static int perf_event_mmap_match(struct 
perf_event *event,  	if (event->state < PERF_EVENT_STATE_INACTIVE)  		return 0; -	if (event->cpu != -1 && event->cpu != smp_processor_id()) +	if (!event_filter_match(event))  		return 0;  	if ((!executable && event->attr.mmap_data) || diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f9063c6b185..b75597235d8 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,7 +1,4 @@ - -ifeq ($(CONFIG_PM_DEBUG),y) -EXTRA_CFLAGS	+=	-DDEBUG -endif +ccflags-$(CONFIG_PM_DEBUG)	:=	-DDEBUG  obj-$(CONFIG_PM)		+= main.o  obj-$(CONFIG_PM_SLEEP)		+= console.o diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 048d0b51483..870f72bc72a 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -62,7 +62,7 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops)  {  	if (ops && !(ops->begin && ops->end &&  ops->pre_snapshot  	    && ops->prepare && ops->finish && ops->enter && ops->pre_restore -	    && ops->restore_cleanup)) { +	    && ops->restore_cleanup && ops->leave)) {  		WARN_ON(1);  		return;  	} @@ -278,7 +278,7 @@ static int create_image(int platform_mode)  		goto Enable_irqs;  	} -	if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) +	if (hibernation_test(TEST_CORE) || pm_wakeup_pending())  		goto Power_up;  	in_suspend = 1; @@ -516,7 +516,7 @@ int hibernation_platform_enter(void)  	local_irq_disable();  	sysdev_suspend(PMSG_HIBERNATE); -	if (!pm_check_wakeup_events()) { +	if (pm_wakeup_pending()) {  		error = -EAGAIN;  		goto Power_up;  	} @@ -647,6 +647,7 @@ int hibernate(void)  		swsusp_free();  		if (!error)  			power_down(); +		in_suspend = 0;  		pm_restore_gfp_mask();  	} else {  		pr_debug("PM: Image restored successfully.\n"); diff --git a/kernel/power/process.c b/kernel/power/process.c index e50b4c1b2a0..d6d2a10320e 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only)  			 * perturb a task in TASK_STOPPED or TASK_TRACED.  			 * It is "frozen enough".  If the task does wake  			 * up, it will immediately call try_to_freeze. +			 * +			 * Because freeze_task() goes through p's +			 * scheduler lock after setting TIF_FREEZE, it's +			 * guaranteed that either we see TASK_RUNNING or +			 * try_to_stop() after schedule() in ptrace/signal +			 * stop sees TIF_FREEZE.  			 */  			if (!task_is_stopped_or_traced(p) &&  			    !freezer_should_skip(p)) @@ -79,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only)  		if (!todo || time_after(jiffies, end_time))  			break; -		if (!pm_check_wakeup_events()) { +		if (pm_wakeup_pending()) {  			wakeup = true;  			break;  		} diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 031d5e3a619..8850df68794 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -164,7 +164,7 @@ static int suspend_enter(suspend_state_t state)  	error = sysdev_suspend(PMSG_SUSPEND);  	if (!error) { -		if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { +		if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {  			error = suspend_ops->enter(state);  			events_check_enabled = false;  		} diff --git a/kernel/printk.c b/kernel/printk.c index 4642a5c439e..53d9a9ec88e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -39,6 +39,7 @@  #include <linux/syslog.h>  #include <linux/cpu.h>  #include <linux/notifier.h> +#include <linux/rculist.h>  #include <asm/uaccess.h> @@ -273,12 +274,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)  	 * at open time.  	 
*/  	if (type == SYSLOG_ACTION_OPEN || !from_file) { -		if (dmesg_restrict && !capable(CAP_SYS_ADMIN)) -			return -EPERM; +		if (dmesg_restrict && !capable(CAP_SYSLOG)) +			goto warn; /* switch to return -EPERM after 2.6.39 */  		if ((type != SYSLOG_ACTION_READ_ALL &&  		     type != SYSLOG_ACTION_SIZE_BUFFER) && -		    !capable(CAP_SYS_ADMIN)) -			return -EPERM; +		    !capable(CAP_SYSLOG)) +			goto warn; /* switch to return -EPERM after 2.6.39 */  	}  	error = security_syslog(type); @@ -422,6 +423,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)  	}  out:  	return error; +warn: +	/* remove after 2.6.39 */ +	if (capable(CAP_SYS_ADMIN)) +		WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " +		  "but no CAP_SYSLOG (deprecated and denied).\n"); +	return -EPERM;  }  SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) @@ -1496,7 +1503,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper)  	/* Don't allow registering multiple times */  	if (!dumper->registered) {  		dumper->registered = 1; -		list_add_tail(&dumper->list, &dump_list); +		list_add_tail_rcu(&dumper->list, &dump_list);  		err = 0;  	}  	spin_unlock_irqrestore(&dump_list_lock, flags); @@ -1520,29 +1527,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)  	spin_lock_irqsave(&dump_list_lock, flags);  	if (dumper->registered) {  		dumper->registered = 0; -		list_del(&dumper->list); +		list_del_rcu(&dumper->list);  		err = 0;  	}  	spin_unlock_irqrestore(&dump_list_lock, flags); +	synchronize_rcu();  	return err;  }  EXPORT_SYMBOL_GPL(kmsg_dump_unregister); -static const char * const kmsg_reasons[] = { -	[KMSG_DUMP_OOPS]	= "oops", -	[KMSG_DUMP_PANIC]	= "panic", -	[KMSG_DUMP_KEXEC]	= "kexec", -}; - -static const char *kmsg_to_str(enum kmsg_dump_reason reason) -{ -	if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0) -		return "unknown"; - -	return kmsg_reasons[reason]; -} -  /**   * kmsg_dump - dump kernel log to kernel message dumpers.   * @reason: the reason (oops, panic etc) for dumping @@ -1581,13 +1575,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)  		l2 = chars;  	} -	if (!spin_trylock_irqsave(&dump_list_lock, flags)) { -		printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", -				kmsg_to_str(reason)); -		return; -	} -	list_for_each_entry(dumper, &dump_list, list) +	rcu_read_lock(); +	list_for_each_entry_rcu(dumper, &dump_list, list)  		dumper->dump(dumper, reason, s1, l1, s2, l2); -	spin_unlock_irqrestore(&dump_list_lock, flags); +	rcu_read_unlock();  }  #endif diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d0ddfea6579..dd4aea806f8 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -364,8 +364,8 @@ void rcu_irq_exit(void)  	WARN_ON_ONCE(rdtp->dynticks & 0x1);  	/* If the interrupt queued a callback, get out of dyntick mode. */ -	if (__get_cpu_var(rcu_sched_data).nxtlist || -	    __get_cpu_var(rcu_bh_data).nxtlist) +	if (__this_cpu_read(rcu_sched_data.nxtlist) || +	    __this_cpu_read(rcu_bh_data.nxtlist))  		set_need_resched();  } diff --git a/kernel/sched.c b/kernel/sched.c index 04949089e76..a0eb0941fa8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -278,14 +278,12 @@ struct task_group {  #endif  }; -#define root_task_group init_task_group -  /* task_group_lock serializes the addition/removal of task groups */  static DEFINE_SPINLOCK(task_group_lock);  #ifdef CONFIG_FAIR_GROUP_SCHED -# define INIT_TASK_GROUP_LOAD	NICE_0_LOAD +# define ROOT_TASK_GROUP_LOAD	NICE_0_LOAD  /*   * A weight of 0 or 1 can cause arithmetics problems. 
@@ -298,13 +296,13 @@ static DEFINE_SPINLOCK(task_group_lock);  #define MIN_SHARES	2  #define MAX_SHARES	(1UL << 18) -static int init_task_group_load = INIT_TASK_GROUP_LOAD; +static int root_task_group_load = ROOT_TASK_GROUP_LOAD;  #endif  /* Default task group.   *	Every task in system belong to this group at bootup.   */ -struct task_group init_task_group; +struct task_group root_task_group;  #endif	/* CONFIG_CGROUP_SCHED */ @@ -743,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,  	buf[cnt] = 0;  	cmp = strstrip(buf); -	if (strncmp(buf, "NO_", 3) == 0) { +	if (strncmp(cmp, "NO_", 3) == 0) {  		neg = 1;  		cmp += 3;  	} @@ -7848,7 +7846,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,  	cfs_rq->tg = tg;  	tg->se[cpu] = se; -	/* se could be NULL for init_task_group */ +	/* se could be NULL for root_task_group */  	if (!se)  		return; @@ -7908,18 +7906,18 @@ void __init sched_init(void)  		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);  #ifdef CONFIG_FAIR_GROUP_SCHED -		init_task_group.se = (struct sched_entity **)ptr; +		root_task_group.se = (struct sched_entity **)ptr;  		ptr += nr_cpu_ids * sizeof(void **); -		init_task_group.cfs_rq = (struct cfs_rq **)ptr; +		root_task_group.cfs_rq = (struct cfs_rq **)ptr;  		ptr += nr_cpu_ids * sizeof(void **);  #endif /* CONFIG_FAIR_GROUP_SCHED */  #ifdef CONFIG_RT_GROUP_SCHED -		init_task_group.rt_se = (struct sched_rt_entity **)ptr; +		root_task_group.rt_se = (struct sched_rt_entity **)ptr;  		ptr += nr_cpu_ids * sizeof(void **); -		init_task_group.rt_rq = (struct rt_rq **)ptr; +		root_task_group.rt_rq = (struct rt_rq **)ptr;  		ptr += nr_cpu_ids * sizeof(void **);  #endif /* CONFIG_RT_GROUP_SCHED */ @@ -7939,13 +7937,13 @@ void __init sched_init(void)  			global_rt_period(), global_rt_runtime());  #ifdef CONFIG_RT_GROUP_SCHED -	init_rt_bandwidth(&init_task_group.rt_bandwidth, +	init_rt_bandwidth(&root_task_group.rt_bandwidth,  			global_rt_period(), global_rt_runtime());  #endif /* CONFIG_RT_GROUP_SCHED */  #ifdef CONFIG_CGROUP_SCHED -	list_add(&init_task_group.list, &task_groups); -	INIT_LIST_HEAD(&init_task_group.children); +	list_add(&root_task_group.list, &task_groups); +	INIT_LIST_HEAD(&root_task_group.children);  	autogroup_init(&init_task);  #endif /* CONFIG_CGROUP_SCHED */ @@ -7960,34 +7958,34 @@ void __init sched_init(void)  		init_cfs_rq(&rq->cfs, rq);  		init_rt_rq(&rq->rt, rq);  #ifdef CONFIG_FAIR_GROUP_SCHED -		init_task_group.shares = init_task_group_load; +		root_task_group.shares = root_task_group_load;  		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);  		/* -		 * How much cpu bandwidth does init_task_group get? +		 * How much cpu bandwidth does root_task_group get?  		 *  		 * In case of task-groups formed thr' the cgroup filesystem, it  		 * gets 100% of the cpu resources in the system. This overall  		 * system cpu resource is divided among the tasks of -		 * init_task_group and its child task-groups in a fair manner, +		 * root_task_group and its child task-groups in a fair manner,  		 * based on each entity's (task or task-group's) weight  		 * (se->load.weight).  		 
* -		 * In other words, if init_task_group has 10 tasks of weight +		 * In other words, if root_task_group has 10 tasks of weight  		 * 1024) and two child groups A0 and A1 (of weight 1024 each),  		 * then A0's share of the cpu resource is:  		 *  		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%  		 * -		 * We achieve this by letting init_task_group's tasks sit -		 * directly in rq->cfs (i.e init_task_group->se[] = NULL). +		 * We achieve this by letting root_task_group's tasks sit +		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).  		 */ -		init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL); +		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);  #endif /* CONFIG_FAIR_GROUP_SCHED */  		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;  #ifdef CONFIG_RT_GROUP_SCHED  		INIT_LIST_HEAD(&rq->leaf_rt_rq_list); -		init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL); +		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);  #endif  		for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -8379,6 +8377,7 @@ static void free_sched_group(struct task_group *tg)  {  	free_fair_sched_group(tg);  	free_rt_sched_group(tg); +	autogroup_free(tg);  	kfree(tg);  } @@ -8812,7 +8811,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)  	if (!cgrp->parent) {  		/* This is early initialization for the top cgroup */ -		return &init_task_group.css; +		return &root_task_group.css;  	}  	parent = cgroup_tg(cgrp->parent); diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index c80fedcd476..32a723b8f84 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -9,10 +9,10 @@ unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;  static struct autogroup autogroup_default;  static atomic_t autogroup_seq_nr; -static void autogroup_init(struct task_struct *init_task) +static void __init autogroup_init(struct task_struct *init_task)  { -	autogroup_default.tg = &init_task_group; -	init_task_group.autogroup = &autogroup_default; +	autogroup_default.tg = &root_task_group; +	root_task_group.autogroup = &autogroup_default;  	kref_init(&autogroup_default.kref);  	init_rwsem(&autogroup_default.lock);  	init_task->signal->autogroup = &autogroup_default; @@ -63,7 +63,7 @@ static inline struct autogroup *autogroup_create(void)  	if (!ag)  		goto out_fail; -	tg = sched_create_group(&init_task_group); +	tg = sched_create_group(&root_task_group);  	if (IS_ERR(tg))  		goto out_free; diff --git a/kernel/smp.c b/kernel/smp.c index 12ed8b013e2..4ec30e06998 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -13,6 +13,7 @@  #include <linux/smp.h>  #include <linux/cpu.h> +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS  static struct {  	struct list_head	queue;  	raw_spinlock_t		lock; @@ -529,3 +530,21 @@ void ipi_call_unlock_irq(void)  {  	raw_spin_unlock_irq(&call_function.lock);  } +#endif /* USE_GENERIC_SMP_HELPERS */ + +/* + * Call a function on all processors + */ +int on_each_cpu(void (*func) (void *info), void *info, int wait) +{ +	int ret = 0; + +	preempt_disable(); +	ret = smp_call_function(func, info, wait); +	local_irq_disable(); +	func(info); +	local_irq_enable(); +	preempt_enable(); +	return ret; +} +EXPORT_SYMBOL(on_each_cpu); diff --git a/kernel/softirq.c b/kernel/softirq.c index d4d918a9188..68eb5efec38 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -70,7 +70,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {  static void wakeup_softirqd(void)  {  	/* Interrupts are disabled: no need to stop preemption */ -	struct task_struct *tsk = 
__get_cpu_var(ksoftirqd); +	struct task_struct *tsk = __this_cpu_read(ksoftirqd);  	if (tsk && tsk->state != TASK_RUNNING)  		wake_up_process(tsk); @@ -388,8 +388,8 @@ void __tasklet_schedule(struct tasklet_struct *t)  	local_irq_save(flags);  	t->next = NULL; -	*__get_cpu_var(tasklet_vec).tail = t; -	__get_cpu_var(tasklet_vec).tail = &(t->next); +	*__this_cpu_read(tasklet_vec.tail) = t; +	__this_cpu_write(tasklet_vec.tail, &(t->next));  	raise_softirq_irqoff(TASKLET_SOFTIRQ);  	local_irq_restore(flags);  } @@ -402,8 +402,8 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)  	local_irq_save(flags);  	t->next = NULL; -	*__get_cpu_var(tasklet_hi_vec).tail = t; -	__get_cpu_var(tasklet_hi_vec).tail = &(t->next); +	*__this_cpu_read(tasklet_hi_vec.tail) = t; +	__this_cpu_write(tasklet_hi_vec.tail,  &(t->next));  	raise_softirq_irqoff(HI_SOFTIRQ);  	local_irq_restore(flags);  } @@ -414,8 +414,8 @@ void __tasklet_hi_schedule_first(struct tasklet_struct *t)  {  	BUG_ON(!irqs_disabled()); -	t->next = __get_cpu_var(tasklet_hi_vec).head; -	__get_cpu_var(tasklet_hi_vec).head = t; +	t->next = __this_cpu_read(tasklet_hi_vec.head); +	__this_cpu_write(tasklet_hi_vec.head, t);  	__raise_softirq_irqoff(HI_SOFTIRQ);  } @@ -426,9 +426,9 @@ static void tasklet_action(struct softirq_action *a)  	struct tasklet_struct *list;  	local_irq_disable(); -	list = __get_cpu_var(tasklet_vec).head; -	__get_cpu_var(tasklet_vec).head = NULL; -	__get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; +	list = __this_cpu_read(tasklet_vec.head); +	__this_cpu_write(tasklet_vec.head, NULL); +	__this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);  	local_irq_enable();  	while (list) { @@ -449,8 +449,8 @@ static void tasklet_action(struct softirq_action *a)  		local_irq_disable();  		t->next = NULL; -		*__get_cpu_var(tasklet_vec).tail = t; -		__get_cpu_var(tasklet_vec).tail = &(t->next); +		*__this_cpu_read(tasklet_vec.tail) = t; +		__this_cpu_write(tasklet_vec.tail, &(t->next));  		__raise_softirq_irqoff(TASKLET_SOFTIRQ);  		local_irq_enable();  	} @@ -461,9 +461,9 @@ static void tasklet_hi_action(struct softirq_action *a)  	struct tasklet_struct *list;  	local_irq_disable(); -	list = __get_cpu_var(tasklet_hi_vec).head; -	__get_cpu_var(tasklet_hi_vec).head = NULL; -	__get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; +	list = __this_cpu_read(tasklet_hi_vec.head); +	__this_cpu_write(tasklet_hi_vec.head, NULL); +	__this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);  	local_irq_enable();  	while (list) { @@ -484,8 +484,8 @@ static void tasklet_hi_action(struct softirq_action *a)  		local_irq_disable();  		t->next = NULL; -		*__get_cpu_var(tasklet_hi_vec).tail = t; -		__get_cpu_var(tasklet_hi_vec).tail = &(t->next); +		*__this_cpu_read(tasklet_hi_vec.tail) = t; +		__this_cpu_write(tasklet_hi_vec.tail, &(t->next));  		__raise_softirq_irqoff(HI_SOFTIRQ);  		local_irq_enable();  	} @@ -802,16 +802,16 @@ static void takeover_tasklets(unsigned int cpu)  	/* Find end, append list for that CPU. 
*/  	if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { -		*(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; -		__get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; +		*__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; +		this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);  		per_cpu(tasklet_vec, cpu).head = NULL;  		per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;  	}  	raise_softirq_irqoff(TASKLET_SOFTIRQ);  	if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { -		*__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; -		__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; +		*__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head; +		__this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);  		per_cpu(tasklet_hi_vec, cpu).head = NULL;  		per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;  	} @@ -853,7 +853,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,  			     cpumask_any(cpu_online_mask));  	case CPU_DEAD:  	case CPU_DEAD_FROZEN: { -		static struct sched_param param = { +		static const struct sched_param param = {  			.sched_priority = MAX_RT_PRIO-1  		}; @@ -885,25 +885,6 @@ static __init int spawn_ksoftirqd(void)  }  early_initcall(spawn_ksoftirqd); -#ifdef CONFIG_SMP -/* - * Call a function on all processors - */ -int on_each_cpu(void (*func) (void *info), void *info, int wait) -{ -	int ret = 0; - -	preempt_disable(); -	ret = smp_call_function(func, info, wait); -	local_irq_disable(); -	func(info); -	local_irq_enable(); -	preempt_enable(); -	return ret; -} -EXPORT_SYMBOL(on_each_cpu); -#endif -  /*   * [ These __weak aliases are kept in a separate compilation unit, so that   *   GCC does not inline them incorrectly. 
] diff --git a/kernel/sys.c b/kernel/sys.c index 2745dcdb6c6..31b71a276b4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -43,6 +43,8 @@  #include <linux/kprobes.h>  #include <linux/user_namespace.h> +#include <linux/kmsg_dump.h> +  #include <asm/uaccess.h>  #include <asm/io.h>  #include <asm/unistd.h> @@ -285,6 +287,7 @@ out_unlock:   */  void emergency_restart(void)  { +	kmsg_dump(KMSG_DUMP_EMERG);  	machine_emergency_restart();  }  EXPORT_SYMBOL_GPL(emergency_restart); @@ -312,6 +315,7 @@ void kernel_restart(char *cmd)  		printk(KERN_EMERG "Restarting system.\n");  	else  		printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); +	kmsg_dump(KMSG_DUMP_RESTART);  	machine_restart(cmd);  }  EXPORT_SYMBOL_GPL(kernel_restart); @@ -333,6 +337,7 @@ void kernel_halt(void)  	kernel_shutdown_prepare(SYSTEM_HALT);  	sysdev_shutdown();  	printk(KERN_EMERG "System halted.\n"); +	kmsg_dump(KMSG_DUMP_HALT);  	machine_halt();  } @@ -351,6 +356,7 @@ void kernel_power_off(void)  	disable_nonboot_cpus();  	sysdev_shutdown();  	printk(KERN_EMERG "Power down.\n"); +	kmsg_dump(KMSG_DUMP_POWEROFF);  	machine_power_off();  }  EXPORT_SYMBOL_GPL(kernel_power_off); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae5cbb1e3ce..bc86bb32e12 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -24,6 +24,7 @@  #include <linux/slab.h>  #include <linux/sysctl.h>  #include <linux/signal.h> +#include <linux/printk.h>  #include <linux/proc_fs.h>  #include <linux/security.h>  #include <linux/ctype.h> @@ -245,10 +246,6 @@ static struct ctl_table root_table[] = {  		.mode		= 0555,  		.child		= dev_table,  	}, -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */  	{ }  }; @@ -710,6 +707,15 @@ static struct ctl_table kern_table[] = {  		.extra1		= &zero,  		.extra2		= &one,  	}, +	{ +		.procname	= "kptr_restrict", +		.data		= &kptr_restrict, +		.maxlen		= sizeof(int), +		.mode		= 0644, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero, +		.extra2		= &two, +	},  #endif  	{  		.procname	= "ngroups_max", @@ -962,10 +968,6 @@ static struct ctl_table kern_table[] = {  		.proc_handler	= proc_dointvec,  	},  #endif -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */  	{ }  }; @@ -1326,11 +1328,6 @@ static struct ctl_table vm_table[] = {  		.extra2		= &one,  	},  #endif - -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */  	{ }  }; @@ -1486,10 +1483,6 @@ static struct ctl_table fs_table[] = {  		.proc_handler	= &pipe_proc_fn,  		.extra1		= &pipe_min_size,  	}, -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */  	{ }  }; @@ -2899,7 +2892,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,  	}  } -#else /* CONFIG_PROC_FS */ +#else /* CONFIG_PROC_SYSCTL */  int proc_dostring(struct ctl_table *table, int write,  		  void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2951,7 +2944,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,  } -#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_PROC_SYSCTL */  /*   * No sense putting this after each symbol definition, twice, diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 3308fd7f1b5..3971c6b9d58 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -89,8 +89,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,  		
return -ENOMEM;  	if (!info) { -		int seq = get_cpu_var(taskstats_seqnum)++; -		put_cpu_var(taskstats_seqnum); +		int seq = this_cpu_inc_return(taskstats_seqnum) - 1;  		reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);  	} else @@ -349,7 +348,7 @@ static int parse(struct nlattr *na, struct cpumask *mask)  	return ret;  } -#ifdef CONFIG_IA64 +#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)  #define TASKSTATS_NEEDS_PADDING 1  #endif @@ -612,7 +611,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)  		fill_tgid_exit(tsk);  	} -	listeners = &__raw_get_cpu_var(listener_array); +	listeners = __this_cpu_ptr(&listener_array);  	if (list_empty(&listeners->list))  		return; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index d2321891538..5c00242fa92 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -14,6 +14,7 @@  #include <linux/timex.h>  #include <linux/time.h>  #include <linux/mm.h> +#include <linux/module.h>  /*   * NTP timekeeping variables: @@ -74,6 +75,162 @@ static long			time_adjust;  /* constant (boot-param configurable) NTP tick adjustment (upscaled)	*/  static s64			ntp_tick_adj; +#ifdef CONFIG_NTP_PPS + +/* + * The following variables are used when a pulse-per-second (PPS) signal + * is available. They establish the engineering parameters of the clock + * discipline loop when controlled by the PPS signal. + */ +#define PPS_VALID	10	/* PPS signal watchdog max (s) */ +#define PPS_POPCORN	4	/* popcorn spike threshold (shift) */ +#define PPS_INTMIN	2	/* min freq interval (s) (shift) */ +#define PPS_INTMAX	8	/* max freq interval (s) (shift) */ +#define PPS_INTCOUNT	4	/* number of consecutive good intervals to +				   increase pps_shift or consecutive bad +				   intervals to decrease it */ +#define PPS_MAXWANDER	100000	/* max PPS freq wander (ns/s) */ + +static int pps_valid;		/* signal watchdog counter */ +static long pps_tf[3];		/* phase median filter */ +static long pps_jitter;		/* current jitter (ns) */ +static struct timespec pps_fbase; /* beginning of the last freq interval */ +static int pps_shift;		/* current interval duration (s) (shift) */ +static int pps_intcnt;		/* interval counter */ +static s64 pps_freq;		/* frequency offset (scaled ns/s) */ +static long pps_stabil;		/* current stability (scaled ns/s) */ + +/* + * PPS signal quality monitors + */ +static long pps_calcnt;		/* calibration intervals */ +static long pps_jitcnt;		/* jitter limit exceeded */ +static long pps_stbcnt;		/* stability limit exceeded */ +static long pps_errcnt;		/* calibration errors */ + + +/* PPS kernel consumer compensates the whole phase error immediately. + * Otherwise, reduce the offset by a fixed factor times the time constant. + */ +static inline s64 ntp_offset_chunk(s64 offset) +{ +	if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) +		return offset; +	else +		return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) +{ +	/* the PPS calibration interval may end +	   surprisingly early */ +	pps_shift = PPS_INTMIN; +	pps_intcnt = 0; +} + +/** + * pps_clear - Clears the PPS state variables + * + * Must be called while holding a write on the xtime_lock + */ +static inline void pps_clear(void) +{ +	pps_reset_freq_interval(); +	pps_tf[0] = 0; +	pps_tf[1] = 0; +	pps_tf[2] = 0; +	pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; +	pps_freq = 0; +} + +/* Decrease pps_valid to indicate that another second has passed since + * the last PPS signal. 
When it reaches 0, indicate that PPS signal is + * missing. + * + * Must be called while holding a write on the xtime_lock + */ +static inline void pps_dec_valid(void) +{ +	if (pps_valid > 0) +		pps_valid--; +	else { +		time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | +				 STA_PPSWANDER | STA_PPSERROR); +		pps_clear(); +	} +} + +static inline void pps_set_freq(s64 freq) +{ +	pps_freq = freq; +} + +static inline int is_error_status(int status) +{ +	return (time_status & (STA_UNSYNC|STA_CLOCKERR)) +		/* PPS signal lost when either PPS time or +		 * PPS frequency synchronization requested +		 */ +		|| ((time_status & (STA_PPSFREQ|STA_PPSTIME)) +			&& !(time_status & STA_PPSSIGNAL)) +		/* PPS jitter exceeded when +		 * PPS time synchronization requested */ +		|| ((time_status & (STA_PPSTIME|STA_PPSJITTER)) +			== (STA_PPSTIME|STA_PPSJITTER)) +		/* PPS wander exceeded or calibration error when +		 * PPS frequency synchronization requested +		 */ +		|| ((time_status & STA_PPSFREQ) +			&& (time_status & (STA_PPSWANDER|STA_PPSERROR))); +} + +static inline void pps_fill_timex(struct timex *txc) +{ +	txc->ppsfreq	   = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * +					 PPM_SCALE_INV, NTP_SCALE_SHIFT); +	txc->jitter	   = pps_jitter; +	if (!(time_status & STA_NANO)) +		txc->jitter /= NSEC_PER_USEC; +	txc->shift	   = pps_shift; +	txc->stabil	   = pps_stabil; +	txc->jitcnt	   = pps_jitcnt; +	txc->calcnt	   = pps_calcnt; +	txc->errcnt	   = pps_errcnt; +	txc->stbcnt	   = pps_stbcnt; +} + +#else /* !CONFIG_NTP_PPS */ + +static inline s64 ntp_offset_chunk(s64 offset) +{ +	return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) {} +static inline void pps_clear(void) {} +static inline void pps_dec_valid(void) {} +static inline void pps_set_freq(s64 freq) {} + +static inline int is_error_status(int status) +{ +	return status & (STA_UNSYNC|STA_CLOCKERR); +} + +static inline void pps_fill_timex(struct timex *txc) +{ +	/* PPS is not implemented, so these are zero */ +	txc->ppsfreq	   = 0; +	txc->jitter	   = 0; +	txc->shift	   = 0; +	txc->stabil	   = 0; +	txc->jitcnt	   = 0; +	txc->calcnt	   = 0; +	txc->errcnt	   = 0; +	txc->stbcnt	   = 0; +} + +#endif /* CONFIG_NTP_PPS */ +  /*   * NTP methods:   */ @@ -185,6 +342,9 @@ void ntp_clear(void)  	tick_length	= tick_length_base;  	time_offset	= 0; + +	/* Clear PPS state variables */ +	pps_clear();  }  /* @@ -250,16 +410,16 @@ void second_overflow(void)  		time_status |= STA_UNSYNC;  	} -	/* -	 * Compute the phase adjustment for the next second. The offset is -	 * reduced by a fixed factor times the time constant. 
-	 */ +	/* Compute the phase adjustment for the next second */  	tick_length	 = tick_length_base; -	delta		 = shift_right(time_offset, SHIFT_PLL + time_constant); +	delta		 = ntp_offset_chunk(time_offset);  	time_offset	-= delta;  	tick_length	+= delta; +	/* Check PPS signal */ +	pps_dec_valid(); +  	if (!time_adjust)  		return; @@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)  	if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {  		time_state = TIME_OK;  		time_status = STA_UNSYNC; +		/* restart PPS frequency calibration */ +		pps_reset_freq_interval();  	}  	/* @@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts  		time_freq = txc->freq * PPM_SCALE;  		time_freq = min(time_freq, MAXFREQ_SCALED);  		time_freq = max(time_freq, -MAXFREQ_SCALED); +		/* update pps_freq */ +		pps_set_freq(time_freq);  	}  	if (txc->modes & ADJ_MAXERROR) @@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc)  	}  	result = time_state;	/* mostly `TIME_OK' */ -	if (time_status & (STA_UNSYNC|STA_CLOCKERR)) +	/* check for errors */ +	if (is_error_status(time_status))  		result = TIME_ERROR;  	txc->freq	   = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * @@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc)  	txc->tick	   = tick_usec;  	txc->tai	   = time_tai; -	/* PPS is not implemented, so these are zero */ -	txc->ppsfreq	   = 0; -	txc->jitter	   = 0; -	txc->shift	   = 0; -	txc->stabil	   = 0; -	txc->jitcnt	   = 0; -	txc->calcnt	   = 0; -	txc->errcnt	   = 0; -	txc->stbcnt	   = 0; +	/* fill PPS status fields */ +	pps_fill_timex(txc);  	write_sequnlock_irq(&xtime_lock); @@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc)  	return result;  } +#ifdef	CONFIG_NTP_PPS + +/* actually struct pps_normtime is good old struct timespec, but it is + * semantically different (and it is the reason why it was invented): + * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] + * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ +struct pps_normtime { +	__kernel_time_t	sec;	/* seconds */ +	long		nsec;	/* nanoseconds */ +}; + +/* normalize the timestamp so that nsec is in the +   ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ +static inline struct pps_normtime pps_normalize_ts(struct timespec ts) +{ +	struct pps_normtime norm = { +		.sec = ts.tv_sec, +		.nsec = ts.tv_nsec +	}; + +	if (norm.nsec > (NSEC_PER_SEC >> 1)) { +		norm.nsec -= NSEC_PER_SEC; +		norm.sec++; +	} + +	return norm; +} + +/* get current phase correction and jitter */ +static inline long pps_phase_filter_get(long *jitter) +{ +	*jitter = pps_tf[0] - pps_tf[1]; +	if (*jitter < 0) +		*jitter = -*jitter; + +	/* TODO: test various filters */ +	return pps_tf[0]; +} + +/* add the sample to the phase filter */ +static inline void pps_phase_filter_add(long err) +{ +	pps_tf[2] = pps_tf[1]; +	pps_tf[1] = pps_tf[0]; +	pps_tf[0] = err; +} + +/* decrease frequency calibration interval length. + * It is halved after four consecutive unstable intervals. + */ +static inline void pps_dec_freq_interval(void) +{ +	if (--pps_intcnt <= -PPS_INTCOUNT) { +		pps_intcnt = -PPS_INTCOUNT; +		if (pps_shift > PPS_INTMIN) { +			pps_shift--; +			pps_intcnt = 0; +		} +	} +} + +/* increase frequency calibration interval length. + * It is doubled after four consecutive stable intervals. 
+ */ +static inline void pps_inc_freq_interval(void) +{ +	if (++pps_intcnt >= PPS_INTCOUNT) { +		pps_intcnt = PPS_INTCOUNT; +		if (pps_shift < PPS_INTMAX) { +			pps_shift++; +			pps_intcnt = 0; +		} +	} +} + +/* update clock frequency based on MONOTONIC_RAW clock PPS signal + * timestamps + * + * At the end of the calibration interval the difference between the + * first and last MONOTONIC_RAW clock timestamps divided by the length + * of the interval becomes the frequency update. If the interval was + * too long, the data are discarded. + * Returns the difference between old and new frequency values. + */ +static long hardpps_update_freq(struct pps_normtime freq_norm) +{ +	long delta, delta_mod; +	s64 ftemp; + +	/* check if the frequency interval was too long */ +	if (freq_norm.sec > (2 << pps_shift)) { +		time_status |= STA_PPSERROR; +		pps_errcnt++; +		pps_dec_freq_interval(); +		pr_err("hardpps: PPSERROR: interval too long - %ld s\n", +				freq_norm.sec); +		return 0; +	} + +	/* here the raw frequency offset and wander (stability) is +	 * calculated. If the wander is less than the wander threshold +	 * the interval is increased; otherwise it is decreased. +	 */ +	ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, +			freq_norm.sec); +	delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); +	pps_freq = ftemp; +	if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { +		pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); +		time_status |= STA_PPSWANDER; +		pps_stbcnt++; +		pps_dec_freq_interval(); +	} else {	/* good sample */ +		pps_inc_freq_interval(); +	} + +	/* the stability metric is calculated as the average of recent +	 * frequency changes, but is used only for performance +	 * monitoring +	 */ +	delta_mod = delta; +	if (delta_mod < 0) +		delta_mod = -delta_mod; +	pps_stabil += (div_s64(((s64)delta_mod) << +				(NTP_SCALE_SHIFT - SHIFT_USEC), +				NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; + +	/* if enabled, the system clock frequency is updated */ +	if ((time_status & STA_PPSFREQ) != 0 && +	    (time_status & STA_FREQHOLD) == 0) { +		time_freq = pps_freq; +		ntp_update_frequency(); +	} + +	return delta; +} + +/* correct REALTIME clock phase error against PPS signal */ +static void hardpps_update_phase(long error) +{ +	long correction = -error; +	long jitter; + +	/* add the sample to the median filter */ +	pps_phase_filter_add(correction); +	correction = pps_phase_filter_get(&jitter); + +	/* Nominal jitter is due to PPS signal noise. If it exceeds the +	 * threshold, the sample is discarded; otherwise, if so enabled, +	 * the time offset is updated. +	 */ +	if (jitter > (pps_jitter << PPS_POPCORN)) { +		pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", +		       jitter, (pps_jitter << PPS_POPCORN)); +		time_status |= STA_PPSJITTER; +		pps_jitcnt++; +	} else if (time_status & STA_PPSTIME) { +		/* correct the time using the phase offset */ +		time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, +				NTP_INTERVAL_FREQ); +		/* cancel running adjtime() */ +		time_adjust = 0; +	} +	/* update jitter */ +	pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; +} + +/* + * hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS signal arrival in order to + * discipline the CPU clock oscillator to the PPS signal. It takes two + * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former + * is used to correct clock phase error and the latter is used to + * correct the frequency. 
+ * + * This code is based on David Mills's reference nanokernel + * implementation. It was mostly rewritten but keeps the same idea. + */ +void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ +	struct pps_normtime pts_norm, freq_norm; +	unsigned long flags; + +	pts_norm = pps_normalize_ts(*phase_ts); + +	write_seqlock_irqsave(&xtime_lock, flags); + +	/* clear the error bits, they will be set again if needed */ +	time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + +	/* indicate signal presence */ +	time_status |= STA_PPSSIGNAL; +	pps_valid = PPS_VALID; + +	/* when called for the first time, +	 * just start the frequency interval */ +	if (unlikely(pps_fbase.tv_sec == 0)) { +		pps_fbase = *raw_ts; +		write_sequnlock_irqrestore(&xtime_lock, flags); +		return; +	} + +	/* ok, now we have a base for frequency calculation */ +	freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); + +	/* check that the signal is in the range +	 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ +	if ((freq_norm.sec == 0) || +			(freq_norm.nsec > MAXFREQ * freq_norm.sec) || +			(freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { +		time_status |= STA_PPSJITTER; +		/* restart the frequency calibration interval */ +		pps_fbase = *raw_ts; +		write_sequnlock_irqrestore(&xtime_lock, flags); +		pr_err("hardpps: PPSJITTER: bad pulse\n"); +		return; +	} + +	/* signal is ok */ + +	/* check if the current frequency interval is finished */ +	if (freq_norm.sec >= (1 << pps_shift)) { +		pps_calcnt++; +		/* restart the frequency calibration interval */ +		pps_fbase = *raw_ts; +		hardpps_update_freq(freq_norm); +	} + +	hardpps_update_phase(pts_norm.nsec); + +	write_sequnlock_irqrestore(&xtime_lock, flags); +} +EXPORT_SYMBOL(hardpps); + +#endif	/* CONFIG_NTP_PPS */ +  static int __init ntp_tick_adj_setup(char *str)  {  	ntp_tick_adj = simple_strtol(str, NULL, 0); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b6b898d2eee..051bc80a0c4 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -49,7 +49,7 @@ struct tick_device *tick_get_device(int cpu)   */  int tick_is_oneshot_available(void)  { -	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; +	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);  	return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);  } diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index aada0e52680..5cbc101f908 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -95,7 +95,7 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,   */  int tick_program_event(ktime_t expires, int force)  { -	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; +	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);  	return tick_dev_program_event(dev, expires, force);  } @@ -167,7 +167,7 @@ int tick_oneshot_mode_active(void)  	int ret;  	local_irq_save(flags); -	ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT; +	ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT;  	local_irq_restore(flags);  	return ret; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5bb86da8200..5536aaf3ba3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -288,6 +288,49 @@ void ktime_get_ts(struct timespec *ts)  }  EXPORT_SYMBOL_GPL(ktime_get_ts); +#ifdef CONFIG_NTP_PPS + +/** + * getnstime_raw_and_real - get day and raw 
monotonic time in timespec format + * @ts_raw:	pointer to the timespec to be set to raw monotonic time + * @ts_real:	pointer to the timespec to be set to the time of day + * + * This function reads both the time of day and raw monotonic time at the + * same time atomically and stores the resulting timestamps in timespec + * format. + */ +void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) +{ +	unsigned long seq; +	s64 nsecs_raw, nsecs_real; + +	WARN_ON_ONCE(timekeeping_suspended); + +	do { +		u32 arch_offset; + +		seq = read_seqbegin(&xtime_lock); + +		*ts_raw = raw_time; +		*ts_real = xtime; + +		nsecs_raw = timekeeping_get_ns_raw(); +		nsecs_real = timekeeping_get_ns(); + +		/* If arch requires, add in gettimeoffset() */ +		arch_offset = arch_gettimeoffset(); +		nsecs_raw += arch_offset; +		nsecs_real += arch_offset; + +	} while (read_seqretry(&xtime_lock, seq)); + +	timespec_add_ns(ts_raw, nsecs_raw); +	timespec_add_ns(ts_real, nsecs_real); +} +EXPORT_SYMBOL(getnstime_raw_and_real); + +#endif /* CONFIG_NTP_PPS */ +  /**   * do_gettimeofday - Returns the time of day in a timeval   * @tv:		pointer to the timeval to be set diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 53f338190b2..761c510a06c 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o  endif  obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o  obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o -obj-$(CONFIG_EVENT_TRACING) += power-traces.o +obj-$(CONFIG_TRACEPOINTS) += power-traces.o  ifeq ($(CONFIG_TRACING),y)  obj-$(CONFIG_KGDB_KDB) += trace_kdb.o  endif diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8cf959bad4..dc53ecb8058 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)  	__this_cpu_inc(user_stack_count); - -  	event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,  					  sizeof(*entry), flags, pc);  	if (!event) -		return; +		goto out_drop_count;  	entry	= ring_buffer_event_data(event);  	entry->tgid		= current->tgid; @@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)  	if (!filter_check_discard(call, entry, buffer, event))  		ring_buffer_unlock_commit(buffer, event); + out_drop_count:  	__this_cpu_dec(user_stack_count); -   out:  	preempt_enable();  } diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 562c56e048f..659732eba07 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)  static int trace_wakeup_test_thread(void *data)  {  	/* Make this a RT thread, doesn't need to be too high */ -	static struct sched_param param = { .sched_priority = 5 }; +	static const struct sched_param param = { .sched_priority = 5 };  	struct completion *x = data;  	sched_setscheduler(current, SCHED_FIFO, ¶m); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 25915832291..9da289c34f2 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -12,6 +12,8 @@  #include <linux/highuid.h>  #include <linux/cred.h> +static struct kmem_cache *user_ns_cachep __read_mostly; +  /*   * Create a new user namespace, deriving the creator from the user in the   * passed credentials, and replacing that user with the new root user for the @@ -26,7 +28,7 @@ int create_user_ns(struct 
cred *new)  	struct user_struct *root_user;  	int n; -	ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); +	ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);  	if (!ns)  		return -ENOMEM; @@ -38,7 +40,7 @@ int create_user_ns(struct cred *new)  	/* Alloc new root user.  */  	root_user = alloc_uid(ns, 0);  	if (!root_user) { -		kfree(ns); +		kmem_cache_free(user_ns_cachep, ns);  		return -ENOMEM;  	} @@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work)  	struct user_namespace *ns =  		container_of(work, struct user_namespace, destroyer);  	free_uid(ns->creator); -	kfree(ns); +	kmem_cache_free(user_ns_cachep, ns);  }  void free_user_ns(struct kref *kref) @@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t  	/* No useful relationship so no mapping */  	return overflowgid;  } + +static __init int user_namespaces_init(void) +{ +	user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); +	return 0; +} +module_init(user_namespaces_init); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6e7b575ac33..d7ebdf4cea9 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -118,12 +118,12 @@ static void __touch_watchdog(void)  {  	int this_cpu = smp_processor_id(); -	__get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); +	__this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu));  }  void touch_softlockup_watchdog(void)  { -	__raw_get_cpu_var(watchdog_touch_ts) = 0; +	__this_cpu_write(watchdog_touch_ts, 0);  }  EXPORT_SYMBOL(touch_softlockup_watchdog); @@ -167,12 +167,12 @@ void touch_softlockup_watchdog_sync(void)  /* watchdog detector functions */  static int is_hardlockup(void)  { -	unsigned long hrint = __get_cpu_var(hrtimer_interrupts); +	unsigned long hrint = __this_cpu_read(hrtimer_interrupts); -	if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) +	if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)  		return 1; -	__get_cpu_var(hrtimer_interrupts_saved) = hrint; +	__this_cpu_write(hrtimer_interrupts_saved, hrint);  	return 0;  }  #endif @@ -205,8 +205,8 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,  	/* Ensure the watchdog never gets throttled */  	event->hw.interrupts = 0; -	if (__get_cpu_var(watchdog_nmi_touch) == true) { -		__get_cpu_var(watchdog_nmi_touch) = false; +	if (__this_cpu_read(watchdog_nmi_touch) == true) { +		__this_cpu_write(watchdog_nmi_touch, false);  		return;  	} @@ -220,7 +220,7 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,  		int this_cpu = smp_processor_id();  		/* only print hardlockups once */ -		if (__get_cpu_var(hard_watchdog_warn) == true) +		if (__this_cpu_read(hard_watchdog_warn) == true)  			return;  		if (hardlockup_panic) @@ -228,16 +228,16 @@ static void watchdog_overflow_callback(struct perf_event *event, int nmi,  		else  			WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); -		__get_cpu_var(hard_watchdog_warn) = true; +		__this_cpu_write(hard_watchdog_warn, true);  		return;  	} -	__get_cpu_var(hard_watchdog_warn) = false; +	__this_cpu_write(hard_watchdog_warn, false);  	return;  }  static void watchdog_interrupt_count(void)  { -	__get_cpu_var(hrtimer_interrupts)++; +	__this_cpu_inc(hrtimer_interrupts);  }  #else  static inline void watchdog_interrupt_count(void) { return; } @@ -246,7 +246,7 @@ static inline void watchdog_interrupt_count(void) { return; }  /* watchdog kicker functions */  static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)  { -	unsigned long touch_ts = 
__get_cpu_var(watchdog_touch_ts); +	unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);  	struct pt_regs *regs = get_irq_regs();  	int duration; @@ -254,18 +254,18 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)  	watchdog_interrupt_count();  	/* kick the softlockup detector */ -	wake_up_process(__get_cpu_var(softlockup_watchdog)); +	wake_up_process(__this_cpu_read(softlockup_watchdog));  	/* .. and repeat */  	hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));  	if (touch_ts == 0) { -		if (unlikely(__get_cpu_var(softlockup_touch_sync))) { +		if (unlikely(__this_cpu_read(softlockup_touch_sync))) {  			/*  			 * If the time stamp was touched atomically  			 * make sure the scheduler tick is up to date.  			 */ -			__get_cpu_var(softlockup_touch_sync) = false; +			__this_cpu_write(softlockup_touch_sync, false);  			sched_clock_tick();  		}  		__touch_watchdog(); @@ -281,7 +281,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)  	duration = is_softlockup(touch_ts);  	if (unlikely(duration)) {  		/* only warn once */ -		if (__get_cpu_var(soft_watchdog_warn) == true) +		if (__this_cpu_read(soft_watchdog_warn) == true)  			return HRTIMER_RESTART;  		printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", @@ -296,9 +296,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)  		if (softlockup_panic)  			panic("softlockup: hung tasks"); -		__get_cpu_var(soft_watchdog_warn) = true; +		__this_cpu_write(soft_watchdog_warn, true);  	} else -		__get_cpu_var(soft_watchdog_warn) = false; +		__this_cpu_write(soft_watchdog_warn, false);  	return HRTIMER_RESTART;  } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e785b0f2aea..8ee6ec82f88 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -932,6 +932,38 @@ static void insert_work(struct cpu_workqueue_struct *cwq,  		wake_up_worker(gcwq);  } +/* + * Test whether @work is being queued from another work executing on the + * same workqueue.  This is rather expensive and should only be used from + * cold paths. + */ +static bool is_chained_work(struct workqueue_struct *wq) +{ +	unsigned long flags; +	unsigned int cpu; + +	for_each_gcwq_cpu(cpu) { +		struct global_cwq *gcwq = get_gcwq(cpu); +		struct worker *worker; +		struct hlist_node *pos; +		int i; + +		spin_lock_irqsave(&gcwq->lock, flags); +		for_each_busy_worker(worker, i, pos, gcwq) { +			if (worker->task != current) +				continue; +			spin_unlock_irqrestore(&gcwq->lock, flags); +			/* +			 * I'm @worker, no locking necessary.  See if @work +			 * is headed to the same workqueue. +			 */ +			return worker->current_cwq->wq == wq; +		} +		spin_unlock_irqrestore(&gcwq->lock, flags); +	} +	return false; +} +  static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,  			 struct work_struct *work)  { @@ -943,7 +975,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,  	debug_work_activate(work); -	if (WARN_ON_ONCE(wq->flags & WQ_DYING)) +	/* if dying, only works from the same workqueue are allowed */ +	if (unlikely(wq->flags & WQ_DYING) && +	    WARN_ON_ONCE(!is_chained_work(wq)))  		return;  	/* determine gcwq to use */ @@ -2936,11 +2970,35 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);   */  void destroy_workqueue(struct workqueue_struct *wq)  { +	unsigned int flush_cnt = 0;  	unsigned int cpu; +	/* +	 * Mark @wq dying and drain all pending works.  Once WQ_DYING is +	 * set, only chain queueing is allowed.  
IOW, only currently +	 * pending or running work items on @wq can queue further work +	 * items on it.  @wq is flushed repeatedly until it becomes empty. +	 * The number of flushes is determined by the depth of chaining and +	 * should be relatively short.  Whine if it takes too long. +	 */  	wq->flags |= WQ_DYING; +reflush:  	flush_workqueue(wq); +	for_each_cwq_cpu(cpu, wq) { +		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + +		if (!cwq->nr_active && list_empty(&cwq->delayed_works)) +			continue; + +		if (++flush_cnt == 10 || +		    (flush_cnt % 100 == 0 && flush_cnt <= 1000)) +			printk(KERN_WARNING "workqueue %s: flush on " +			       "destruction isn't complete after %u tries\n", +			       wq->name, flush_cnt); +		goto reflush; +	} +  	/*  	 * wq list is used to freeze wq, remove from list after  	 * flushing is complete in case freeze races us.
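
The destroy_workqueue() hunk above drains a dying workqueue by reflushing until no per-cpu queue has active or delayed work left, and warns if chained re-queueing keeps it busy for many passes. The following is only a rough userspace C analogue of that drain-and-whine pattern, not kernel code; struct demo_queue and all demo_* names are invented for this sketch.

/*
 * Illustrative userspace analogue of the drain loop added to
 * destroy_workqueue() above: keep flushing while chained items keep
 * re-queueing work, and warn if it takes suspiciously many passes.
 * All names are invented; this is not kernel API.
 */
#include <stdio.h>

struct demo_queue {
	unsigned int pending;	/* items currently queued */
	unsigned int chained;	/* items that re-queue work when flushed */
};

static void demo_flush(struct demo_queue *q)
{
	/*
	 * A flush runs everything queued; chained items may queue a
	 * (shrinking) amount of new work, like chain queueing on a
	 * WQ_DYING workqueue.
	 */
	q->pending = q->chained;
	if (q->chained)
		q->chained--;
}

static void demo_destroy(struct demo_queue *q)
{
	unsigned int flush_cnt = 0;

	do {
		demo_flush(q);
		if (q->pending &&
		    (++flush_cnt == 10 ||
		     (flush_cnt % 100 == 0 && flush_cnt <= 1000)))
			fprintf(stderr, "demo queue: flush on destruction "
				"isn't complete after %u tries\n", flush_cnt);
	} while (q->pending);
}

int main(void)
{
	struct demo_queue q = { .pending = 1, .chained = 12 };

	demo_destroy(&q);
	printf("queue drained once chained re-queueing stopped\n");
	return 0;
}

The real patch additionally restricts queueing on a WQ_DYING workqueue to callers that pass is_chained_work(), so the number of passes is bounded by the chain depth rather than by arbitrary new work.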
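
Many of the hunks above (hrtimer, tick, watchdog, fork/exit) convert open-coded __get_cpu_var() reads and writes to the __this_cpu_read()/__this_cpu_write()/__this_cpu_inc() accessors, which let the architecture emit a single per-cpu operation instead of first computing the per-cpu address. Below is a minimal, out-of-tree module sketch of the new accessors; the demo_counter variable and demo_* names are invented for illustration, and the caller remains responsible for staying on one CPU (here via get_cpu()/put_cpu()), exactly as with the old API.

/*
 * Minimal, illustrative out-of-tree module sketch of the __this_cpu_*()
 * accessors the patches above convert to.  demo_* names are invented.
 */
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(unsigned long, demo_counter);

static void demo_touch(void)
{
	/*
	 * Old style:  __get_cpu_var(demo_counter)++;
	 * New style:  one per-cpu increment, no separate address generation.
	 * The __ variants do not disable preemption themselves, so the
	 * caller must already be pinned to a CPU.
	 */
	__this_cpu_inc(demo_counter);
}

static int __init demo_init(void)
{
	unsigned long val;
	int cpu;

	cpu = get_cpu();		/* disables preemption */
	demo_touch();
	val = __this_cpu_read(demo_counter);
	pr_info("demo: cpu%d counter=%lu\n", cpu, val);
	put_cpu();

	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");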