Diffstat (limited to 'kernel')
-rw-r--r--   kernel/cgroup.c           |  13
-rw-r--r--   kernel/events/core.c      |  10
-rw-r--r--   kernel/exit.c             |  19
-rw-r--r--   kernel/panic.c            |   6
-rw-r--r--   kernel/pid_namespace.c    |  20
-rw-r--r--   kernel/printk.c           | 241
-rw-r--r--   kernel/rcutree.c          |   2
-rw-r--r--   kernel/rcutree.h          |  14
-rw-r--r--   kernel/rcutree_plugin.h   | 165
-rw-r--r--   kernel/sys.c              |   6
-rw-r--r--   kernel/time/tick-sched.c  |   7
-rw-r--r--   kernel/trace/trace.c      |   2
-rw-r--r--   kernel/watchdog.c         |  19
13 files changed, 393 insertions, 131 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 72fcd3069a9..2097684cf19 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -255,12 +255,17 @@ int cgroup_lock_is_held(void)
 
 EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
 
+static int css_unbias_refcnt(int refcnt)
+{
+	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
+}
+
 /* the current nr of refs, always >= 0 whether @css is deactivated or not */
 static int css_refcnt(struct cgroup_subsys_state *css)
 {
 	int v = atomic_read(&css->refcnt);
 
-	return v >= 0 ? v : v - CSS_DEACT_BIAS;
+	return css_unbias_refcnt(v);
 }
 
 /* convenient tests for these bits */
@@ -4982,10 +4987,12 @@ EXPORT_SYMBOL_GPL(__css_tryget);
 void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
+	int v;
 
 	rcu_read_lock();
-	atomic_dec(&css->refcnt);
-	switch (css_refcnt(css)) {
+	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
+
+	switch (v) {
 	case 1:
 		if (notify_on_release(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f85c0154b33..d7d71d6ec97 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event)
 	return !event->cgrp || event->cgrp == cpuctx->cgrp;
 }
 
-static inline void perf_get_cgroup(struct perf_event *event)
+static inline bool perf_tryget_cgroup(struct perf_event *event)
 {
-	css_get(&event->cgrp->css);
+	return css_tryget(&event->cgrp->css);
 }
 
 static inline void perf_put_cgroup(struct perf_event *event)
@@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 	event->cgrp = cgrp;
 
 	/* must be done before we fput() the file */
-	perf_get_cgroup(event);
+	if (!perf_tryget_cgroup(event)) {
+		event->cgrp = NULL;
+		ret = -ENOENT;
+		goto out;
+	}
 
 	/*
 	 * all events in a group must monitor
diff --git a/kernel/exit.c b/kernel/exit.c
index 34867cc5b42..2f59cc33451 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,18 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
 		__this_cpu_dec(process_counts);
+		/*
+		 * If we are the last child process in a pid namespace to be
+		 * reaped, notify the reaper sleeping zap_pid_ns_processes().
+		 */
+		if (IS_ENABLED(CONFIG_PID_NS)) {
+			struct task_struct *parent = p->real_parent;
+
+			if ((task_active_pid_ns(parent)->child_reaper == parent) &&
+			    list_empty(&parent->children) &&
+			    (parent->flags & PF_EXITING))
+				wake_up_process(parent);
+		}
 	}
 	list_del_rcu(&p->thread_group);
 }
@@ -643,6 +655,7 @@ static void exit_mm(struct task_struct * tsk)
 	mm_release(tsk, mm);
 	if (!mm)
 		return;
+	sync_mm_rss(mm);
 	/*
 	 * Serialize with any possible pending coredump.
 	 * We must hold mmap_sem around checking core_state
@@ -719,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
 
 		zap_pid_ns_processes(pid_ns);
 		write_lock_irq(&tasklist_lock);
-		/*
-		 * We can not clear ->child_reaper or leave it alone.
-		 * There may by stealth EXIT_DEAD tasks on ->children,
-		 * forget_original_parent() must move them somewhere.
-		 */
-		pid_ns->child_reaper = init_pid_ns.child_reaper;
 	} else if (father->signal->has_child_subreaper) {
 		struct task_struct *reaper;
 
diff --git a/kernel/panic.c b/kernel/panic.c
index 8ed89a175d7..d2a5f4ecc6d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,7 +27,7 @@
 #define PANIC_TIMER_STEP 100
 #define PANIC_BLINK_SPD 18
 
-int panic_on_oops;
+int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
 static unsigned long tainted_mask;
 static int pause_on_oops;
 static int pause_on_oops_flag;
@@ -108,8 +108,6 @@ void panic(const char *fmt, ...)
 	 */
 	crash_kexec(NULL);
 
-	kmsg_dump(KMSG_DUMP_PANIC);
-
 	/*
 	 * Note smp_send_stop is the usual smp shutdown function, which
 	 * unfortunately means it may not be hardened to work in a panic
@@ -117,6 +115,8 @@ void panic(const char *fmt, ...)
 	 */
 	smp_send_stop();
 
+	kmsg_dump(KMSG_DUMP_PANIC);
+
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
 	bust_spinlocks(0);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 16b20e38c4a..b3c7fd55425 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	}
 	read_unlock(&tasklist_lock);
 
+	/* Firstly reap the EXIT_ZOMBIE children we may have. */
 	do {
 		clear_thread_flag(TIF_SIGPENDING);
 		rc = sys_wait4(-1, NULL, __WALL, NULL);
 	} while (rc != -ECHILD);
 
+	/*
+	 * sys_wait4() above can't reap the TASK_DEAD children.
+	 * Make sure they all go away, see __unhash_process().
+	 */
+	for (;;) {
+		bool need_wait = false;
+
+		read_lock(&tasklist_lock);
+		if (!list_empty(&current->children)) {
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			need_wait = true;
+		}
+		read_unlock(&tasklist_lock);
+
+		if (!need_wait)
+			break;
+		schedule();
+	}
+
 	if (pid_ns->reboot)
 		current->signal->group_exit_code = pid_ns->reboot;
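Taken together with the __unhash_process() hunk in kernel/exit.c above, the pid_namespace.c change implements a small open-coded sleep/wake handshake: the namespace reaper publishes its intention to sleep with __set_current_state() while still holding tasklist_lock, re-checks the condition, and only then calls schedule(); the last exiting child wakes it with wake_up_process(). The self-contained sketch below shows the same pattern outside the reaper code; the module and symbol names are hypothetical and only the pattern mirrors the patch. Because the task state is set before the final check under the lock, a wakeup that races with that check cannot be lost.

/* handshake_example.c - hypothetical module illustrating the pattern */
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

static bool done;
static DEFINE_SPINLOCK(done_lock);

static int waiter_fn(void *unused)
{
	for (;;) {
		bool need_wait = false;

		spin_lock(&done_lock);
		if (!done) {
			/* publish the sleep before the final check/unlock */
			__set_current_state(TASK_UNINTERRUPTIBLE);
			need_wait = true;
		}
		spin_unlock(&done_lock);

		if (!need_wait)
			break;
		schedule();		/* woken by wake_up_process() below */
	}
	pr_info("handshake_example: condition observed\n");
	return 0;
}

static int __init handshake_init(void)
{
	struct task_struct *waiter;

	waiter = kthread_run(waiter_fn, NULL, "handshake-waiter");
	if (IS_ERR(waiter))
		return PTR_ERR(waiter);

	spin_lock(&done_lock);
	done = true;			/* the "last child is gone" condition */
	spin_unlock(&done_lock);
	wake_up_process(waiter);	/* mirrors __unhash_process() */
	return 0;
}

module_init(handshake_init);
MODULE_LICENSE("GPL");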
diff --git a/kernel/printk.c b/kernel/printk.c
index 32462d2b364..a2276b91676 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -227,10 +227,10 @@ static u32 clear_idx;
 #define LOG_LINE_MAX 1024
 
 /* record buffer */
-#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
 #define LOG_ALIGN 4
 #else
-#define LOG_ALIGN 8
+#define LOG_ALIGN __alignof__(struct log)
 #endif
 #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
 static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
@@ -414,7 +414,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	if (!user)
 		return -EBADF;
 
-	mutex_lock(&user->lock);
+	ret = mutex_lock_interruptible(&user->lock);
+	if (ret)
+		return ret;
 	raw_spin_lock(&logbuf_lock);
 	while (user->seq == log_next_seq) {
 		if (file->f_flags & O_NONBLOCK) {
@@ -878,7 +880,9 @@ static int syslog_print(char __user *buf, int size)
 	syslog_seq++;
 	raw_spin_unlock_irq(&logbuf_lock);
 
-	if (len > 0 && copy_to_user(buf, text, len))
+	if (len > size)
+		len = -EINVAL;
+	else if (len > 0 && copy_to_user(buf, text, len))
 		len = -EFAULT;
 
 	kfree(text);
@@ -909,7 +913,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 		/*
 		 * Find first record that fits, including all following records,
 		 * into the user-provided buffer for this dump.
-		*/
+		 */
 		seq = clear_seq;
 		idx = clear_idx;
 		while (seq < log_next_seq) {
@@ -919,6 +923,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 			idx = log_next(idx);
 			seq++;
 		}
+
+		/* move first record forward until length fits into the buffer */
 		seq = clear_seq;
 		idx = clear_idx;
 		while (len > size && seq < log_next_seq) {
@@ -929,7 +935,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 			seq++;
 		}
 
-		/* last message in this dump */
+		/* last message fitting into this dump */
 		next_seq = log_next_seq;
 
 		len = 0;
@@ -974,6 +980,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 {
 	bool clear = false;
 	static int saved_console_loglevel = -1;
+	static DEFINE_MUTEX(syslog_mutex);
 	int error;
 
 	error = check_syslog_permissions(type, from_file);
@@ -1000,11 +1007,17 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			error = -EFAULT;
 			goto out;
 		}
+		error = mutex_lock_interruptible(&syslog_mutex);
+		if (error)
+			goto out;
 		error = wait_event_interruptible(log_wait,
 						 syslog_seq != log_next_seq);
-		if (error)
+		if (error) {
+			mutex_unlock(&syslog_mutex);
 			goto out;
+		}
 		error = syslog_print(buf, len);
+		mutex_unlock(&syslog_mutex);
 		break;
 	/* Read/clear last kernel messages */
 	case SYSLOG_ACTION_READ_CLEAR:
@@ -2300,48 +2313,210 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
  * kmsg_dump - dump kernel log to kernel message dumpers.
  * @reason: the reason (oops, panic etc) for dumping
  *
- * Iterate through each of the dump devices and call the oops/panic
- * callbacks with the log buffer.
+ * Call each of the registered dumper's dump() callback, which can
+ * retrieve the kmsg records with kmsg_dump_get_line() or
+ * kmsg_dump_get_buffer().
  */
 void kmsg_dump(enum kmsg_dump_reason reason)
 {
-	u64 idx;
 	struct kmsg_dumper *dumper;
-	const char *s1, *s2;
-	unsigned long l1, l2;
 	unsigned long flags;
 
 	if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
 		return;
 
-	/* Theoretically, the log could move on after we do this, but
-	   there's not a lot we can do about that. The new messages
-	   will overwrite the start of what we dump. */
+	rcu_read_lock();
+	list_for_each_entry_rcu(dumper, &dump_list, list) {
+		if (dumper->max_reason && reason > dumper->max_reason)
+			continue;
+
+		/* initialize iterator with data about the stored records */
+		dumper->active = true;
+
+		raw_spin_lock_irqsave(&logbuf_lock, flags);
+		dumper->cur_seq = clear_seq;
+		dumper->cur_idx = clear_idx;
+		dumper->next_seq = log_next_seq;
+		dumper->next_idx = log_next_idx;
+		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
+		/* invoke dumper which will iterate over records */
+		dumper->dump(dumper, reason);
+
+		/* reset iterator */
+		dumper->active = false;
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * kmsg_dump_get_line - retrieve one kmsg log line
+ * @dumper: registered kmsg dumper
+ * @syslog: include the "<4>" prefixes
+ * @line: buffer to copy the line to
+ * @size: maximum size of the buffer
+ * @len: length of line placed into buffer
+ *
+ * Start at the beginning of the kmsg buffer, with the oldest kmsg
+ * record, and copy one record into the provided buffer.
+ *
+ * Consecutive calls will return the next available record moving
+ * towards the end of the buffer with the youngest messages.
+ *
+ * A return value of FALSE indicates that there are no more records to
+ * read.
+ */
+bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
+			char *line, size_t size, size_t *len)
+{
+	unsigned long flags;
+	struct log *msg;
+	size_t l = 0;
+	bool ret = false;
+
+	if (!dumper->active)
+		goto out;
 
 	raw_spin_lock_irqsave(&logbuf_lock, flags);
-	if (syslog_seq < log_first_seq)
-		idx = syslog_idx;
-	else
-		idx = log_first_idx;
+	if (dumper->cur_seq < log_first_seq) {
+		/* messages are gone, move to first available one */
+		dumper->cur_seq = log_first_seq;
+		dumper->cur_idx = log_first_idx;
+	}
+
+	/* last entry */
+	if (dumper->cur_seq >= log_next_seq) {
+		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+		goto out;
+	}
 
-	if (idx > log_next_idx) {
-		s1 = log_buf;
-		l1 = log_next_idx;
+	msg = log_from_idx(dumper->cur_idx);
+	l = msg_print_text(msg, syslog,
+			      line, size);
 
-		s2 = log_buf + idx;
-		l2 = log_buf_len - idx;
-	} else {
-		s1 = "";
-		l1 = 0;
+	dumper->cur_idx = log_next(dumper->cur_idx);
+	dumper->cur_seq++;
+	ret = true;
+	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+out:
+	if (len)
+		*len = l;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
+
+/**
+ * kmsg_dump_get_buffer - copy kmsg log lines
+ * @dumper: registered kmsg dumper
+ * @syslog: include the "<4>" prefixes
+ * @line: buffer to copy the line to
+ * @size: maximum size of the buffer
+ * @len: length of line placed into buffer
+ *
+ * Start at the end of the kmsg buffer and fill the provided buffer
+ * with as many of the the *youngest* kmsg records that fit into it.
+ * If the buffer is large enough, all available kmsg records will be
+ * copied with a single call.
+ *
+ * Consecutive calls will fill the buffer with the next block of
+ * available older records, not including the earlier retrieved ones.
+ *
+ * A return value of FALSE indicates that there are no more records to
+ * read.
+ */
+bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
+			  char *buf, size_t size, size_t *len)
+{
+	unsigned long flags;
+	u64 seq;
+	u32 idx;
+	u64 next_seq;
+	u32 next_idx;
+	size_t l = 0;
+	bool ret = false;
+
+	if (!dumper->active)
+		goto out;
+
+	raw_spin_lock_irqsave(&logbuf_lock, flags);
+	if (dumper->cur_seq < log_first_seq) {
+		/* messages are gone, move to first available one */
+		dumper->cur_seq = log_first_seq;
+		dumper->cur_idx = log_first_idx;
+	}
+
+	/* last entry */
+	if (dumper->cur_seq >= dumper->next_seq) {
+		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+		goto out;
+	}
+
+	/* calculate length of entire buffer */
+	seq = dumper->cur_seq;
+	idx = dumper->cur_idx;
+	while (seq < dumper->next_seq) {
+		struct log *msg = log_from_idx(idx);
+
+		l += msg_print_text(msg, true, NULL, 0);
+		idx = log_next(idx);
+		seq++;
+	}
 
-		s2 = log_buf + idx;
-		l2 = log_next_idx - idx;
+	/* move first record forward until length fits into the buffer */
+	seq = dumper->cur_seq;
+	idx = dumper->cur_idx;
+	while (l > size && seq < dumper->next_seq) {
+		struct log *msg = log_from_idx(idx);
+
+		l -= msg_print_text(msg, true, NULL, 0);
+		idx = log_next(idx);
+		seq++;
+	}
+
+	/* last message in next interation */
+	next_seq = seq;
+	next_idx = idx;
+
+	l = 0;
+	while (seq < dumper->next_seq) {
+		struct log *msg = log_from_idx(idx);
+
+		l += msg_print_text(msg, syslog,
+				    buf + l, size - l);
+
+		idx = log_next(idx);
+		seq++;
 	}
+
+	dumper->next_seq = next_seq;
+	dumper->next_idx = next_idx;
+	ret = true;
 	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+out:
+	if (len)
+		*len = l;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(dumper, &dump_list, list)
-		dumper->dump(dumper, reason, s1, l1, s2, l2);
-	rcu_read_unlock();
+/**
+ * kmsg_dump_rewind - reset the interator
+ * @dumper: registered kmsg dumper
+ *
+ * Reset the dumper's iterator so that kmsg_dump_get_line() and
+ * kmsg_dump_get_buffer() can be called again and used multiple
+ * times within the same dumper.dump() callback.
+ */
+void kmsg_dump_rewind(struct kmsg_dumper *dumper)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&logbuf_lock, flags);
+	dumper->cur_seq = clear_seq;
+	dumper->cur_idx = clear_idx;
+	dumper->next_seq = log_next_seq;
+	dumper->next_idx = log_next_idx;
+	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 }
+EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
 
 #endif
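With this rework, a message dumper no longer receives raw log-buffer pointers (s1/l1/s2/l2); its dump() callback pulls records through the iterator calls documented above. A minimal sketch of a dumper written against the new interface might look like the following. The module and function names are made up for illustration; only kmsg_dump_register()/kmsg_dump_unregister(), the dump()/max_reason fields and kmsg_dump_get_line() come from the API shown in this patch. A real dumper (mtdoops, ramoops) would write the retrieved lines to persistent storage rather than feed them back through printk().

/* kmsg_dump_example.c - hypothetical module, for illustration only */
#include <linux/kernel.h>
#include <linux/kmsg_dump.h>
#include <linux/module.h>

static char line[256];

/* Called by kmsg_dump() for each registered dumper. */
static void example_dump(struct kmsg_dumper *dumper,
			 enum kmsg_dump_reason reason)
{
	size_t len;

	/* Walk the records from oldest to newest, one line per call. */
	while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
		pr_info("kmsg_dump_example: %.*s", (int)len, line);
}

static struct kmsg_dumper example_dumper = {
	.dump	    = example_dump,
	/* skipped for reasons beyond an oops, per the max_reason check above */
	.max_reason = KMSG_DUMP_OOPS,
};

static int __init example_init(void)
{
	return kmsg_dump_register(&example_dumper);
}

static void __exit example_exit(void)
{
	kmsg_dump_unregister(&example_dumper);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

Within a single dump() invocation, kmsg_dump_rewind() can be used to restart the iteration, for example to compute a total size before copying the records out with kmsg_dump_get_buffer().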
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0da7b88d92d..3b0f1337f75 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1397,6 +1397,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
 	rdp->qlen_lazy += rsp->qlen_lazy;
 	rdp->qlen += rsp->qlen;
 	rdp->n_cbs_adopted += rsp->qlen;
+	if (rsp->qlen_lazy != rsp->qlen)
+		rcu_idle_count_callbacks_posted();
 	rsp->qlen_lazy = 0;
 	rsp->qlen = 0;
 
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 7f5d138dedf..ea056495783 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,6 +84,20 @@ struct rcu_dynticks {
 				    /* Process level is worth LLONG_MAX/2. */
 	int dynticks_nmi_nesting;   /* Track NMI nesting level. */
 	atomic_t dynticks;	    /* Even value for idle, else odd. */
+#ifdef CONFIG_RCU_FAST_NO_HZ
+	int dyntick_drain;	    /* Prepare-for-idle state variable. */
+	unsigned long dyntick_holdoff;
+				    /* No retries for the jiffy of failure. */
+	struct timer_list idle_gp_timer;
+				    /* Wake up CPU sleeping with callbacks. */
+	unsigned long idle_gp_timer_expires;
+				    /* When to wake up CPU (for repost). */
+	bool idle_first_pass;	    /* First pass of attempt to go idle? */
+	unsigned long nonlazy_posted;
+				    /* # times non-lazy CBs posted to CPU. */
+	unsigned long nonlazy_posted_snap;
+				    /* idle-period nonlazy_posted snapshot. */
+#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 };
 
 /* RCU's kthread states for tracing. */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 2411000d986..5271a020887 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1886,8 +1886,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
  * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
  * any flavor of RCU.
  */
-int rcu_needs_cpu(int cpu)
+int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
 {
+	*delta_jiffies = ULONG_MAX;
 	return rcu_cpu_has_callbacks(cpu);
 }
 
@@ -1962,41 +1963,6 @@ static void rcu_idle_count_callbacks_posted(void)
 #define RCU_IDLE_GP_DELAY 6		/* Roughly one grace period. */
 #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ)	/* Roughly six seconds. */
 
-/* Loop counter for rcu_prepare_for_idle(). */
-static DEFINE_PER_CPU(int, rcu_dyntick_drain);
-/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */
-static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
-/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */
-static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer);
-/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */
-static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires);
-/* Enable special processing on first attempt to enter dyntick-idle mode. */
-static DEFINE_PER_CPU(bool, rcu_idle_first_pass);
-/* Running count of non-lazy callbacks posted, never decremented. */
-static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted);
-/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */
-static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap);
-
-/*
- * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
- * callbacks on this CPU, (2) this CPU has not yet attempted to enter
- * dyntick-idle mode, or (3) this CPU is in the process of attempting to
- * enter dyntick-idle mode.  Otherwise, if we have recently tried and failed
- * to enter dyntick-idle mode, we refuse to try to enter it.  After all,
- * it is better to incur scheduling-clock interrupts than to spin
- * continuously for the same time duration!
- */
-int rcu_needs_cpu(int cpu)
-{
-	/* Flag a new idle sojourn to the idle-entry state machine. */
-	per_cpu(rcu_idle_first_pass, cpu) = 1;
-	/* If no callbacks, RCU doesn't need the CPU. */
-	if (!rcu_cpu_has_callbacks(cpu))
-		return 0;
-	/* Otherwise, RCU needs the CPU only if it recently tried and failed. */
-	return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
-}
-
 /*
  * Does the specified flavor of RCU have non-lazy callbacks pending on
  * the specified CPU?  Both RCU flavor and CPU are specified by the
@@ -2040,6 +2006,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
 }
 
 /*
+ * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
+ * callbacks on this CPU, (2) this CPU has not yet attempted to enter
+ * dyntick-idle mode, or (3) this CPU is in the process of attempting to
+ * enter dyntick-idle mode.  Otherwise, if we have recently tried and failed
+ * to enter dyntick-idle mode, we refuse to try to enter it.  After all,
+ * it is better to incur scheduling-clock interrupts than to spin
+ * continuously for the same time duration!
+ *
+ * The delta_jiffies argument is used to store the time when RCU is
+ * going to need the CPU again if it still has callbacks.  The reason
+ * for this is that rcu_prepare_for_idle() might need to post a timer,
+ * but if so, it will do so after tick_nohz_stop_sched_tick() has set
+ * the wakeup time for this CPU.  This means that RCU's timer can be
+ * delayed until the wakeup time, which defeats the purpose of posting
+ * a timer.
+ */
+int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
+{
+	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+	/* Flag a new idle sojourn to the idle-entry state machine. */
+	rdtp->idle_first_pass = 1;
+	/* If no callbacks, RCU doesn't need the CPU. */
+	if (!rcu_cpu_has_callbacks(cpu)) {
+		*delta_jiffies = ULONG_MAX;
+		return 0;
+	}
+	if (rdtp->dyntick_holdoff == jiffies) {
+		/* RCU recently tried and failed, so don't try again. */
+		*delta_jiffies = 1;
+		return 1;
+	}
+	/* Set up for the possibility that RCU will post a timer. */
+	if (rcu_cpu_has_nonlazy_callbacks(cpu))
+		*delta_jiffies = RCU_IDLE_GP_DELAY;
+	else
+		*delta_jiffies = RCU_IDLE_LAZY_GP_DELAY;
+	return 0;
+}
+
+/*
  * Handler for smp_call_function_single().  The only point of this
  * handler is to wake the CPU up, so the handler does only tracing.
  */
@@ -2075,21 +2082,24 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in)
  */
 static void rcu_prepare_for_idle_init(int cpu)
 {
-	per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
-	setup_timer(&per_cpu(rcu_idle_gp_timer, cpu),
-		    rcu_idle_gp_timer_func, cpu);
-	per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1;
-	per_cpu(rcu_idle_first_pass, cpu) = 1;
+	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+	rdtp->dyntick_holdoff = jiffies - 1;
+	setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
+	rdtp->idle_gp_timer_expires = jiffies - 1;
+	rdtp->idle_first_pass = 1;
 }
 
 /*
  * Clean up for exit from idle.  Because we are exiting from idle, there
- * is no longer any point to rcu_idle_gp_timer, so cancel it.  This will
+ * is no longer any point to ->idle_gp_timer, so cancel it.  This will
  * do nothing if this timer is not active, so just cancel it unconditionally.
  */
 static void rcu_cleanup_after_idle(int cpu)
 {
-	del_timer(&per_cpu(rcu_idle_gp_timer, cpu));
+	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+	del_timer(&rdtp->idle_gp_timer);
 	trace_rcu_prep_idle("Cleanup after idle");
 }
 
@@ -2108,42 +2118,41 @@ static void rcu_cleanup_after_idle(int cpu)
  * Because it is not legal to invoke rcu_process_callbacks() with irqs
  * disabled, we do one pass of force_quiescent_state(), then do a
  * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
- * later.  The per-cpu rcu_dyntick_drain variable controls the sequencing.
+ * later.  The ->dyntick_drain field controls the sequencing.
  *
  * The caller must have disabled interrupts.
  */
 static void rcu_prepare_for_idle(int cpu)
 {
 	struct timer_list *tp;
+	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 
 	/*
 	 * If this is an idle re-entry, for example, due to use of
 	 * RCU_NONIDLE() or the new idle-loop tracing API within the idle
 	 * loop, then don't take any state-machine actions, unless the
 	 * momentary exit from idle queued additional non-lazy callbacks.
-	 * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks
+	 * Instead, repost the ->idle_gp_timer if this CPU has callbacks
 	 * pending.
 	 */
-	if (!per_cpu(rcu_idle_first_pass, cpu) &&
-	    (per_cpu(rcu_nonlazy_posted, cpu) ==
-	     per_cpu(rcu_nonlazy_posted_snap, cpu))) {
+	if (!rdtp->idle_first_pass &&
+	    (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
 		if (rcu_cpu_has_callbacks(cpu)) {
-			tp = &per_cpu(rcu_idle_gp_timer, cpu);
-			mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
+			tp = &rdtp->idle_gp_timer;
+			mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
 		}
 		return;
 	}
-	per_cpu(rcu_idle_first_pass, cpu) = 0;
-	per_cpu(rcu_nonlazy_posted_snap, cpu) =
-		per_cpu(rcu_nonlazy_posted, cpu) - 1;
+	rdtp->idle_first_pass = 0;
+	rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
 
 	/*
 	 * If there are no callbacks on this CPU, enter dyntick-idle mode.
 	 * Also reset state to avoid prejudicing later attempts.
 	 */
 	if (!rcu_cpu_has_callbacks(cpu)) {
-		per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
-		per_cpu(rcu_dyntick_drain, cpu) = 0;
+		rdtp->dyntick_holdoff = jiffies - 1;
+		rdtp->dyntick_drain = 0;
 		trace_rcu_prep_idle("No callbacks");
 		return;
 	}
@@ -2152,36 +2161,37 @@ static void rcu_prepare_for_idle(int cpu)
 	 * If in holdoff mode, just return.  We will presumably have
 	 * refrained from disabling the scheduling-clock tick.
 	 */
-	if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
+	if (rdtp->dyntick_holdoff == jiffies) {
 		trace_rcu_prep_idle("In holdoff");
 		return;
 	}
 
-	/* Check and update the rcu_dyntick_drain sequencing. */
-	if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
+	/* Check and update the ->dyntick_drain sequencing. */
+	if (rdtp->dyntick_drain <= 0) {
 		/* First time through, initialize the counter. */
-		per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
-	} else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
+		rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
+	} else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
 		   !rcu_pending(cpu) &&
 		   !local_softirq_pending()) {
 		/* Can we go dyntick-idle despite still having callbacks? */
-		trace_rcu_prep_idle("Dyntick with callbacks");
-		per_cpu(rcu_dyntick_drain, cpu) = 0;
-		per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
-		if (rcu_cpu_has_nonlazy_callbacks(cpu))
-			per_cpu(rcu_idle_gp_timer_expires, cpu) =
+		rdtp->dyntick_drain = 0;
+		rdtp->dyntick_holdoff = jiffies;
+		if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
+			trace_rcu_prep_idle("Dyntick with callbacks");
+			rdtp->idle_gp_timer_expires =
 					   jiffies + RCU_IDLE_GP_DELAY;
-		else
-			per_cpu(rcu_idle_gp_timer_expires, cpu) =
+		} else {
+			rdtp->idle_gp_timer_expires =
 					   jiffies + RCU_IDLE_LAZY_GP_DELAY;
-		tp = &per_cpu(rcu_idle_gp_timer, cpu);
-		mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu));
-		per_cpu(rcu_nonlazy_posted_snap, cpu) =
-			per_cpu(rcu_nonlazy_posted, cpu);
+			trace_rcu_prep_idle("Dyntick with lazy callbacks");
+		}
+		tp = &rdtp->idle_gp_timer;
+		mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
+		rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
 		return; /* Nothing more to do immediately. */
-	} else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
+	} else if (--(rdtp->dyntick_drain) <= 0) {
 		/* We have hit the limit, so time to give up. */
-		per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
+		rdtp->dyntick_holdoff = jiffies;
 		trace_rcu_prep_idle("Begin holdoff");
 		invoke_rcu_core();  /* Force the CPU out of dyntick-idle. */
 		return;
@@ -2227,7 +2237,7 @@ static void rcu_prepare_for_idle(int cpu)
  */
 static void rcu_idle_count_callbacks_posted(void)
 {
-	__this_cpu_add(rcu_nonlazy_posted, 1);
+	__this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
 }
 
 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
@@ -2238,11 +2248,12 @@ static void rcu_idle_count_callbacks_posted(void)
 
 static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
 {
-	struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu);
+	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+	struct timer_list *tltp = &rdtp->idle_gp_timer;
 
 	sprintf(cp, "drain=%d %c timer=%lu",
-		per_cpu(rcu_dyntick_drain, cpu),
-		per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
+		rdtp->dyntick_drain,
+		rdtp->dyntick_holdoff == jiffies ? 'H' : '.',
 		timer_pending(tltp) ? tltp->expires - jiffies : -1);
 }
 
diff --git a/kernel/sys.c b/kernel/sys.c
index f0ec44dcd41..e0c8ffc50d7 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2127,9 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 				else
 					return -EINVAL;
 				break;
-		case PR_GET_TID_ADDRESS:
-			error = prctl_get_tid_address(me, (int __user **)arg2);
-			break;
 			default:
 				return -EINVAL;
 			}
@@ -2147,6 +2144,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		case PR_SET_MM:
 			error = prctl_set_mm(arg2, arg3, arg4, arg5);
 			break;
+		case PR_GET_TID_ADDRESS:
+			error = prctl_get_tid_address(me, (int __user **)arg2);
+			break;
 		case PR_SET_CHILD_SUBREAPER:
 			me->signal->is_child_subreaper = !!arg2;
 			error = 0;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index da70c6db496..86999783392 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
 {
 	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
+	unsigned long rcu_delta_jiffies;
 	ktime_t last_update, expires, now;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
 	u64 time_delta;
@@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
 		time_delta = timekeeping_max_deferment();
 	} while (read_seqretry(&xtime_lock, seq));
 
-	if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
+	if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) ||
 	    arch_needs_cpu(cpu)) {
 		next_jiffies = last_jiffies + 1;
 		delta_jiffies = 1;
@@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
 		/* Get the next timer wheel timer */
 		next_jiffies = get_next_timer_interrupt(last_jiffies);
 		delta_jiffies = next_jiffies - last_jiffies;
+		if (rcu_delta_jiffies < delta_jiffies) {
+			next_jiffies = last_jiffies + rcu_delta_jiffies;
+			delta_jiffies = rcu_delta_jiffies;
+		}
 	}
 	/*
 	 * Do not stop the tick, if we are only one off
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 68032c6177d..49249c28690 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on);
 void tracing_off(void)
 {
 	if (global_trace.buffer)
-		ring_buffer_record_on(global_trace.buffer);
+		ring_buffer_record_off(global_trace.buffer);
 	/*
 	 * This flag is only looked at when buffers haven't been
 	 * allocated yet. We don't really care about the race
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index e5e1d85b8c7..4b1dfba70f7 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -372,6 +372,13 @@ static int watchdog(void *unused)
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 
+/*
+ * People like the simple clean cpu node info on boot.
+ * Reduce the watchdog noise by only printing messages
+ * that are different from what cpu0 displayed.
+ */
+static unsigned long cpu0_err;
+
 static int watchdog_nmi_enable(int cpu)
 {
 	struct perf_event_attr *wd_attr;
@@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu)
 
 	/* Try to register using hardware perf events */
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
+
+	/* save cpu0 error for future comparision */
+	if (cpu == 0 && IS_ERR(event))
+		cpu0_err = PTR_ERR(event);
+
 	if (!IS_ERR(event)) {
-		pr_info("enabled, takes one hw-pmu counter.\n");
+		/* only print for cpu0 or different than cpu0 */
+		if (cpu == 0 || cpu0_err)
+			pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
 		goto out_save;
 	}
 
+	/* skip displaying the same error again */
+	if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
+		return PTR_ERR(event);
+
 	/* vary the KERN level based on the returned errno */
 	if (PTR_ERR(event) == -EOPNOTSUPP)