Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/cgroup.c            |  23 |
| -rw-r--r-- | kernel/fork.c              |  11 |
| -rw-r--r-- | kernel/hrtimer.c           |  53 |
| -rw-r--r-- | kernel/printk.c            | 202 |
| -rw-r--r-- | kernel/rcutree.c           |   1 |
| -rw-r--r-- | kernel/rcutree.h           |   1 |
| -rw-r--r-- | kernel/rcutree_plugin.h    |  14 |
| -rw-r--r-- | kernel/relay.c             |   5 |
| -rw-r--r-- | kernel/sched/core.c        | 276 |
| -rw-r--r-- | kernel/sched/idle_task.c   |   1 |
| -rw-r--r-- | kernel/sched/sched.h       |   2 |
| -rw-r--r-- | kernel/sys.c               |  16 |
| -rw-r--r-- | kernel/time/tick-sched.c   |   2 |
| -rw-r--r-- | kernel/time/timekeeping.c  |  63 |
| -rw-r--r-- | kernel/trace/ring_buffer.c |   6 |
| -rw-r--r-- | kernel/trace/trace.c       |   6 |
16 files changed, 478 insertions, 204 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2097684cf19..b303dfc7dce 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -901,13 +901,10 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)  		mutex_unlock(&cgroup_mutex);  		/* -		 * We want to drop the active superblock reference from the -		 * cgroup creation after all the dentry refs are gone - -		 * kill_sb gets mighty unhappy otherwise.  Mark -		 * dentry->d_fsdata with cgroup_diput() to tell -		 * cgroup_d_release() to call deactivate_super(). +		 * Drop the active superblock reference that we took when we +		 * created the cgroup  		 */ -		dentry->d_fsdata = cgroup_diput; +		deactivate_super(cgrp->root->sb);  		/*  		 * if we're getting rid of the cgroup, refcount should ensure @@ -933,13 +930,6 @@ static int cgroup_delete(const struct dentry *d)  	return 1;  } -static void cgroup_d_release(struct dentry *dentry) -{ -	/* did cgroup_diput() tell me to deactivate super? */ -	if (dentry->d_fsdata == cgroup_diput) -		deactivate_super(dentry->d_sb); -} -  static void remove_dir(struct dentry *d)  {  	struct dentry *parent = dget(d->d_parent); @@ -1547,7 +1537,6 @@ static int cgroup_get_rootdir(struct super_block *sb)  	static const struct dentry_operations cgroup_dops = {  		.d_iput = cgroup_diput,  		.d_delete = cgroup_delete, -		.d_release = cgroup_d_release,  	};  	struct inode *inode = @@ -3894,8 +3883,12 @@ static void css_dput_fn(struct work_struct *work)  {  	struct cgroup_subsys_state *css =  		container_of(work, struct cgroup_subsys_state, dput_work); +	struct dentry *dentry = css->cgroup->dentry; +	struct super_block *sb = dentry->d_sb; -	dput(css->cgroup->dentry); +	atomic_inc(&sb->s_active); +	dput(dentry); +	deactivate_super(sb);  }  static void init_cgroup_css(struct cgroup_subsys_state *css, diff --git a/kernel/fork.c b/kernel/fork.c index ab5211b9e62..f00e319d837 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -304,12 +304,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)  	}  	err = arch_dup_task_struct(tsk, orig); -	if (err) -		goto out; +	/* +	 * We defer looking at err, because we will need this setup +	 * for the clean up path to work correctly. 
+	 */  	tsk->stack = ti; -  	setup_thread_stack(tsk, orig); + +	if (err) +		goto out; +  	clear_user_return_notifier(tsk);  	clear_tsk_need_resched(tsk);  	stackend = end_of_stack(tsk); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ae34bf51682..6db7a5ed52b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,  	return 0;  } +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ +	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; +	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + +	return ktime_get_update_offsets(offs_real, offs_boot); +} +  /*   * Retrigger next event is called after clock was set   * @@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,  static void retrigger_next_event(void *arg)  {  	struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); -	struct timespec realtime_offset, xtim, wtm, sleep;  	if (!hrtimer_hres_active())  		return; -	/* Optimized out for !HIGH_RES */ -	get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); -	set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); - -	/* Adjust CLOCK_REALTIME offset */  	raw_spin_lock(&base->lock); -	base->clock_base[HRTIMER_BASE_REALTIME].offset = -		timespec_to_ktime(realtime_offset); -	base->clock_base[HRTIMER_BASE_BOOTTIME].offset = -		timespec_to_ktime(sleep); - +	hrtimer_update_base(base);  	hrtimer_force_reprogram(base, 0);  	raw_spin_unlock(&base->lock);  } @@ -710,13 +708,25 @@ static int hrtimer_switch_to_hres(void)  		base->clock_base[i].resolution = KTIME_HIGH_RES;  	tick_setup_sched_timer(); -  	/* "Retrigger" the interrupt to get things going */  	retrigger_next_event(NULL);  	local_irq_restore(flags);  	return 1;  } +/* + * Called from timekeeping code to reprogramm the hrtimer interrupt + * device. If called from the timer interrupt context we defer it to + * softirq context. + */ +void clock_was_set_delayed(void) +{ +	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + +	cpu_base->clock_was_set = 1; +	__raise_softirq_irqoff(HRTIMER_SOFTIRQ); +} +  #else  static inline int hrtimer_hres_active(void) { return 0; } @@ -1250,11 +1260,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)  	cpu_base->nr_events++;  	dev->next_event.tv64 = KTIME_MAX; -	entry_time = now = ktime_get(); +	raw_spin_lock(&cpu_base->lock); +	entry_time = now = hrtimer_update_base(cpu_base);  retry:  	expires_next.tv64 = KTIME_MAX; - -	raw_spin_lock(&cpu_base->lock);  	/*  	 * We set expires_next to KTIME_MAX here with cpu_base->lock  	 * held to prevent that a timer is enqueued in our queue via @@ -1330,8 +1339,12 @@ retry:  	 * We need to prevent that we loop forever in the hrtimer  	 * interrupt routine. We give it 3 attempts to avoid  	 * overreacting on some spurious event. +	 * +	 * Acquire base lock for updating the offsets and retrieving +	 * the current time.  	 
*/ -	now = ktime_get(); +	raw_spin_lock(&cpu_base->lock); +	now = hrtimer_update_base(cpu_base);  	cpu_base->nr_retries++;  	if (++retries < 3)  		goto retry; @@ -1343,6 +1356,7 @@ retry:  	 */  	cpu_base->nr_hangs++;  	cpu_base->hang_detected = 1; +	raw_spin_unlock(&cpu_base->lock);  	delta = ktime_sub(now, entry_time);  	if (delta.tv64 > cpu_base->max_hang_time.tv64)  		cpu_base->max_hang_time = delta; @@ -1395,6 +1409,13 @@ void hrtimer_peek_ahead_timers(void)  static void run_hrtimer_softirq(struct softirq_action *h)  { +	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + +	if (cpu_base->clock_was_set) { +		cpu_base->clock_was_set = 0; +		clock_was_set(); +	} +  	hrtimer_peek_ahead_timers();  } diff --git a/kernel/printk.c b/kernel/printk.c index dba18211685..177fa49357a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -194,8 +194,10 @@ static int console_may_schedule;   */  enum log_flags { -	LOG_DEFAULT = 0, -	LOG_NOCONS = 1,		/* already flushed, do not print to console */ +	LOG_NOCONS	= 1,	/* already flushed, do not print to console */ +	LOG_NEWLINE	= 2,	/* text ended with a newline */ +	LOG_PREFIX	= 4,	/* text started with a prefix */ +	LOG_CONT	= 8,	/* text is a fragment of a continuation line */  };  struct log { @@ -217,6 +219,8 @@ static DEFINE_RAW_SPINLOCK(logbuf_lock);  /* the next printk record to read by syslog(READ) or /proc/kmsg */  static u64 syslog_seq;  static u32 syslog_idx; +static enum log_flags syslog_prev; +static size_t syslog_partial;  /* index and sequence number of the first record stored in the buffer */  static u64 log_first_seq; @@ -430,20 +434,20 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,  	ret = mutex_lock_interruptible(&user->lock);  	if (ret)  		return ret; -	raw_spin_lock(&logbuf_lock); +	raw_spin_lock_irq(&logbuf_lock);  	while (user->seq == log_next_seq) {  		if (file->f_flags & O_NONBLOCK) {  			ret = -EAGAIN; -			raw_spin_unlock(&logbuf_lock); +			raw_spin_unlock_irq(&logbuf_lock);  			goto out;  		} -		raw_spin_unlock(&logbuf_lock); +		raw_spin_unlock_irq(&logbuf_lock);  		ret = wait_event_interruptible(log_wait,  					       user->seq != log_next_seq);  		if (ret)  			goto out; -		raw_spin_lock(&logbuf_lock); +		raw_spin_lock_irq(&logbuf_lock);  	}  	if (user->seq < log_first_seq) { @@ -451,7 +455,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,  		user->idx = log_first_idx;  		user->seq = log_first_seq;  		ret = -EPIPE; -		raw_spin_unlock(&logbuf_lock); +		raw_spin_unlock_irq(&logbuf_lock);  		goto out;  	} @@ -465,7 +469,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,  	for (i = 0; i < msg->text_len; i++) {  		unsigned char c = log_text(msg)[i]; -		if (c < ' ' || c >= 128) +		if (c < ' ' || c >= 127 || c == '\\')  			len += sprintf(user->buf + len, "\\x%02x", c);  		else  			user->buf[len++] = c; @@ -489,7 +493,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,  				continue;  			} -			if (c < ' ' || c >= 128) { +			if (c < ' ' || c >= 127 || c == '\\') {  				len += sprintf(user->buf + len, "\\x%02x", c);  				continue;  			} @@ -501,7 +505,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,  	user->idx = log_next(user->idx);  	user->seq++; -	raw_spin_unlock(&logbuf_lock); +	raw_spin_unlock_irq(&logbuf_lock);  	if (len > count) {  		ret = -EINVAL; @@ -528,7 +532,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)  	if (offset)  		return -ESPIPE; -	raw_spin_lock(&logbuf_lock); +	
raw_spin_lock_irq(&logbuf_lock);  	switch (whence) {  	case SEEK_SET:  		/* the first record */ @@ -552,7 +556,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)  	default:  		ret = -EINVAL;  	} -	raw_spin_unlock(&logbuf_lock); +	raw_spin_unlock_irq(&logbuf_lock);  	return ret;  } @@ -566,14 +570,14 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)  	poll_wait(file, &log_wait, wait); -	raw_spin_lock(&logbuf_lock); +	raw_spin_lock_irq(&logbuf_lock);  	if (user->seq < log_next_seq) {  		/* return error when data has vanished underneath us */  		if (user->seq < log_first_seq)  			ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;  		ret = POLLIN|POLLRDNORM;  	} -	raw_spin_unlock(&logbuf_lock); +	raw_spin_unlock_irq(&logbuf_lock);  	return ret;  } @@ -597,10 +601,10 @@ static int devkmsg_open(struct inode *inode, struct file *file)  	mutex_init(&user->lock); -	raw_spin_lock(&logbuf_lock); +	raw_spin_lock_irq(&logbuf_lock);  	user->idx = log_first_idx;  	user->seq = log_first_seq; -	raw_spin_unlock(&logbuf_lock); +	raw_spin_unlock_irq(&logbuf_lock);  	file->private_data = user;  	return 0; @@ -818,15 +822,18 @@ static size_t print_time(u64 ts, char *buf)  static size_t print_prefix(const struct log *msg, bool syslog, char *buf)  {  	size_t len = 0; +	unsigned int prefix = (msg->facility << 3) | msg->level;  	if (syslog) {  		if (buf) { -			len += sprintf(buf, "<%u>", msg->level); +			len += sprintf(buf, "<%u>", prefix);  		} else {  			len += 3; -			if (msg->level > 9) -				len++; -			if (msg->level > 99) +			if (prefix > 999) +				len += 3; +			else if (prefix > 99) +				len += 2; +			else if (prefix > 9)  				len++;  		}  	} @@ -835,13 +842,26 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf)  	return len;  } -static size_t msg_print_text(const struct log *msg, bool syslog, -			     char *buf, size_t size) +static size_t msg_print_text(const struct log *msg, enum log_flags prev, +			     bool syslog, char *buf, size_t size)  {  	const char *text = log_text(msg);  	size_t text_size = msg->text_len; +	bool prefix = true; +	bool newline = true;  	size_t len = 0; +	if ((prev & LOG_CONT) && !(msg->flags & LOG_PREFIX)) +		prefix = false; + +	if (msg->flags & LOG_CONT) { +		if ((prev & LOG_CONT) && !(prev & LOG_NEWLINE)) +			prefix = false; + +		if (!(msg->flags & LOG_NEWLINE)) +			newline = false; +	} +  	do {  		const char *next = memchr(text, '\n', text_size);  		size_t text_len; @@ -859,16 +879,22 @@ static size_t msg_print_text(const struct log *msg, bool syslog,  			    text_len + 1>= size - len)  				break; -			len += print_prefix(msg, syslog, buf + len); +			if (prefix) +				len += print_prefix(msg, syslog, buf + len);  			memcpy(buf + len, text, text_len);  			len += text_len; -			buf[len++] = '\n'; +			if (next || newline) +				buf[len++] = '\n';  		} else {  			/* SYSLOG_ACTION_* buffer size only calculation */ -			len += print_prefix(msg, syslog, NULL); -			len += text_len + 1; +			if (prefix) +				len += print_prefix(msg, syslog, NULL); +			len += text_len; +			if (next || newline) +				len++;  		} +		prefix = true;  		text = next;  	} while (text); @@ -887,22 +913,35 @@ static int syslog_print(char __user *buf, int size)  	while (size > 0) {  		size_t n; +		size_t skip;  		raw_spin_lock_irq(&logbuf_lock);  		if (syslog_seq < log_first_seq) {  			/* messages are gone, move to first one */  			syslog_seq = log_first_seq;  			syslog_idx = log_first_idx; +			syslog_prev = 0; +			syslog_partial = 0;  		}  		if (syslog_seq 
== log_next_seq) {  			raw_spin_unlock_irq(&logbuf_lock);  			break;  		} + +		skip = syslog_partial;  		msg = log_from_idx(syslog_idx); -		n = msg_print_text(msg, true, text, LOG_LINE_MAX); -		if (n <= size) { +		n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); +		if (n - syslog_partial <= size) { +			/* message fits into buffer, move forward */  			syslog_idx = log_next(syslog_idx);  			syslog_seq++; +			syslog_prev = msg->flags; +			n -= syslog_partial; +			syslog_partial = 0; +		} else if (!len){ +			/* partial read(), remember position */ +			n = size; +			syslog_partial += n;  		} else  			n = 0;  		raw_spin_unlock_irq(&logbuf_lock); @@ -910,17 +949,15 @@ static int syslog_print(char __user *buf, int size)  		if (!n)  			break; -		len += n; -		size -= n; -		buf += n; -		n = copy_to_user(buf - n, text, n); - -		if (n) { -			len -= n; +		if (copy_to_user(buf, text + skip, n)) {  			if (!len)  				len = -EFAULT;  			break;  		} + +		len += n; +		size -= n; +		buf += n;  	}  	kfree(text); @@ -941,6 +978,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)  		u64 next_seq;  		u64 seq;  		u32 idx; +		enum log_flags prev;  		if (clear_seq < log_first_seq) {  			/* messages are gone, move to first available one */ @@ -954,10 +992,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear)  		 */  		seq = clear_seq;  		idx = clear_idx; +		prev = 0;  		while (seq < log_next_seq) {  			struct log *msg = log_from_idx(idx); -			len += msg_print_text(msg, true, NULL, 0); +			len += msg_print_text(msg, prev, true, NULL, 0);  			idx = log_next(idx);  			seq++;  		} @@ -965,10 +1004,11 @@ static int syslog_print_all(char __user *buf, int size, bool clear)  		/* move first record forward until length fits into the buffer */  		seq = clear_seq;  		idx = clear_idx; +		prev = 0;  		while (len > size && seq < log_next_seq) {  			struct log *msg = log_from_idx(idx); -			len -= msg_print_text(msg, true, NULL, 0); +			len -= msg_print_text(msg, prev, true, NULL, 0);  			idx = log_next(idx);  			seq++;  		} @@ -977,17 +1017,19 @@ static int syslog_print_all(char __user *buf, int size, bool clear)  		next_seq = log_next_seq;  		len = 0; +		prev = 0;  		while (len >= 0 && seq < next_seq) {  			struct log *msg = log_from_idx(idx);  			int textlen; -			textlen = msg_print_text(msg, true, text, LOG_LINE_MAX); +			textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX);  			if (textlen < 0) {  				len = textlen;  				break;  			}  			idx = log_next(idx);  			seq++; +			prev = msg->flags;  			raw_spin_unlock_irq(&logbuf_lock);  			if (copy_to_user(buf + len, text, textlen)) @@ -1000,6 +1042,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)  				/* messages are gone, move to next one */  				seq = log_first_seq;  				idx = log_first_idx; +				prev = 0;  			}  		}  	} @@ -1018,7 +1061,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)  {  	bool clear = false;  	static int saved_console_loglevel = -1; -	static DEFINE_MUTEX(syslog_mutex);  	int error;  	error = check_syslog_permissions(type, from_file); @@ -1045,17 +1087,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)  			error = -EFAULT;  			goto out;  		} -		error = mutex_lock_interruptible(&syslog_mutex); -		if (error) -			goto out;  		error = wait_event_interruptible(log_wait,  						 syslog_seq != log_next_seq); -		if (error) { -			mutex_unlock(&syslog_mutex); +		if (error)  			goto out; -		}  		error = syslog_print(buf, len); -		
mutex_unlock(&syslog_mutex);  		break;  	/* Read/clear last kernel messages */  	case SYSLOG_ACTION_READ_CLEAR: @@ -1111,6 +1147,8 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)  			/* messages are gone, move to first one */  			syslog_seq = log_first_seq;  			syslog_idx = log_first_idx; +			syslog_prev = 0; +			syslog_partial = 0;  		}  		if (from_file) {  			/* @@ -1120,19 +1158,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)  			 */  			error = log_next_idx - syslog_idx;  		} else { -			u64 seq; -			u32 idx; +			u64 seq = syslog_seq; +			u32 idx = syslog_idx; +			enum log_flags prev = syslog_prev;  			error = 0; -			seq = syslog_seq; -			idx = syslog_idx;  			while (seq < log_next_seq) {  				struct log *msg = log_from_idx(idx); -				error += msg_print_text(msg, true, NULL, 0); +				error += msg_print_text(msg, prev, true, NULL, 0);  				idx = log_next(idx);  				seq++; +				prev = msg->flags;  			} +			error -= syslog_partial;  		}  		raw_spin_unlock_irq(&logbuf_lock);  		break; @@ -1400,10 +1439,9 @@ asmlinkage int vprintk_emit(int facility, int level,  	static char textbuf[LOG_LINE_MAX];  	char *text = textbuf;  	size_t text_len; +	enum log_flags lflags = 0;  	unsigned long flags;  	int this_cpu; -	bool newline = false; -	bool prefix = false;  	int printed_len = 0;  	boot_delay_msec(); @@ -1442,7 +1480,7 @@ asmlinkage int vprintk_emit(int facility, int level,  		recursion_bug = 0;  		printed_len += strlen(recursion_msg);  		/* emit KERN_CRIT message */ -		log_store(0, 2, LOG_DEFAULT, 0, +		log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,  			  NULL, 0, recursion_msg, printed_len);  	} @@ -1455,7 +1493,7 @@ asmlinkage int vprintk_emit(int facility, int level,  	/* mark and strip a trailing newline */  	if (text_len && text[text_len-1] == '\n') {  		text_len--; -		newline = true; +		lflags |= LOG_NEWLINE;  	}  	/* strip syslog prefix and extract log level or control flags */ @@ -1465,7 +1503,7 @@ asmlinkage int vprintk_emit(int facility, int level,  			if (level == -1)  				level = text[1] - '0';  		case 'd':	/* KERN_DEFAULT */ -			prefix = true; +			lflags |= LOG_PREFIX;  		case 'c':	/* KERN_CONT */  			text += 3;  			text_len -= 3; @@ -1475,22 +1513,20 @@ asmlinkage int vprintk_emit(int facility, int level,  	if (level == -1)  		level = default_message_loglevel; -	if (dict) { -		prefix = true; -		newline = true; -	} +	if (dict) +		lflags |= LOG_PREFIX|LOG_NEWLINE; -	if (!newline) { +	if (!(lflags & LOG_NEWLINE)) {  		/*  		 * Flush the conflicting buffer. An earlier newline was missing,  		 * or another task also prints continuation lines.  		 */ -		if (cont.len && (prefix || cont.owner != current)) +		if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))  			cont_flush();  		/* buffer line if possible, otherwise store it right away */  		if (!cont_add(facility, level, text, text_len)) -			log_store(facility, level, LOG_DEFAULT, 0, +			log_store(facility, level, lflags | LOG_CONT, 0,  				  dict, dictlen, text, text_len);  	} else {  		bool stored = false; @@ -1502,13 +1538,13 @@ asmlinkage int vprintk_emit(int facility, int level,  		 * flush it out and store this line separately.  		 
*/  		if (cont.len && cont.owner == current) { -			if (!prefix) +			if (!(lflags & LOG_PREFIX))  				stored = cont_add(facility, level, text, text_len);  			cont_flush();  		}  		if (!stored) -			log_store(facility, level, LOG_DEFAULT, 0, +			log_store(facility, level, lflags, 0,  				  dict, dictlen, text, text_len);  	}  	printed_len += text_len; @@ -1607,8 +1643,8 @@ static struct cont {  static struct log *log_from_idx(u32 idx) { return NULL; }  static u32 log_next(u32 idx) { return 0; }  static void call_console_drivers(int level, const char *text, size_t len) {} -static size_t msg_print_text(const struct log *msg, bool syslog, -			     char *buf, size_t size) { return 0; } +static size_t msg_print_text(const struct log *msg, enum log_flags prev, +			     bool syslog, char *buf, size_t size) { return 0; }  static size_t cont_print_text(char *text, size_t size) { return 0; }  #endif /* CONFIG_PRINTK */ @@ -1884,6 +1920,7 @@ void wake_up_klogd(void)  /* the next printk record to write to the console */  static u64 console_seq;  static u32 console_idx; +static enum log_flags console_prev;  /**   * console_unlock - unlock the console system @@ -1944,6 +1981,7 @@ again:  			/* messages are gone, move to first one */  			console_seq = log_first_seq;  			console_idx = log_first_idx; +			console_prev = 0;  		}  skip:  		if (console_seq == log_next_seq) @@ -1957,14 +1995,21 @@ skip:  			 */  			console_idx = log_next(console_idx);  			console_seq++; +			/* +			 * We will get here again when we register a new +			 * CON_PRINTBUFFER console. Clear the flag so we +			 * will properly dump everything later. +			 */ +			msg->flags &= ~LOG_NOCONS;  			goto skip;  		}  		level = msg->level; -		len = msg_print_text(msg, false, text, sizeof(text)); - +		len = msg_print_text(msg, console_prev, false, +				     text, sizeof(text));  		console_idx = log_next(console_idx);  		console_seq++; +		console_prev = msg->flags;  		raw_spin_unlock(&logbuf_lock);  		stop_critical_timings();	/* don't trace print latency */ @@ -2227,6 +2272,7 @@ void register_console(struct console *newcon)  		raw_spin_lock_irqsave(&logbuf_lock, flags);  		console_seq = syslog_seq;  		console_idx = syslog_idx; +		console_prev = syslog_prev;  		raw_spin_unlock_irqrestore(&logbuf_lock, flags);  		/*  		 * We're about to replay the log buffer.  
Only do this to the @@ -2520,8 +2566,7 @@ bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,  	}  	msg = log_from_idx(dumper->cur_idx); -	l = msg_print_text(msg, syslog, -			      line, size); +	l = msg_print_text(msg, 0, syslog, line, size);  	dumper->cur_idx = log_next(dumper->cur_idx);  	dumper->cur_seq++; @@ -2561,6 +2606,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,  	u32 idx;  	u64 next_seq;  	u32 next_idx; +	enum log_flags prev;  	size_t l = 0;  	bool ret = false; @@ -2583,23 +2629,27 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,  	/* calculate length of entire buffer */  	seq = dumper->cur_seq;  	idx = dumper->cur_idx; +	prev = 0;  	while (seq < dumper->next_seq) {  		struct log *msg = log_from_idx(idx); -		l += msg_print_text(msg, true, NULL, 0); +		l += msg_print_text(msg, prev, true, NULL, 0);  		idx = log_next(idx);  		seq++; +		prev = msg->flags;  	}  	/* move first record forward until length fits into the buffer */  	seq = dumper->cur_seq;  	idx = dumper->cur_idx; +	prev = 0;  	while (l > size && seq < dumper->next_seq) {  		struct log *msg = log_from_idx(idx); -		l -= msg_print_text(msg, true, NULL, 0); +		l -= msg_print_text(msg, prev, true, NULL, 0);  		idx = log_next(idx);  		seq++; +		prev = msg->flags;  	}  	/* last message in next interation */ @@ -2607,14 +2657,14 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,  	next_idx = idx;  	l = 0; +	prev = 0;  	while (seq < dumper->next_seq) {  		struct log *msg = log_from_idx(idx); -		l += msg_print_text(msg, syslog, -				    buf + l, size - l); - +		l += msg_print_text(msg, prev, syslog, buf + l, size - l);  		idx = log_next(idx);  		seq++; +		prev = msg->flags;  	}  	dumper->next_seq = next_seq; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 38ecdda3f55..4b97bba7396 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -201,6 +201,7 @@ void rcu_note_context_switch(int cpu)  {  	trace_rcu_utilization("Start context switch");  	rcu_sched_qs(cpu); +	rcu_preempt_note_context_switch(cpu);  	trace_rcu_utilization("End context switch");  }  EXPORT_SYMBOL_GPL(rcu_note_context_switch); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index ea056495783..19b61ac1079 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -444,6 +444,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);  /* Forward declarations for rcutree_plugin.h */  static void rcu_bootup_announce(void);  long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu);  static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);  #ifdef CONFIG_HOTPLUG_CPU  static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5271a020887..3e4899459f3 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu)   *   * Caller must disable preemption.   */ -void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(int cpu)  {  	struct task_struct *t = current;  	unsigned long flags; @@ -164,7 +164,7 @@ void rcu_preempt_note_context_switch(void)  	    (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {  		/* Possibly blocking in an RCU read-side critical section. 
*/ -		rdp = __this_cpu_ptr(rcu_preempt_state.rda); +		rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);  		rnp = rdp->mynode;  		raw_spin_lock_irqsave(&rnp->lock, flags);  		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -228,7 +228,7 @@ void rcu_preempt_note_context_switch(void)  	 * means that we continue to block the current grace period.  	 */  	local_irq_save(flags); -	rcu_preempt_qs(smp_processor_id()); +	rcu_preempt_qs(cpu);  	local_irq_restore(flags);  } @@ -1002,6 +1002,14 @@ void rcu_force_quiescent_state(void)  EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);  /* + * Because preemptible RCU does not exist, we never have to check for + * CPUs being in quiescent states. + */ +static void rcu_preempt_note_context_switch(int cpu) +{ +} + +/*   * Because preemptible RCU does not exist, there are never any preempted   * RCU readers.   */ diff --git a/kernel/relay.c b/kernel/relay.c index ab56a1764d4..e8cd2027abb 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in,  	struct splice_pipe_desc spd = {  		.pages = pages,  		.nr_pages = 0, +		.nr_pages_max = PIPE_DEF_BUFFERS,  		.partial = partial,  		.flags = flags,  		.ops = &relay_pipe_buf_ops, @@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in,                  ret += padding;  out: -	splice_shrink_spd(pipe, &spd); -        return ret; +	splice_shrink_spd(&spd); +	return ret;  }  static ssize_t relay_file_splice_read(struct file *in, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d..468bdd44c1b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev,  #endif  	/* Here we just switch the register state and the stack. */ -	rcu_switch_from(prev);  	switch_to(prev, next, prev);  	barrier(); @@ -2161,11 +2160,73 @@ unsigned long this_cpu_load(void)  } +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + *   nr_active = 0; + *   for_each_possible_cpu(cpu) + *   	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + *  - for_each_possible_cpu() is prohibitively expensive on machines with + *    serious number of cpus, therefore we need to take a distributed approach + *    to calculating nr_active. + * + *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + *    So assuming nr_active := 0 when we start out -- true per definition, we + *    can simply take per-cpu deltas and fold those into a global accumulate + *    to obtain the same result. See calc_load_fold_active(). + * + *    Furthermore, in order to avoid synchronizing all per-cpu delta folding + *    across the machine, we assume 10 ticks is sufficient time for every + *    cpu to have completed this task. + * + *    This places an upper-bound on the IRQ-off latency of the machine. Then + *    again, being late doesn't loose the delta, just wrecks the sample. 
+ * + *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + *    this would add another cross-cpu cacheline miss and atomic operation + *    to the wakeup path. Instead we increment on whatever cpu the task ran + *    when it went into uninterruptible state and decrement on whatever cpu + *    did the wakeup. This means that only the sum of nr_uninterruptible over + *    all cpus yields the correct result. + * + *  This covers the NO_HZ=n code, for extra head-aches, see the comment below. + */ +  /* Variables and functions for calc_load */  static atomic_long_t calc_load_tasks;  static unsigned long calc_load_update;  unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads:	pointer to dest load array + * @offset:	offset to add + * @shift:	shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ +	loads[0] = (avenrun[0] + offset) << shift; +	loads[1] = (avenrun[1] + offset) << shift; +	loads[2] = (avenrun[2] + offset) << shift; +}  static long calc_load_fold_active(struct rq *this_rq)  { @@ -2182,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)  	return delta;  } +/* + * a1 = a0 * e + a * (1 - e) + */  static unsigned long  calc_load(unsigned long load, unsigned long exp, unsigned long active)  { @@ -2193,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)  #ifdef CONFIG_NO_HZ  /* - * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + *  - When we go NO_HZ idle during the window, we can negate our sample + *    contribution, causing under-accounting. + * + *    We avoid this by keeping two idle-delta counters and flipping them + *    when the window starts, thus separating old and new NO_HZ load. + * + *    The only trick is the slight shift in index flip for read vs write. + * + *        0s            5s            10s           15s + *          +10           +10           +10           +10 + *        |-|-----------|-|-----------|-|-----------|-| + *    r:0 0 1           1 0           0 1           1 0 + *    w:0 1 1           0 0           1 1           0 0 + * + *    This ensures we'll fold the old idle contribution in this window while + *    accumlating the new one. + * + *  - When we wake up from NO_HZ idle during the window, we push up our + *    contribution, since we effectively move our sample point to a known + *    busy state. + * + *    This is solved by pushing the window forward, and thus skipping the + *    sample, for this cpu (effectively using the idle-delta for this cpu which + *    was in effect at the time the window opened). This also solves the issue + *    of having to deal with a cpu having been in NOHZ idle for multiple + *    LOAD_FREQ intervals.   *   * When making the ILB scale, we should try to pull this in as well.   
*/ -static atomic_long_t calc_load_tasks_idle; +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; -void calc_load_account_idle(struct rq *this_rq) +static inline int calc_load_write_idx(void)  { +	int idx = calc_load_idx; + +	/* +	 * See calc_global_nohz(), if we observe the new index, we also +	 * need to observe the new update time. +	 */ +	smp_rmb(); + +	/* +	 * If the folding window started, make sure we start writing in the +	 * next idle-delta. +	 */ +	if (!time_before(jiffies, calc_load_update)) +		idx++; + +	return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ +	return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ +	struct rq *this_rq = this_rq();  	long delta; +	/* +	 * We're going into NOHZ mode, if there's any pending delta, fold it +	 * into the pending idle delta. +	 */  	delta = calc_load_fold_active(this_rq); -	if (delta) -		atomic_long_add(delta, &calc_load_tasks_idle); +	if (delta) { +		int idx = calc_load_write_idx(); +		atomic_long_add(delta, &calc_load_idle[idx]); +	}  } -static long calc_load_fold_idle(void) +void calc_load_exit_idle(void)  { -	long delta = 0; +	struct rq *this_rq = this_rq(); + +	/* +	 * If we're still before the sample window, we're done. +	 */ +	if (time_before(jiffies, this_rq->calc_load_update)) +		return;  	/* -	 * Its got a race, we don't care... +	 * We woke inside or after the sample window, this means we're already +	 * accounted through the nohz accounting, so skip the entire deal and +	 * sync up for the next window.  	 */ -	if (atomic_long_read(&calc_load_tasks_idle)) -		delta = atomic_long_xchg(&calc_load_tasks_idle, 0); +	this_rq->calc_load_update = calc_load_update; +	if (time_before(jiffies, this_rq->calc_load_update + 10)) +		this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ +	int idx = calc_load_read_idx(); +	long delta = 0; + +	if (atomic_long_read(&calc_load_idle[idx])) +		delta = atomic_long_xchg(&calc_load_idle[idx], 0);  	return delta;  } @@ -2302,66 +2454,39 @@ static void calc_global_nohz(void)  {  	long delta, active, n; -	/* -	 * If we crossed a calc_load_update boundary, make sure to fold -	 * any pending idle changes, the respective CPUs might have -	 * missed the tick driven calc_load_account_active() update -	 * due to NO_HZ. -	 */ -	delta = calc_load_fold_idle(); -	if (delta) -		atomic_long_add(delta, &calc_load_tasks); - -	/* -	 * It could be the one fold was all it took, we done! -	 */ -	if (time_before(jiffies, calc_load_update + 10)) -		return; - -	/* -	 * Catch-up, fold however many we are behind still -	 */ -	delta = jiffies - calc_load_update - 10; -	n = 1 + (delta / LOAD_FREQ); +	if (!time_before(jiffies, calc_load_update + 10)) { +		/* +		 * Catch-up, fold however many we are behind still +		 */ +		delta = jiffies - calc_load_update - 10; +		n = 1 + (delta / LOAD_FREQ); -	active = atomic_long_read(&calc_load_tasks); -	active = active > 0 ? active * FIXED_1 : 0; +		active = atomic_long_read(&calc_load_tasks); +		active = active > 0 ? 
active * FIXED_1 : 0; -	avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); -	avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); -	avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); +		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); +		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); +		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); -	calc_load_update += n * LOAD_FREQ; -} -#else -void calc_load_account_idle(struct rq *this_rq) -{ -} +		calc_load_update += n * LOAD_FREQ; +	} -static inline long calc_load_fold_idle(void) -{ -	return 0; +	/* +	 * Flip the idle index... +	 * +	 * Make sure we first write the new time then flip the index, so that +	 * calc_load_write_idx() will see the new time when it reads the new +	 * index, this avoids a double flip messing things up. +	 */ +	smp_wmb(); +	calc_load_idx++;  } +#else /* !CONFIG_NO_HZ */ -static void calc_global_nohz(void) -{ -} -#endif +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } -/** - * get_avenrun - get the load average array - * @loads:	pointer to dest load array - * @offset:	offset to add - * @shift:	shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ -	loads[0] = (avenrun[0] + offset) << shift; -	loads[1] = (avenrun[1] + offset) << shift; -	loads[2] = (avenrun[2] + offset) << shift; -} +#endif /* CONFIG_NO_HZ */  /*   * calc_load - update the avenrun load estimates 10 ticks after the @@ -2369,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)   */  void calc_global_load(unsigned long ticks)  { -	long active; +	long active, delta;  	if (time_before(jiffies, calc_load_update + 10))  		return; +	/* +	 * Fold the 'old' idle-delta to include all NO_HZ cpus. +	 */ +	delta = calc_load_fold_idle(); +	if (delta) +		atomic_long_add(delta, &calc_load_tasks); +  	active = atomic_long_read(&calc_load_tasks);  	active = active > 0 ? active * FIXED_1 : 0; @@ -2384,12 +2516,7 @@ void calc_global_load(unsigned long ticks)  	calc_load_update += LOAD_FREQ;  	/* -	 * Account one period with whatever state we found before -	 * folding in the nohz state and ageing the entire idle period. -	 * -	 * This avoids loosing a sample when we go idle between  -	 * calc_load_account_active() (10 ticks ago) and now and thus -	 * under-accounting. +	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.  	 
*/  	calc_global_nohz();  } @@ -2406,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)  		return;  	delta  = calc_load_fold_active(this_rq); -	delta += calc_load_fold_idle();  	if (delta)  		atomic_long_add(delta, &calc_load_tasks); @@ -2414,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)  }  /* + * End of global load-average stuff + */ + +/*   * The exact cpuload at various idx values, calculated at every tick would be   * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load   * diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b44d604b35d..b6baf370cae 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl  static struct task_struct *pick_next_task_idle(struct rq *rq)  {  	schedstat_inc(rq, sched_goidle); -	calc_load_account_idle(rq);  	return rq->idle;  } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d52cea7f33..55844f24435 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void)  	return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;  } -void calc_load_account_idle(struct rq *this_rq); -  #ifdef CONFIG_SCHED_HRTICK  /* diff --git a/kernel/sys.c b/kernel/sys.c index e0c8ffc50d7..2d39a84cd85 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask)  #ifdef CONFIG_CHECKPOINT_RESTORE  static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)  { -	struct vm_area_struct *vma;  	struct file *exe_file;  	struct dentry *dentry;  	int err; @@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)  	down_write(&mm->mmap_sem);  	/* -	 * Forbid mm->exe_file change if there are mapped other files. +	 * Forbid mm->exe_file change if old file still mapped.  	 */  	err = -EBUSY; -	for (vma = mm->mmap; vma; vma = vma->vm_next) { -		if (vma->vm_file && !path_equal(&vma->vm_file->f_path, -						&exe_file->f_path)) -			goto exit_unlock; +	if (mm->exe_file) { +		struct vm_area_struct *vma; + +		for (vma = mm->mmap; vma; vma = vma->vm_next) +			if (vma->vm_file && +			    path_equal(&vma->vm_file->f_path, +				       &mm->exe_file->f_path)) +				goto exit_unlock;  	}  	/* @@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)  	if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))  		goto exit_unlock; +	err = 0;  	set_mm_exe_file(mm, exe_file);  exit_unlock:  	up_write(&mm->mmap_sem); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 86999783392..4a08472c3ca 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -406,6 +406,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts)  		 */  		if (!ts->tick_stopped) {  			select_nohz_load_balancer(1); +			calc_load_enter_idle();  			ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);  			ts->tick_stopped = 1; @@ -597,6 +598,7 @@ void tick_nohz_idle_exit(void)  		account_idle_ticks(ticks);  #endif +	calc_load_exit_idle();  	touch_softlockup_watchdog();  	/*  	 * Cancel the scheduled timer and restore the tick diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6f46a00a1e8..269b1fe5f2a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -70,6 +70,12 @@ struct timekeeper {  	/* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 
*/  	struct timespec raw_time; +	/* Offset clock monotonic -> clock realtime */ +	ktime_t offs_real; + +	/* Offset clock monotonic -> clock boottime */ +	ktime_t offs_boot; +  	/* Seqlock for all timekeeper values */  	seqlock_t lock;  }; @@ -172,6 +178,14 @@ static inline s64 timekeeping_get_ns_raw(void)  	return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);  } +static void update_rt_offset(void) +{ +	struct timespec tmp, *wtm = &timekeeper.wall_to_monotonic; + +	set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); +	timekeeper.offs_real = timespec_to_ktime(tmp); +} +  /* must hold write on timekeeper.lock */  static void timekeeping_update(bool clearntp)  { @@ -179,6 +193,7 @@ static void timekeeping_update(bool clearntp)  		timekeeper.ntp_error = 0;  		ntp_clear();  	} +	update_rt_offset();  	update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic,  			 timekeeper.clock, timekeeper.mult);  } @@ -604,6 +619,7 @@ void __init timekeeping_init(void)  	}  	set_normalized_timespec(&timekeeper.wall_to_monotonic,  				-boot.tv_sec, -boot.tv_nsec); +	update_rt_offset();  	timekeeper.total_sleep_time.tv_sec = 0;  	timekeeper.total_sleep_time.tv_nsec = 0;  	write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -612,6 +628,12 @@ void __init timekeeping_init(void)  /* time in seconds when suspend began */  static struct timespec timekeeping_suspend_time; +static void update_sleep_time(struct timespec t) +{ +	timekeeper.total_sleep_time = t; +	timekeeper.offs_boot = timespec_to_ktime(t); +} +  /**   * __timekeeping_inject_sleeptime - Internal function to add sleep interval   * @delta: pointer to a timespec delta value @@ -630,8 +652,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)  	timekeeper.xtime = timespec_add(timekeeper.xtime, *delta);  	timekeeper.wall_to_monotonic =  			timespec_sub(timekeeper.wall_to_monotonic, *delta); -	timekeeper.total_sleep_time = timespec_add( -					timekeeper.total_sleep_time, *delta); +	update_sleep_time(timespec_add(timekeeper.total_sleep_time, *delta));  } @@ -963,6 +984,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)  		leap = second_overflow(timekeeper.xtime.tv_sec);  		timekeeper.xtime.tv_sec += leap;  		timekeeper.wall_to_monotonic.tv_sec -= leap; +		if (leap) +			clock_was_set_delayed();  	}  	/* Accumulate raw time */ @@ -1079,6 +1102,8 @@ static void update_wall_time(void)  		leap = second_overflow(timekeeper.xtime.tv_sec);  		timekeeper.xtime.tv_sec += leap;  		timekeeper.wall_to_monotonic.tv_sec -= leap; +		if (leap) +			clock_was_set_delayed();  	}  	timekeeping_update(false); @@ -1246,6 +1271,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,  	} while (read_seqretry(&timekeeper.lock, seq));  } +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * ktime_get_update_offsets - hrtimer helper + * @offs_real:	pointer to storage for monotonic -> realtime offset + * @offs_boot:	pointer to storage for monotonic -> boottime offset + * + * Returns current monotonic time and updates the offsets + * Called from hrtimer_interupt() or retrigger_next_event() + */ +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +{ +	ktime_t now; +	unsigned int seq; +	u64 secs, nsecs; + +	do { +		seq = read_seqbegin(&timekeeper.lock); + +		secs = timekeeper.xtime.tv_sec; +		nsecs = timekeeper.xtime.tv_nsec; +		nsecs += timekeeping_get_ns(); +		/* If arch requires, add in gettimeoffset() */ +		nsecs += arch_gettimeoffset(); + +		*offs_real = timekeeper.offs_real; +		*offs_boot 
= timekeeper.offs_boot; +	} while (read_seqretry(&timekeeper.lock, seq)); + +	now = ktime_add_ns(ktime_set(secs, 0), nsecs); +	now = ktime_sub(now, *offs_real); +	return now; +} +#endif +  /**   * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format   */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d0f6a8a0e5..f765465bffe 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1075,6 +1075,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)  	rb_init_page(bpage->page);  	INIT_LIST_HEAD(&cpu_buffer->reader_page->list); +	INIT_LIST_HEAD(&cpu_buffer->new_pages);  	ret = rb_allocate_pages(cpu_buffer, nr_pages);  	if (ret < 0) @@ -1346,10 +1347,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)  			 * If something was added to this page, it was full  			 * since it is not the tail page. So we deduct the  			 * bytes consumed in ring buffer from here. -			 * No need to update overruns, since this page is -			 * deleted from ring buffer and its entries are -			 * already accounted for. +			 * Increment overrun to account for the lost events.  			 */ +			local_add(page_entries, &cpu_buffer->overrun);  			local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);  		} diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 49249c28690..a7fa0702be1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3609,6 +3609,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  		.pages		= pages_def,  		.partial	= partial_def,  		.nr_pages	= 0, /* This gets updated below. */ +		.nr_pages_max	= PIPE_DEF_BUFFERS,  		.flags		= flags,  		.ops		= &tracing_pipe_buf_ops,  		.spd_release	= tracing_spd_release_pipe, @@ -3680,7 +3681,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,  	ret = splice_to_pipe(pipe, &spd);  out: -	splice_shrink_spd(pipe, &spd); +	splice_shrink_spd(&spd);  	return ret;  out_err: @@ -4231,6 +4232,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  	struct splice_pipe_desc spd = {  		.pages		= pages_def,  		.partial	= partial_def, +		.nr_pages_max	= PIPE_DEF_BUFFERS,  		.flags		= flags,  		.ops		= &buffer_pipe_buf_ops,  		.spd_release	= buffer_spd_release, @@ -4318,7 +4320,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,  	}  	ret = splice_to_pipe(pipe, &spd); -	splice_shrink_spd(pipe, &spd); +	splice_shrink_spd(&spd);  out:  	return ret;  }  |
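
The large comment block added to kernel/sched/core.c documents the global load average as a fixed-point exponential moving average, a1 = a0 * e + a * (1 - e), sampled once per LOAD_FREQ window. The following is a minimal user-space sketch of that arithmetic only, not the kernel code itself: the FSHIFT/FIXED_1/EXP_* constants are taken from include/linux/sched.h, calc_load() mirrors the kernel helper referenced by the patch, and the loop parameters are made up for illustration.

#include <stdio.h>

#define FSHIFT   11                     /* nr of bits of precision */
#define FIXED_1  (1 << FSHIFT)          /* 1.0 as fixed-point */
#define EXP_1    1884                   /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5    2014                   /* 1/exp(5sec/5min) */
#define EXP_15   2037                   /* 1/exp(5sec/15min) */

/* a1 = a0 * e + a * (1 - e), everything in FSHIFT fixed-point */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	int i;

	/* pretend 3 tasks stay runnable for 60 LOAD_FREQ windows (~5 min) */
	for (i = 0; i < 60; i++) {
		unsigned long active = 3 * FIXED_1;

		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
	}

	/* print like /proc/loadavg: integer part plus two decimals */
	for (i = 0; i < 3; i++)
		printf("%lu.%02lu ", avenrun[i] >> FSHIFT,
		       ((avenrun[i] & (FIXED_1 - 1)) * 100) >> FSHIFT);
	printf("\n");
	return 0;
}

Running it shows the 1-minute average converging toward 3 much faster than the 5- and 15-minute averages, which is exactly the decay behaviour the new calc_global_nohz() catch-up path has to preserve when a CPU sleeps through several LOAD_FREQ windows.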
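
A smaller, user-visible change above is in print_prefix(): the syslog header now carries the full PRI value, (facility << 3) | level, instead of just the level, and the length estimate allows for up to four digits. A standalone sketch of that encoding (not the kernel function; the facility/level numbers follow the usual syslog convention):

#include <stdio.h>

/* syslog PRI: facility in the upper bits, severity in the low 3 bits */
static unsigned int syslog_prefix(unsigned int facility, unsigned int level)
{
	return (facility << 3) | level;
}

int main(void)
{
	char buf[16];
	int len;

	/* kernel message: facility 0 (kern), level 6 (info) -> "<6>" */
	len = sprintf(buf, "<%u>", syslog_prefix(0, 6));
	printf("%s (len %d)\n", buf, len);

	/* userspace writer via /dev/kmsg: facility 3 (daemon), level 4 -> "<28>" */
	len = sprintf(buf, "<%u>", syslog_prefix(3, 4));
	printf("%s (len %d)\n", buf, len);

	return 0;
}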