diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/Makefile | 77 | ||||
| -rw-r--r-- | kernel/acct.c | 6 | ||||
| -rw-r--r-- | kernel/audit.c | 2 | ||||
| -rw-r--r-- | kernel/audit.h | 7 | ||||
| -rw-r--r-- | kernel/audit_watch.c | 3 | ||||
| -rw-r--r-- | kernel/auditfilter.c | 65 | ||||
| -rw-r--r-- | kernel/auditsc.c | 217 | ||||
| -rw-r--r-- | kernel/debug/debug_core.c | 18 | ||||
| -rw-r--r-- | kernel/debug/kdb/kdb_bt.c | 2 | ||||
| -rw-r--r-- | kernel/debug/kdb/kdb_io.c | 33 | ||||
| -rw-r--r-- | kernel/debug/kdb/kdb_main.c | 2 | ||||
| -rw-r--r-- | kernel/events/core.c | 21 | ||||
| -rw-r--r-- | kernel/irq/irqdomain.c | 33 | ||||
| -rw-r--r-- | kernel/kmod.c | 7 | ||||
| -rw-r--r-- | kernel/kthread.c | 1 | ||||
| -rw-r--r-- | kernel/modsign_pubkey.c | 113 | ||||
| -rw-r--r-- | kernel/module-internal.h | 15 | ||||
| -rw-r--r-- | kernel/module.c | 157 | ||||
| -rw-r--r-- | kernel/module_signing.c | 243 | ||||
| -rw-r--r-- | kernel/rcutree.c | 21 | ||||
| -rw-r--r-- | kernel/rcutree.h | 6 | ||||
| -rw-r--r-- | kernel/sched/core.c | 71 | ||||
| -rw-r--r-- | kernel/time.c | 2 | ||||
| -rw-r--r-- | kernel/time/Kconfig | 4 | ||||
| -rw-r--r-- | kernel/time/alarmtimer.c | 118 | ||||
| -rw-r--r-- | kernel/time/jiffies.c | 32 | ||||
| -rw-r--r-- | kernel/time/tick-sched.c | 2 | ||||
| -rw-r--r-- | kernel/time/timekeeping.c | 117 | ||||
| -rw-r--r-- | kernel/timer.c | 10 | 
29 files changed, 1081 insertions, 324 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 5404911eaee..0dfeca4324e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o  obj-$(CONFIG_PROVE_LOCKING) += spinlock.o  obj-$(CONFIG_UID16) += uid16.o  obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o  obj-$(CONFIG_KALLSYMS) += kallsyms.o  obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o  obj-$(CONFIG_KEXEC) += kexec.o @@ -130,3 +131,79 @@ quiet_cmd_timeconst  = TIMEC   $@  targets += timeconst.h  $(obj)/timeconst.h: $(src)/timeconst.pl FORCE  	$(call if_changed,timeconst) + +ifeq ($(CONFIG_MODULE_SIG),y) +# +# Pull the signing certificate and any extra certificates into the kernel +# +extra_certificates: +	touch $@ + +kernel/modsign_pubkey.o: signing_key.x509 extra_certificates + +############################################################################### +# +# If module signing is requested, say by allyesconfig, but a key has not been +# supplied, then one will need to be generated to make sure the build does not +# fail and that the kernel may be used afterwards. +# +############################################################################### +sign_key_with_hash := +ifeq ($(CONFIG_MODULE_SIG_SHA1),y) +sign_key_with_hash := -sha1 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA224),y) +sign_key_with_hash := -sha224 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA256),y) +sign_key_with_hash := -sha256 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA384),y) +sign_key_with_hash := -sha384 +endif +ifeq ($(CONFIG_MODULE_SIG_SHA512),y) +sign_key_with_hash := -sha512 +endif +ifeq ($(sign_key_with_hash),) +$(error Could not determine digest type to use from kernel config) +endif + +signing_key.priv signing_key.x509: x509.genkey +	@echo "###" +	@echo "### Now generating an X.509 key pair to be used for signing modules." +	@echo "###" +	@echo "### If this takes a long time, you might wish to run rngd in the" +	@echo "### background to keep the supply of entropy topped up.  It" +	@echo "### needs to be run as root, and should use a hardware random" +	@echo "### number generator if one is available, eg:" +	@echo "###" +	@echo "###     rngd -r /dev/hwrandom" +	@echo "###" +	openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ +		-x509 -config x509.genkey \ +		-outform DER -out signing_key.x509 \ +		-keyout signing_key.priv +	@echo "###" +	@echo "### Key pair generated." +	@echo "###" + +x509.genkey: +	@echo Generating X.509 key generation config +	@echo  >x509.genkey "[ req ]" +	@echo >>x509.genkey "default_bits = 4096" +	@echo >>x509.genkey "distinguished_name = req_distinguished_name" +	@echo >>x509.genkey "prompt = no" +	@echo >>x509.genkey "string_mask = utf8only" +	@echo >>x509.genkey "x509_extensions = myexts" +	@echo >>x509.genkey +	@echo >>x509.genkey "[ req_distinguished_name ]" +	@echo >>x509.genkey "O = Magrathea" +	@echo >>x509.genkey "CN = Glacier signing key" +	@echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2" +	@echo >>x509.genkey +	@echo >>x509.genkey "[ myexts ]" +	@echo >>x509.genkey "basicConstraints=critical,CA:FALSE" +	@echo >>x509.genkey "keyUsage=digitalSignature" +	@echo >>x509.genkey "subjectKeyIdentifier=hash" +	@echo >>x509.genkey "authorityKeyIdentifier=keyid" +endif diff --git a/kernel/acct.c b/kernel/acct.c index 6cd7529c9e6..051e071a06e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,  	}  } -static int acct_on(char *name) +static int acct_on(struct filename *pathname)  {  	struct file *file;  	struct vfsmount *mnt; @@ -201,7 +201,7 @@ static int acct_on(char *name)  	struct bsd_acct_struct *acct = NULL;  	/* Difference from BSD - they don't do O_APPEND */ -	file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); +	file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);  	if (IS_ERR(file))  		return PTR_ERR(file); @@ -260,7 +260,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)  		return -EPERM;  	if (name) { -		char *tmp = getname(name); +		struct filename *tmp = getname(name);  		if (IS_ERR(tmp))  			return (PTR_ERR(tmp));  		error = acct_on(tmp); diff --git a/kernel/audit.c b/kernel/audit.c index 4d0ceede331..40414e9143d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1440,6 +1440,8 @@ void audit_log_link_denied(const char *operation, struct path *link)  	ab = audit_log_start(current->audit_context, GFP_KERNEL,  			     AUDIT_ANOM_LINK); +	if (!ab) +		return;  	audit_log_format(ab, "op=%s action=denied", operation);  	audit_log_format(ab, " pid=%d comm=", current->pid);  	audit_log_untrustedstring(ab, current->comm); diff --git a/kernel/audit.h b/kernel/audit.h index 9eb3d79482b..d51cba868e1 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -74,12 +74,15 @@ static inline int audit_hash_ino(u32 ino)  	return (ino & (AUDIT_INODE_BUCKETS-1));  } +/* Indicates that audit should log the full pathname. */ +#define AUDIT_NAME_FULL -1 +  extern int audit_match_class(int class, unsigned syscall);  extern int audit_comparator(const u32 left, const u32 op, const u32 right);  extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);  extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); -extern int audit_compare_dname_path(const char *dname, const char *path, -				    int *dirlen); +extern int parent_len(const char *path); +extern int audit_compare_dname_path(const char *dname, const char *path, int plen);  extern struct sk_buff *	    audit_make_reply(int pid, int seq, int type,  					     int done, int multi,  					     const void *payload, int size); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 1c22ec3d87b..9a9ae6e3d29 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -265,7 +265,8 @@ static void audit_update_watch(struct audit_parent *parent,  	/* Run all of the watches on this parent looking for the one that  	 * matches the given dname */  	list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { -		if (audit_compare_dname_path(dname, owatch->path, NULL)) +		if (audit_compare_dname_path(dname, owatch->path, +					     AUDIT_NAME_FULL))  			continue;  		/* If the update involves invalidating rules, do the inode-based diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index c4bcdbaf4d4..7f19f23d38a 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1298,41 +1298,60 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)  	}  } -/* Compare given dentry name with last component in given path, - * return of 0 indicates a match. */ -int audit_compare_dname_path(const char *dname, const char *path, -			     int *dirlen) +/** + * parent_len - find the length of the parent portion of a pathname + * @path: pathname of which to determine length + */ +int parent_len(const char *path)  { -	int dlen, plen; +	int plen;  	const char *p; -	if (!dname || !path) -		return 1; - -	dlen = strlen(dname);  	plen = strlen(path); -	if (plen < dlen) -		return 1; + +	if (plen == 0) +		return plen;  	/* disregard trailing slashes */  	p = path + plen - 1;  	while ((*p == '/') && (p > path))  		p--; -	/* find last path component */ -	p = p - dlen + 1; -	if (p < path) +	/* walk backward until we find the next slash or hit beginning */ +	while ((*p != '/') && (p > path)) +		p--; + +	/* did we find a slash? Then increment to include it in path */ +	if (*p == '/') +		p++; + +	return p - path; +} + +/** + * audit_compare_dname_path - compare given dentry name with last component in + * 			      given path. Return of 0 indicates a match. + * @dname:	dentry name that we're comparing + * @path:	full pathname that we're comparing + * @parentlen:	length of the parent if known. Passing in AUDIT_NAME_FULL + * 		here indicates that we must compute this value. + */ +int audit_compare_dname_path(const char *dname, const char *path, int parentlen) +{ +	int dlen, pathlen; +	const char *p; + +	dlen = strlen(dname); +	pathlen = strlen(path); +	if (pathlen < dlen)  		return 1; -	else if (p > path) { -		if (*--p != '/') -			return 1; -		else -			p++; -	} -	/* return length of path's directory component */ -	if (dirlen) -		*dirlen = p - path; +	parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen; +	if (pathlen - parentlen != dlen) +		return 1; + +	p = path + parentlen; +  	return strncmp(p, dname, dlen);  } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f4a7756f999..2f186ed80c4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -81,9 +81,6 @@   * a name dynamically and also add those to the list anchored by names_list. */  #define AUDIT_NAMES	5 -/* Indicates that audit should log the full pathname. */ -#define AUDIT_NAME_FULL -1 -  /* no execve audit message should be longer than this (userspace limits) */  #define MAX_EXECVE_AUDIT_LEN 7500 @@ -106,27 +103,29 @@ struct audit_cap_data {   * we don't let putname() free it (instead we free all of the saved   * pointers at syscall exit time).   * - * Further, in fs/namei.c:path_lookup() we store the inode and device. */ + * Further, in fs/namei.c:path_lookup() we store the inode and device. + */  struct audit_names { -	struct list_head list;		/* audit_context->names_list */ -	const char	*name; -	unsigned long	ino; -	dev_t		dev; -	umode_t		mode; -	kuid_t		uid; -	kgid_t		gid; -	dev_t		rdev; -	u32		osid; -	struct audit_cap_data fcap; -	unsigned int	fcap_ver; -	int		name_len;	/* number of name's characters to log */ -	bool		name_put;	/* call __putname() for this name */ +	struct list_head	list;		/* audit_context->names_list */ +	struct filename	*name; +	unsigned long		ino; +	dev_t			dev; +	umode_t			mode; +	kuid_t			uid; +	kgid_t			gid; +	dev_t			rdev; +	u32			osid; +	struct audit_cap_data	 fcap; +	unsigned int		fcap_ver; +	int			name_len;	/* number of name's characters to log */ +	unsigned char		type;		/* record type */ +	bool			name_put;	/* call __putname() for this name */  	/*  	 * This was an allocated audit_names and not from the array of  	 * names allocated in the task audit context.  Thus this name  	 * should be freed on syscall exit  	 */ -	bool		should_free; +	bool			should_free;  };  struct audit_aux_data { @@ -998,7 +997,7 @@ static inline void audit_free_names(struct audit_context *context)  		       context->ino_count);  		list_for_each_entry(n, &context->names_list, list) {  			printk(KERN_ERR "names[%d] = %p = %s\n", i, -			       n->name, n->name ?: "(null)"); +			       n->name, n->name->name ?: "(null)");  		}  		dump_stack();  		return; @@ -1555,7 +1554,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,  		case AUDIT_NAME_FULL:  			/* log the full path */  			audit_log_format(ab, " name="); -			audit_log_untrustedstring(ab, n->name); +			audit_log_untrustedstring(ab, n->name->name);  			break;  		case 0:  			/* name was specified as a relative path and the @@ -1565,7 +1564,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,  		default:  			/* log the name's directory component */  			audit_log_format(ab, " name="); -			audit_log_n_untrustedstring(ab, n->name, +			audit_log_n_untrustedstring(ab, n->name->name,  						    n->name_len);  		}  	} else @@ -1995,7 +1994,8 @@ retry:  #endif  } -static struct audit_names *audit_alloc_name(struct audit_context *context) +static struct audit_names *audit_alloc_name(struct audit_context *context, +						unsigned char type)  {  	struct audit_names *aname; @@ -2010,6 +2010,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)  	}  	aname->ino = (unsigned long)-1; +	aname->type = type;  	list_add_tail(&aname->list, &context->names_list);  	context->name_count++; @@ -2020,13 +2021,36 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)  }  /** + * audit_reusename - fill out filename with info from existing entry + * @uptr: userland ptr to pathname + * + * Search the audit_names list for the current audit context. If there is an + * existing entry with a matching "uptr" then return the filename + * associated with that audit_name. If not, return NULL. + */ +struct filename * +__audit_reusename(const __user char *uptr) +{ +	struct audit_context *context = current->audit_context; +	struct audit_names *n; + +	list_for_each_entry(n, &context->names_list, list) { +		if (!n->name) +			continue; +		if (n->name->uptr == uptr) +			return n->name; +	} +	return NULL; +} + +/**   * audit_getname - add a name to the list   * @name: name to add   *   * Add a name to the list of audit names for this context.   * Called from fs/namei.c:getname().   */ -void __audit_getname(const char *name) +void __audit_getname(struct filename *name)  {  	struct audit_context *context = current->audit_context;  	struct audit_names *n; @@ -2040,13 +2064,19 @@ void __audit_getname(const char *name)  		return;  	} -	n = audit_alloc_name(context); +#if AUDIT_DEBUG +	/* The filename _must_ have a populated ->name */ +	BUG_ON(!name->name); +#endif + +	n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);  	if (!n)  		return;  	n->name = name;  	n->name_len = AUDIT_NAME_FULL;  	n->name_put = true; +	name->aname = n;  	if (!context->pwd.dentry)  		get_fs_pwd(current->fs, &context->pwd); @@ -2059,7 +2089,7 @@ void __audit_getname(const char *name)   * then we delay the putname until syscall exit.   * Called from include/linux/fs.h:putname().   */ -void audit_putname(const char *name) +void audit_putname(struct filename *name)  {  	struct audit_context *context = current->audit_context; @@ -2074,7 +2104,7 @@ void audit_putname(const char *name)  			list_for_each_entry(n, &context->names_list, list)  				printk(KERN_ERR "name[%d] = %p = %s\n", i, -				       n->name, n->name ?: "(null)"); +				       n->name, n->name->name ?: "(null)");  			}  #endif  		__putname(name); @@ -2088,8 +2118,8 @@ void audit_putname(const char *name)  			       " put_count=%d\n",  			       __FILE__, __LINE__,  			       context->serial, context->major, -			       context->in_syscall, name, context->name_count, -			       context->put_count); +			       context->in_syscall, name->name, +			       context->name_count, context->put_count);  			dump_stack();  		}  	} @@ -2132,13 +2162,13 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent  }  /** - * audit_inode - store the inode and device from a lookup + * __audit_inode - store the inode and device from a lookup   * @name: name being audited   * @dentry: dentry being audited - * - * Called from fs/namei.c:path_lookup(). + * @parent: does this dentry represent the parent?   */ -void __audit_inode(const char *name, const struct dentry *dentry) +void __audit_inode(struct filename *name, const struct dentry *dentry, +		   unsigned int parent)  {  	struct audit_context *context = current->audit_context;  	const struct inode *inode = dentry->d_inode; @@ -2147,24 +2177,69 @@ void __audit_inode(const char *name, const struct dentry *dentry)  	if (!context->in_syscall)  		return; +	if (!name) +		goto out_alloc; + +#if AUDIT_DEBUG +	/* The struct filename _must_ have a populated ->name */ +	BUG_ON(!name->name); +#endif +	/* +	 * If we have a pointer to an audit_names entry already, then we can +	 * just use it directly if the type is correct. +	 */ +	n = name->aname; +	if (n) { +		if (parent) { +			if (n->type == AUDIT_TYPE_PARENT || +			    n->type == AUDIT_TYPE_UNKNOWN) +				goto out; +		} else { +			if (n->type != AUDIT_TYPE_PARENT) +				goto out; +		} +	} +  	list_for_each_entry_reverse(n, &context->names_list, list) { -		if (n->name && (n->name == name)) -			goto out; +		/* does the name pointer match? */ +		if (!n->name || n->name->name != name->name) +			continue; + +		/* match the correct record type */ +		if (parent) { +			if (n->type == AUDIT_TYPE_PARENT || +			    n->type == AUDIT_TYPE_UNKNOWN) +				goto out; +		} else { +			if (n->type != AUDIT_TYPE_PARENT) +				goto out; +		}  	} -	/* unable to find the name from a previous getname() */ -	n = audit_alloc_name(context); +out_alloc: +	/* unable to find the name from a previous getname(). Allocate a new +	 * anonymous entry. +	 */ +	n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);  	if (!n)  		return;  out: +	if (parent) { +		n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; +		n->type = AUDIT_TYPE_PARENT; +	} else { +		n->name_len = AUDIT_NAME_FULL; +		n->type = AUDIT_TYPE_NORMAL; +	}  	handle_path(dentry);  	audit_copy_inode(n, dentry, inode);  }  /** - * audit_inode_child - collect inode info for created/removed objects - * @dentry: dentry being audited + * __audit_inode_child - collect inode info for created/removed objects   * @parent: inode of dentry parent + * @dentry: dentry being audited + * @type:   AUDIT_TYPE_* value that we're looking for   *   * For syscalls that create or remove filesystem objects, audit_inode   * can only collect information for the filesystem object's parent. @@ -2174,15 +2249,14 @@ out:   * must be hooked prior, in order to capture the target inode during   * unsuccessful attempts.   */ -void __audit_inode_child(const struct dentry *dentry, -			 const struct inode *parent) +void __audit_inode_child(const struct inode *parent, +			 const struct dentry *dentry, +			 const unsigned char type)  {  	struct audit_context *context = current->audit_context; -	const char *found_parent = NULL, *found_child = NULL;  	const struct inode *inode = dentry->d_inode;  	const char *dname = dentry->d_name.name; -	struct audit_names *n; -	int dirlen = 0; +	struct audit_names *n, *found_parent = NULL, *found_child = NULL;  	if (!context->in_syscall)  		return; @@ -2190,62 +2264,65 @@ void __audit_inode_child(const struct dentry *dentry,  	if (inode)  		handle_one(inode); -	/* parent is more likely, look for it first */ +	/* look for a parent entry first */  	list_for_each_entry(n, &context->names_list, list) { -		if (!n->name) +		if (!n->name || n->type != AUDIT_TYPE_PARENT)  			continue;  		if (n->ino == parent->i_ino && -		    !audit_compare_dname_path(dname, n->name, &dirlen)) { -			n->name_len = dirlen; /* update parent data in place */ -			found_parent = n->name; -			goto add_names; +		    !audit_compare_dname_path(dname, n->name->name, n->name_len)) { +			found_parent = n; +			break;  		}  	} -	/* no matching parent, look for matching child */ +	/* is there a matching child entry? */  	list_for_each_entry(n, &context->names_list, list) { -		if (!n->name) +		/* can only match entries that have a name */ +		if (!n->name || n->type != type)  			continue; -		/* strcmp() is the more likely scenario */ -		if (!strcmp(dname, n->name) || -		     !audit_compare_dname_path(dname, n->name, &dirlen)) { -			if (inode) -				audit_copy_inode(n, NULL, inode); -			else -				n->ino = (unsigned long)-1; -			found_child = n->name; -			goto add_names; +		/* if we found a parent, make sure this one is a child of it */ +		if (found_parent && (n->name != found_parent->name)) +			continue; + +		if (!strcmp(dname, n->name->name) || +		    !audit_compare_dname_path(dname, n->name->name, +						found_parent ? +						found_parent->name_len : +						AUDIT_NAME_FULL)) { +			found_child = n; +			break;  		}  	} -add_names:  	if (!found_parent) { -		n = audit_alloc_name(context); +		/* create a new, "anonymous" parent record */ +		n = audit_alloc_name(context, AUDIT_TYPE_PARENT);  		if (!n)  			return;  		audit_copy_inode(n, NULL, parent);  	}  	if (!found_child) { -		n = audit_alloc_name(context); -		if (!n) +		found_child = audit_alloc_name(context, type); +		if (!found_child)  			return;  		/* Re-use the name belonging to the slot for a matching parent  		 * directory. All names for this context are relinquished in  		 * audit_free_names() */  		if (found_parent) { -			n->name = found_parent; -			n->name_len = AUDIT_NAME_FULL; +			found_child->name = found_parent->name; +			found_child->name_len = AUDIT_NAME_FULL;  			/* don't call __putname() */ -			n->name_put = false; +			found_child->name_put = false;  		} - -		if (inode) -			audit_copy_inode(n, NULL, inode);  	} +	if (inode) +		audit_copy_inode(found_child, dentry, inode); +	else +		found_child->ino = (unsigned long)-1;  }  EXPORT_SYMBOL_GPL(__audit_inode_child); diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 17e073c309e..9a61738cefc 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -696,6 +696,22 @@ out:  	return ret;  } +/* + * GDB places a breakpoint at this function to know dynamically + * loaded objects. It's not defined static so that only one instance with this + * name exists in the kernel. + */ + +static int module_event(struct notifier_block *self, unsigned long val, +	void *data) +{ +	return 0; +} + +static struct notifier_block dbg_module_load_nb = { +	.notifier_call	= module_event, +}; +  int kgdb_nmicallback(int cpu, void *regs)  {  #ifdef CONFIG_SMP @@ -824,6 +840,7 @@ static void kgdb_register_callbacks(void)  		kgdb_arch_init();  		if (!dbg_is_early)  			kgdb_arch_late(); +		register_module_notifier(&dbg_module_load_nb);  		register_reboot_notifier(&dbg_reboot_notifier);  		atomic_notifier_chain_register(&panic_notifier_list,  					       &kgdb_panic_event_nb); @@ -847,6 +864,7 @@ static void kgdb_unregister_callbacks(void)  	if (kgdb_io_module_registered) {  		kgdb_io_module_registered = 0;  		unregister_reboot_notifier(&dbg_reboot_notifier); +		unregister_module_notifier(&dbg_module_load_nb);  		atomic_notifier_chain_unregister(&panic_notifier_list,  					       &kgdb_panic_event_nb);  		kgdb_arch_exit(); diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 07c9bbb94a0..b03e0e814e4 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -129,6 +129,8 @@ kdb_bt(int argc, const char **argv)  		}  		/* Now the inactive tasks */  		kdb_do_each_thread(g, p) { +			if (KDB_FLAG(CMD_INTERRUPT)) +				return 0;  			if (task_curr(p))  				continue;  			if (kdb_bt1(p, mask, argcount, btaprompt)) diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 0a69d2adc4f..14ff4849262 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -552,6 +552,7 @@ int vkdb_printf(const char *fmt, va_list ap)  {  	int diag;  	int linecount; +	int colcount;  	int logging, saved_loglevel = 0;  	int saved_trap_printk;  	int got_printf_lock = 0; @@ -584,6 +585,10 @@ int vkdb_printf(const char *fmt, va_list ap)  	if (diag || linecount <= 1)  		linecount = 24; +	diag = kdbgetintenv("COLUMNS", &colcount); +	if (diag || colcount <= 1) +		colcount = 80; +  	diag = kdbgetintenv("LOGGING", &logging);  	if (diag)  		logging = 0; @@ -690,7 +695,7 @@ kdb_printit:  		gdbstub_msg_write(kdb_buffer, retlen);  	} else {  		if (dbg_io_ops && !dbg_io_ops->is_console) { -			len = strlen(kdb_buffer); +			len = retlen;  			cp = kdb_buffer;  			while (len--) {  				dbg_io_ops->write_char(*cp); @@ -709,11 +714,29 @@ kdb_printit:  		printk(KERN_INFO "%s", kdb_buffer);  	} -	if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n')) -		kdb_nextline++; +	if (KDB_STATE(PAGER)) { +		/* +		 * Check printed string to decide how to bump the +		 * kdb_nextline to control when the more prompt should +		 * show up. +		 */ +		int got = 0; +		len = retlen; +		while (len--) { +			if (kdb_buffer[len] == '\n') { +				kdb_nextline++; +				got = 0; +			} else if (kdb_buffer[len] == '\r') { +				got = 0; +			} else { +				got++; +			} +		} +		kdb_nextline += got / (colcount + 1); +	}  	/* check for having reached the LINES number of printed lines */ -	if (kdb_nextline == linecount) { +	if (kdb_nextline >= linecount) {  		char buf1[16] = "";  		/* Watch out for recursion here.  Any routine that calls @@ -765,7 +788,7 @@ kdb_printit:  			kdb_grepping_flag = 0;  			kdb_printf("\n");  		} else if (buf1[0] == ' ') { -			kdb_printf("\n"); +			kdb_printf("\r");  			suspend_grep = 1; /* for this recursion */  		} else if (buf1[0] == '\n') {  			kdb_nextline = linecount - 1; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 1261dc7eaeb..4d5f8d5612f 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2101,6 +2101,8 @@ static int kdb_dmesg(int argc, const char **argv)  		}  		if (!lines--)  			break; +		if (KDB_FLAG(CMD_INTERRUPT)) +			return 0;  		kdb_printf("%.*s\n", (int)len - 1, buf);  	} diff --git a/kernel/events/core.c b/kernel/events/core.c index cda3ebd49e8..dbccf83c134 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -372,6 +372,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)  	list_for_each_entry_rcu(pmu, &pmus, entry) {  		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); +		if (cpuctx->unique_pmu != pmu) +			continue; /* ensure we process each cpuctx once */  		/*  		 * perf_cgroup_events says at least one @@ -395,9 +397,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode)  			if (mode & PERF_CGROUP_SWIN) {  				WARN_ON_ONCE(cpuctx->cgrp); -				/* set cgrp before ctxsw in to -				 * allow event_filter_match() to not -				 * have to pass task around +				/* +				 * set cgrp before ctxsw in to allow +				 * event_filter_match() to not have to pass +				 * task around  				 */  				cpuctx->cgrp = perf_cgroup_from_task(task);  				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); @@ -4412,7 +4415,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)  	rcu_read_lock();  	list_for_each_entry_rcu(pmu, &pmus, entry) {  		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); -		if (cpuctx->active_pmu != pmu) +		if (cpuctx->unique_pmu != pmu)  			goto next;  		perf_event_task_ctx(&cpuctx->ctx, task_event); @@ -4558,7 +4561,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)  	rcu_read_lock();  	list_for_each_entry_rcu(pmu, &pmus, entry) {  		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); -		if (cpuctx->active_pmu != pmu) +		if (cpuctx->unique_pmu != pmu)  			goto next;  		perf_event_comm_ctx(&cpuctx->ctx, comm_event); @@ -4754,7 +4757,7 @@ got_name:  	rcu_read_lock();  	list_for_each_entry_rcu(pmu, &pmus, entry) {  		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); -		if (cpuctx->active_pmu != pmu) +		if (cpuctx->unique_pmu != pmu)  			goto next;  		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,  					vma->vm_flags & VM_EXEC); @@ -5855,8 +5858,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)  		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); -		if (cpuctx->active_pmu == old_pmu) -			cpuctx->active_pmu = pmu; +		if (cpuctx->unique_pmu == old_pmu) +			cpuctx->unique_pmu = pmu;  	}  } @@ -5991,7 +5994,7 @@ skip_type:  		cpuctx->ctx.pmu = pmu;  		cpuctx->jiffies_interval = 1;  		INIT_LIST_HEAD(&cpuctx->rotation_list); -		cpuctx->active_pmu = pmu; +		cpuctx->unique_pmu = pmu;  	}  got_cpu_context: diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 49a77727db4..4e69e24d3d7 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -148,7 +148,8 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,   * @host_data: Controller private data pointer   *   * Allocates a legacy irq_domain if irq_base is positive or a linear - * domain otherwise. + * domain otherwise. For the legacy domain, IRQ descriptors will also + * be allocated.   *   * This is intended to implement the expected behaviour for most   * interrupt controllers which is that a linear mapping should @@ -162,11 +163,33 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,  					 const struct irq_domain_ops *ops,  					 void *host_data)  { -	if (first_irq > 0) -		return irq_domain_add_legacy(of_node, size, first_irq, 0, +	if (first_irq > 0) { +		int irq_base; + +		if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { +			/* +			 * Set the descriptor allocator to search for a +			 * 1-to-1 mapping, such as irq_alloc_desc_at(). +			 * Use of_node_to_nid() which is defined to +			 * numa_node_id() on platforms that have no custom +			 * implementation. +			 */ +			irq_base = irq_alloc_descs(first_irq, first_irq, size, +						   of_node_to_nid(of_node)); +			if (irq_base < 0) { +				WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", +				     first_irq); +				irq_base = first_irq; +			} +		} else +			irq_base = first_irq; + +		return irq_domain_add_legacy(of_node, size, irq_base, 0,  					     ops, host_data); -	else -		return irq_domain_add_linear(of_node, size, ops, host_data); +	} + +	/* A linear domain is the default */ +	return irq_domain_add_linear(of_node, size, ops, host_data);  }  /** diff --git a/kernel/kmod.c b/kernel/kmod.c index 6f99aead66c..1c317e38683 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -37,6 +37,7 @@  #include <linux/notifier.h>  #include <linux/suspend.h>  #include <linux/rwsem.h> +#include <linux/ptrace.h>  #include <asm/uaccess.h>  #include <trace/events/module.h> @@ -221,11 +222,13 @@ static int ____call_usermodehelper(void *data)  	retval = kernel_execve(sub_info->path,  			       (const char *const *)sub_info->argv,  			       (const char *const *)sub_info->envp); +	if (!retval) +		return 0;  	/* Exec failed? */  fail:  	sub_info->retval = retval; -	return 0; +	do_exit(0);  }  static int call_helper(void *data) @@ -292,7 +295,7 @@ static int wait_for_helper(void *data)  	}  	umh_complete(sub_info); -	return 0; +	do_exit(0);  }  /* This is run by khelper thread  */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 146a6fa9682..29fb60caecb 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -16,6 +16,7 @@  #include <linux/mutex.h>  #include <linux/slab.h>  #include <linux/freezer.h> +#include <linux/ptrace.h>  #include <trace/events/sched.h>  static DEFINE_SPINLOCK(kthread_create_lock); diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c new file mode 100644 index 00000000000..4646eb2c382 --- /dev/null +++ b/kernel/modsign_pubkey.c @@ -0,0 +1,113 @@ +/* Public keys for module signature verification + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/cred.h> +#include <linux/err.h> +#include <keys/asymmetric-type.h> +#include "module-internal.h" + +struct key *modsign_keyring; + +extern __initdata const u8 modsign_certificate_list[]; +extern __initdata const u8 modsign_certificate_list_end[]; +asm(".section .init.data,\"aw\"\n" +    "modsign_certificate_list:\n" +    ".incbin \"signing_key.x509\"\n" +    ".incbin \"extra_certificates\"\n" +    "modsign_certificate_list_end:" +    ); + +/* + * We need to make sure ccache doesn't cache the .o file as it doesn't notice + * if modsign.pub changes. + */ +static __initdata const char annoy_ccache[] = __TIME__ "foo"; + +/* + * Load the compiled-in keys + */ +static __init int module_verify_init(void) +{ +	pr_notice("Initialise module verification\n"); + +	modsign_keyring = key_alloc(&key_type_keyring, ".module_sign", +				    KUIDT_INIT(0), KGIDT_INIT(0), +				    current_cred(), +				    (KEY_POS_ALL & ~KEY_POS_SETATTR) | +				    KEY_USR_VIEW | KEY_USR_READ, +				    KEY_ALLOC_NOT_IN_QUOTA); +	if (IS_ERR(modsign_keyring)) +		panic("Can't allocate module signing keyring\n"); + +	if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0) +		panic("Can't instantiate module signing keyring\n"); + +	return 0; +} + +/* + * Must be initialised before we try and load the keys into the keyring. + */ +device_initcall(module_verify_init); + +/* + * Load the compiled-in keys + */ +static __init int load_module_signing_keys(void) +{ +	key_ref_t key; +	const u8 *p, *end; +	size_t plen; + +	pr_notice("Loading module verification certificates\n"); + +	end = modsign_certificate_list_end; +	p = modsign_certificate_list; +	while (p < end) { +		/* Each cert begins with an ASN.1 SEQUENCE tag and must be more +		 * than 256 bytes in size. +		 */ +		if (end - p < 4) +			goto dodgy_cert; +		if (p[0] != 0x30 && +		    p[1] != 0x82) +			goto dodgy_cert; +		plen = (p[2] << 8) | p[3]; +		plen += 4; +		if (plen > end - p) +			goto dodgy_cert; + +		key = key_create_or_update(make_key_ref(modsign_keyring, 1), +					   "asymmetric", +					   NULL, +					   p, +					   plen, +					   (KEY_POS_ALL & ~KEY_POS_SETATTR) | +					   KEY_USR_VIEW, +					   KEY_ALLOC_NOT_IN_QUOTA); +		if (IS_ERR(key)) +			pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n", +			       PTR_ERR(key)); +		else +			pr_notice("MODSIGN: Loaded cert '%s'\n", +				  key_ref_to_ptr(key)->description); +		p += plen; +	} + +	return 0; + +dodgy_cert: +	pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n"); +	return 0; +} +late_initcall(load_module_signing_keys); diff --git a/kernel/module-internal.h b/kernel/module-internal.h new file mode 100644 index 00000000000..6114a13419b --- /dev/null +++ b/kernel/module-internal.h @@ -0,0 +1,15 @@ +/* Module internals + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +extern struct key *modsign_keyring; + +extern int mod_verify_sig(const void *mod, unsigned long modlen, +			  const void *sig, unsigned long siglen); diff --git a/kernel/module.c b/kernel/module.c index 4edbd9c11ac..0e2da8695f8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -58,6 +58,8 @@  #include <linux/jump_label.h>  #include <linux/pfn.h>  #include <linux/bsearch.h> +#include <linux/fips.h> +#include "module-internal.h"  #define CREATE_TRACE_POINTS  #include <trace/events/module.h> @@ -102,6 +104,43 @@ static LIST_HEAD(modules);  struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */  #endif /* CONFIG_KGDB_KDB */ +#ifdef CONFIG_MODULE_SIG +#ifdef CONFIG_MODULE_SIG_FORCE +static bool sig_enforce = true; +#else +static bool sig_enforce = false; + +static int param_set_bool_enable_only(const char *val, +				      const struct kernel_param *kp) +{ +	int err; +	bool test; +	struct kernel_param dummy_kp = *kp; + +	dummy_kp.arg = &test; + +	err = param_set_bool(val, &dummy_kp); +	if (err) +		return err; + +	/* Don't let them unset it once it's set! */ +	if (!test && sig_enforce) +		return -EROFS; + +	if (test) +		sig_enforce = true; +	return 0; +} + +static const struct kernel_param_ops param_ops_bool_enable_only = { +	.set = param_set_bool_enable_only, +	.get = param_get_bool, +}; +#define param_check_bool_enable_only param_check_bool + +module_param(sig_enforce, bool_enable_only, 0644); +#endif /* !CONFIG_MODULE_SIG_FORCE */ +#endif /* CONFIG_MODULE_SIG */  /* Block module loading/unloading? */  int modules_disabled = 0; @@ -136,6 +175,7 @@ struct load_info {  	unsigned long symoffs, stroffs;  	struct _ddebug *debug;  	unsigned int num_debug; +	bool sig_ok;  	struct {  		unsigned int sym, str, mod, vers, info, pcpu;  	} index; @@ -1949,26 +1989,6 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)  	return ret;  } -int __weak apply_relocate(Elf_Shdr *sechdrs, -			  const char *strtab, -			  unsigned int symindex, -			  unsigned int relsec, -			  struct module *me) -{ -	pr_err("module %s: REL relocation unsupported\n", me->name); -	return -ENOEXEC; -} - -int __weak apply_relocate_add(Elf_Shdr *sechdrs, -			      const char *strtab, -			      unsigned int symindex, -			      unsigned int relsec, -			      struct module *me) -{ -	pr_err("module %s: RELA relocation unsupported\n", me->name); -	return -ENOEXEC; -} -  static int apply_relocations(struct module *mod, const struct load_info *info)  {  	unsigned int i; @@ -2399,7 +2419,52 @@ static inline void kmemleak_load_module(const struct module *mod,  }  #endif -/* Sets info->hdr and info->len. */ +#ifdef CONFIG_MODULE_SIG +static int module_sig_check(struct load_info *info, +			    const void *mod, unsigned long *len) +{ +	int err = -ENOKEY; +	const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; +	const void *p = mod, *end = mod + *len; + +	/* Poor man's memmem. */ +	while ((p = memchr(p, MODULE_SIG_STRING[0], end - p))) { +		if (p + markerlen > end) +			break; + +		if (memcmp(p, MODULE_SIG_STRING, markerlen) == 0) { +			const void *sig = p + markerlen; +			/* Truncate module up to signature. */ +			*len = p - mod; +			err = mod_verify_sig(mod, *len, sig, end - sig); +			break; +		} +		p++; +	} + +	if (!err) { +		info->sig_ok = true; +		return 0; +	} + +	/* Not having a signature is only an error if we're strict. */ +	if (err < 0 && fips_enabled) +		panic("Module verification failed with error %d in FIPS mode\n", +		      err); +	if (err == -ENOKEY && !sig_enforce) +		err = 0; + +	return err; +} +#else /* !CONFIG_MODULE_SIG */ +static int module_sig_check(struct load_info *info, +			    void *mod, unsigned long *len) +{ +	return 0; +} +#endif /* !CONFIG_MODULE_SIG */ + +/* Sets info->hdr, info->len and info->sig_ok. */  static int copy_and_check(struct load_info *info,  			  const void __user *umod, unsigned long len,  			  const char __user *uargs) @@ -2419,6 +2484,10 @@ static int copy_and_check(struct load_info *info,  		goto free_hdr;  	} +	err = module_sig_check(info, hdr, &len); +	if (err) +		goto free_hdr; +  	/* Sanity checks against insmoding binaries or wrong arch,  	   weird elf version */  	if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 @@ -2730,6 +2799,10 @@ static int check_module_license_and_versions(struct module *mod)  	if (strcmp(mod->name, "driverloader") == 0)  		add_taint_module(mod, TAINT_PROPRIETARY_MODULE); +	/* lve claims to be GPL but upstream won't provide source */ +	if (strcmp(mod->name, "lve") == 0) +		add_taint_module(mod, TAINT_PROPRIETARY_MODULE); +  #ifdef CONFIG_MODVERSIONS  	if ((mod->num_syms && !mod->crcs)  	    || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2861,6 +2934,20 @@ static int post_relocation(struct module *mod, const struct load_info *info)  	return module_finalize(info->hdr, info->sechdrs, mod);  } +/* Is this module of this name done loading?  No locks held. */ +static bool finished_loading(const char *name) +{ +	struct module *mod; +	bool ret; + +	mutex_lock(&module_mutex); +	mod = find_module(name); +	ret = !mod || mod->state != MODULE_STATE_COMING; +	mutex_unlock(&module_mutex); + +	return ret; +} +  /* Allocate and load the module: note that size of section 0 is always     zero, and we rely on this for optional sections. */  static struct module *load_module(void __user *umod, @@ -2868,7 +2955,7 @@ static struct module *load_module(void __user *umod,  				  const char __user *uargs)  {  	struct load_info info = { NULL, }; -	struct module *mod; +	struct module *mod, *old;  	long err;  	pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2886,6 +2973,12 @@ static struct module *load_module(void __user *umod,  		goto free_copy;  	} +#ifdef CONFIG_MODULE_SIG +	mod->sig_ok = info.sig_ok; +	if (!mod->sig_ok) +		add_taint_module(mod, TAINT_FORCED_MODULE); +#endif +  	/* Now module is in final location, initialize linked lists, etc. */  	err = module_unload_init(mod);  	if (err) @@ -2934,8 +3027,18 @@ static struct module *load_module(void __user *umod,  	 * function to insert in a way safe to concurrent readers.  	 * The mutex protects against concurrent writers.  	 */ +again:  	mutex_lock(&module_mutex); -	if (find_module(mod->name)) { +	if ((old = find_module(mod->name)) != NULL) { +		if (old->state == MODULE_STATE_COMING) { +			/* Wait in case it fails to load. */ +			mutex_unlock(&module_mutex); +			err = wait_event_interruptible(module_wq, +					       finished_loading(mod->name)); +			if (err) +				goto free_arch_cleanup; +			goto again; +		}  		err = -EEXIST;  		goto unlock;  	} @@ -2975,7 +3078,7 @@ static struct module *load_module(void __user *umod,  	/* Unlink carefully: kallsyms could be walking list. */  	list_del_rcu(&mod->list);  	module_bug_cleanup(mod); - +	wake_up_all(&module_wq);   ddebug:  	dynamic_debug_remove(info.debug);   unlock: @@ -3050,7 +3153,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,  		blocking_notifier_call_chain(&module_notify_list,  					     MODULE_STATE_GOING, mod);  		free_module(mod); -		wake_up(&module_wq); +		wake_up_all(&module_wq);  		return ret;  	}  	if (ret > 0) { @@ -3062,9 +3165,8 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,  		dump_stack();  	} -	/* Now it's a first class citizen!  Wake up anyone waiting for it. */ +	/* Now it's a first class citizen! */  	mod->state = MODULE_STATE_LIVE; -	wake_up(&module_wq);  	blocking_notifier_call_chain(&module_notify_list,  				     MODULE_STATE_LIVE, mod); @@ -3087,6 +3189,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,  	mod->init_ro_size = 0;  	mod->init_text_size = 0;  	mutex_unlock(&module_mutex); +	wake_up_all(&module_wq);  	return 0;  } diff --git a/kernel/module_signing.c b/kernel/module_signing.c new file mode 100644 index 00000000000..6b09f6983ac --- /dev/null +++ b/kernel/module_signing.c @@ -0,0 +1,243 @@ +/* Module signature checker + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/err.h> +#include <crypto/public_key.h> +#include <crypto/hash.h> +#include <keys/asymmetric-type.h> +#include "module-internal.h" + +/* + * Module signature information block. + * + * The constituents of the signature section are, in order: + * + *	- Signer's name + *	- Key identifier + *	- Signature data + *	- Information block + */ +struct module_signature { +	enum pkey_algo		algo : 8;	/* Public-key crypto algorithm */ +	enum pkey_hash_algo	hash : 8;	/* Digest algorithm */ +	enum pkey_id_type	id_type : 8;	/* Key identifier type */ +	u8			signer_len;	/* Length of signer's name */ +	u8			key_id_len;	/* Length of key identifier */ +	u8			__pad[3]; +	__be32			sig_len;	/* Length of signature data */ +}; + +/* + * Digest the module contents. + */ +static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, +						    const void *mod, +						    unsigned long modlen) +{ +	struct public_key_signature *pks; +	struct crypto_shash *tfm; +	struct shash_desc *desc; +	size_t digest_size, desc_size; +	int ret; + +	pr_devel("==>%s()\n", __func__); +	 +	/* Allocate the hashing algorithm we're going to need and find out how +	 * big the hash operational data will be. +	 */ +	tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); +	if (IS_ERR(tfm)) +		return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); + +	desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); +	digest_size = crypto_shash_digestsize(tfm); + +	/* We allocate the hash operational data storage on the end of our +	 * context data and the digest output buffer on the end of that. +	 */ +	ret = -ENOMEM; +	pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); +	if (!pks) +		goto error_no_pks; + +	pks->pkey_hash_algo	= hash; +	pks->digest		= (u8 *)pks + sizeof(*pks) + desc_size; +	pks->digest_size	= digest_size; + +	desc = (void *)pks + sizeof(*pks); +	desc->tfm   = tfm; +	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + +	ret = crypto_shash_init(desc); +	if (ret < 0) +		goto error; + +	ret = crypto_shash_finup(desc, mod, modlen, pks->digest); +	if (ret < 0) +		goto error; + +	crypto_free_shash(tfm); +	pr_devel("<==%s() = ok\n", __func__); +	return pks; + +error: +	kfree(pks); +error_no_pks: +	crypto_free_shash(tfm); +	pr_devel("<==%s() = %d\n", __func__, ret); +	return ERR_PTR(ret); +} + +/* + * Extract an MPI array from the signature data.  This represents the actual + * signature.  Each raw MPI is prefaced by a BE 2-byte value indicating the + * size of the MPI in bytes. + * + * RSA signatures only have one MPI, so currently we only read one. + */ +static int mod_extract_mpi_array(struct public_key_signature *pks, +				 const void *data, size_t len) +{ +	size_t nbytes; +	MPI mpi; + +	if (len < 3) +		return -EBADMSG; +	nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1]; +	data += 2; +	len -= 2; +	if (len != nbytes) +		return -EBADMSG; + +	mpi = mpi_read_raw_data(data, nbytes); +	if (!mpi) +		return -ENOMEM; +	pks->mpi[0] = mpi; +	pks->nr_mpi = 1; +	return 0; +} + +/* + * Request an asymmetric key. + */ +static struct key *request_asymmetric_key(const char *signer, size_t signer_len, +					  const u8 *key_id, size_t key_id_len) +{ +	key_ref_t key; +	size_t i; +	char *id, *q; + +	pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len); + +	/* Construct an identifier. */ +	id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL); +	if (!id) +		return ERR_PTR(-ENOKEY); + +	memcpy(id, signer, signer_len); + +	q = id + signer_len; +	*q++ = ':'; +	*q++ = ' '; +	for (i = 0; i < key_id_len; i++) { +		*q++ = hex_asc[*key_id >> 4]; +		*q++ = hex_asc[*key_id++ & 0x0f]; +	} + +	*q = 0; + +	pr_debug("Look up: \"%s\"\n", id); + +	key = keyring_search(make_key_ref(modsign_keyring, 1), +			     &key_type_asymmetric, id); +	if (IS_ERR(key)) +		pr_warn("Request for unknown module key '%s' err %ld\n", +			id, PTR_ERR(key)); +	kfree(id); + +	if (IS_ERR(key)) { +		switch (PTR_ERR(key)) { +			/* Hide some search errors */ +		case -EACCES: +		case -ENOTDIR: +		case -EAGAIN: +			return ERR_PTR(-ENOKEY); +		default: +			return ERR_CAST(key); +		} +	} + +	pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key))); +	return key_ref_to_ptr(key); +} + +/* + * Verify the signature on a module. + */ +int mod_verify_sig(const void *mod, unsigned long modlen, +		   const void *sig, unsigned long siglen) +{ +	struct public_key_signature *pks; +	struct module_signature ms; +	struct key *key; +	size_t sig_len; +	int ret; + +	pr_devel("==>%s(,%lu,,%lu,)\n", __func__, modlen, siglen); + +	if (siglen <= sizeof(ms)) +		return -EBADMSG; + +	memcpy(&ms, sig + (siglen - sizeof(ms)), sizeof(ms)); +	siglen -= sizeof(ms); + +	sig_len = be32_to_cpu(ms.sig_len); +	if (sig_len >= siglen || +	    siglen - sig_len != (size_t)ms.signer_len + ms.key_id_len) +		return -EBADMSG; + +	/* For the moment, only support RSA and X.509 identifiers */ +	if (ms.algo != PKEY_ALGO_RSA || +	    ms.id_type != PKEY_ID_X509) +		return -ENOPKG; + +	if (ms.hash >= PKEY_HASH__LAST || +	    !pkey_hash_algo[ms.hash]) +		return -ENOPKG; + +	key = request_asymmetric_key(sig, ms.signer_len, +				     sig + ms.signer_len, ms.key_id_len); +	if (IS_ERR(key)) +		return PTR_ERR(key); + +	pks = mod_make_digest(ms.hash, mod, modlen); +	if (IS_ERR(pks)) { +		ret = PTR_ERR(pks); +		goto error_put_key; +	} + +	ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len, +				    sig_len); +	if (ret < 0) +		goto error_free_pks; + +	ret = verify_signature(key, pks); +	pr_devel("verify_signature() = %d\n", ret); + +error_free_pks: +	mpi_free(pks->rsa.s); +	kfree(pks); +error_put_key: +	key_put(key); +	pr_devel("<==%s() = %d\n", __func__, ret); +	return ret;	 +} diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4fb2376ddf0..74df86bd920 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -74,6 +74,7 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];  	.orphan_nxttail = &sname##_state.orphan_nxtlist, \  	.orphan_donetail = &sname##_state.orphan_donelist, \  	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ +	.onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \  	.name = #sname, \  } @@ -1197,7 +1198,7 @@ static int rcu_gp_init(struct rcu_state *rsp)  	raw_spin_unlock_irq(&rnp->lock);  	/* Exclude any concurrent CPU-hotplug operations. */ -	get_online_cpus(); +	mutex_lock(&rsp->onoff_mutex);  	/*  	 * Set the quiescent-state-needed bits in all the rcu_node @@ -1234,7 +1235,7 @@ static int rcu_gp_init(struct rcu_state *rsp)  		cond_resched();  	} -	put_online_cpus(); +	mutex_unlock(&rsp->onoff_mutex);  	return 1;  } @@ -1700,6 +1701,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)  	/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */  	/* Exclude any attempts to start a new grace period. */ +	mutex_lock(&rsp->onoff_mutex);  	raw_spin_lock_irqsave(&rsp->onofflock, flags);  	/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ @@ -1744,6 +1746,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)  	init_callback_list(rdp);  	/* Disallow further callbacks on this CPU. */  	rdp->nxttail[RCU_NEXT_TAIL] = NULL; +	mutex_unlock(&rsp->onoff_mutex);  }  #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -2648,6 +2651,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)  	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);  	struct rcu_node *rnp = rcu_get_root(rsp); +	/* Exclude new grace periods. */ +	mutex_lock(&rsp->onoff_mutex); +  	/* Set up local state, ensuring consistent view of global state. */  	raw_spin_lock_irqsave(&rnp->lock, flags);  	rdp->beenonline = 1;	 /* We have now been online. */ @@ -2662,14 +2668,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)  	rcu_prepare_for_idle_init(cpu);  	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */ -	/* -	 * A new grace period might start here.  If so, we won't be part -	 * of it, but that is OK, as we are currently in a quiescent state. -	 */ - -	/* Exclude any attempts to start a new GP on large systems. */ -	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */ -  	/* Add CPU to rcu_node bitmasks. */  	rnp = rdp->mynode;  	mask = rdp->grpmask; @@ -2693,8 +2691,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)  		raw_spin_unlock(&rnp->lock); /* irqs already disabled. */  		rnp = rnp->parent;  	} while (rnp != NULL && !(rnp->qsmaskinit & mask)); +	local_irq_restore(flags); -	raw_spin_unlock_irqrestore(&rsp->onofflock, flags); +	mutex_unlock(&rsp->onoff_mutex);  }  static void __cpuinit rcu_prepare_cpu(int cpu) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 5faf05d6832..a240f032848 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -394,11 +394,17 @@ struct rcu_state {  	struct rcu_head **orphan_donetail;	/* Tail of above. */  	long qlen_lazy;				/* Number of lazy callbacks. */  	long qlen;				/* Total number of callbacks. */ +	/* End of fields guarded by onofflock. */ + +	struct mutex onoff_mutex;		/* Coordinate hotplug & GPs. */ +  	struct mutex barrier_mutex;		/* Guards barrier fields. */  	atomic_t barrier_cpu_count;		/* # CPUs waiting on. */  	struct completion barrier_completion;	/* Wake at barrier end. */  	unsigned long n_barrier_done;		/* ++ at start and end of */  						/*  _rcu_barrier(). */ +	/* End of fields guarded by barrier_mutex. */ +  	unsigned long jiffies_force_qs;		/* Time at which to invoke */  						/*  force_quiescent_state(). */  	unsigned long n_force_qs;		/* Number of calls to */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c1774723643..2d8927fda71 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -505,7 +505,7 @@ static inline void init_hrtick(void)  #ifdef CONFIG_SMP  #ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#define tsk_is_polling(t) 0  #endif  void resched_task(struct task_struct *p) @@ -6122,6 +6122,17 @@ static void sched_init_numa(void)  	 * numbers.  	 */ +	/* +	 * Here, we should temporarily reset sched_domains_numa_levels to 0. +	 * If it fails to allocate memory for array sched_domains_numa_masks[][], +	 * the array will contain less then 'level' members. This could be +	 * dangerous when we use it to iterate array sched_domains_numa_masks[][] +	 * in other functions. +	 * +	 * We reset it to 'level' at the end of this function. +	 */ +	sched_domains_numa_levels = 0; +  	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);  	if (!sched_domains_numa_masks)  		return; @@ -6176,11 +6187,68 @@ static void sched_init_numa(void)  	}  	sched_domain_topology = tl; + +	sched_domains_numa_levels = level; +} + +static void sched_domains_numa_masks_set(int cpu) +{ +	int i, j; +	int node = cpu_to_node(cpu); + +	for (i = 0; i < sched_domains_numa_levels; i++) { +		for (j = 0; j < nr_node_ids; j++) { +			if (node_distance(j, node) <= sched_domains_numa_distance[i]) +				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); +		} +	} +} + +static void sched_domains_numa_masks_clear(int cpu) +{ +	int i, j; +	for (i = 0; i < sched_domains_numa_levels; i++) { +		for (j = 0; j < nr_node_ids; j++) +			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); +	} +} + +/* + * Update sched_domains_numa_masks[level][node] array when new cpus + * are onlined. + */ +static int sched_domains_numa_masks_update(struct notifier_block *nfb, +					   unsigned long action, +					   void *hcpu) +{ +	int cpu = (long)hcpu; + +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_ONLINE: +		sched_domains_numa_masks_set(cpu); +		break; + +	case CPU_DEAD: +		sched_domains_numa_masks_clear(cpu); +		break; + +	default: +		return NOTIFY_DONE; +	} + +	return NOTIFY_OK;  }  #else  static inline void sched_init_numa(void)  {  } + +static int sched_domains_numa_masks_update(struct notifier_block *nfb, +					   unsigned long action, +					   void *hcpu) +{ +	return 0; +}  #endif /* CONFIG_NUMA */  static int __sdt_alloc(const struct cpumask *cpu_map) @@ -6629,6 +6697,7 @@ void __init sched_init_smp(void)  	mutex_unlock(&sched_domains_mutex);  	put_online_cpus(); +	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);  	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);  	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); diff --git a/kernel/time.c b/kernel/time.c index ba744cf8069..d226c6a3fd2 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -30,7 +30,7 @@  #include <linux/export.h>  #include <linux/timex.h>  #include <linux/capability.h> -#include <linux/clocksource.h> +#include <linux/timekeeper_internal.h>  #include <linux/errno.h>  #include <linux/syscalls.h>  #include <linux/security.h> diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index fd42bd452b7..8601f0db126 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -16,6 +16,10 @@ config ARCH_CLOCKSOURCE_DATA  config GENERIC_TIME_VSYSCALL  	bool +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL_OLD +	bool +  # ktime_t scalar 64bit nsec representation  config KTIME_SCALAR  	bool diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index aa27d391bfc..f11d83b1294 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -37,7 +37,6 @@  static struct alarm_base {  	spinlock_t		lock;  	struct timerqueue_head	timerqueue; -	struct hrtimer		timer;  	ktime_t			(*gettime)(void);  	clockid_t		base_clockid;  } alarm_bases[ALARM_NUMTYPE]; @@ -46,6 +45,8 @@ static struct alarm_base {  static ktime_t freezer_delta;  static DEFINE_SPINLOCK(freezer_delta_lock); +static struct wakeup_source *ws; +  #ifdef CONFIG_RTC_CLASS  /* rtc timer and device for setting alarm wakeups at suspend */  static struct rtc_timer		rtctimer; @@ -130,50 +131,35 @@ static inline void alarmtimer_rtc_timer_init(void) { }   * @base: pointer to the base where the timer is being run   * @alarm: pointer to alarm being enqueued.   * - * Adds alarm to a alarm_base timerqueue and if necessary sets - * an hrtimer to run. + * Adds alarm to a alarm_base timerqueue   *   * Must hold base->lock when calling.   */  static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)  { +	if (alarm->state & ALARMTIMER_STATE_ENQUEUED) +		timerqueue_del(&base->timerqueue, &alarm->node); +  	timerqueue_add(&base->timerqueue, &alarm->node);  	alarm->state |= ALARMTIMER_STATE_ENQUEUED; - -	if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { -		hrtimer_try_to_cancel(&base->timer); -		hrtimer_start(&base->timer, alarm->node.expires, -				HRTIMER_MODE_ABS); -	}  }  /** - * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue + * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue   * @base: pointer to the base where the timer is running   * @alarm: pointer to alarm being removed   * - * Removes alarm to a alarm_base timerqueue and if necessary sets - * a new timer to run. + * Removes alarm to a alarm_base timerqueue   *   * Must hold base->lock when calling.   */ -static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) +static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)  { -	struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); -  	if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))  		return;  	timerqueue_del(&base->timerqueue, &alarm->node);  	alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; - -	if (next == &alarm->node) { -		hrtimer_try_to_cancel(&base->timer); -		next = timerqueue_getnext(&base->timerqueue); -		if (!next) -			return; -		hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS); -	}  } @@ -188,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)   */  static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)  { -	struct alarm_base *base = container_of(timer, struct alarm_base, timer); -	struct timerqueue_node *next; +	struct alarm *alarm = container_of(timer, struct alarm, timer); +	struct alarm_base *base = &alarm_bases[alarm->type];  	unsigned long flags; -	ktime_t now;  	int ret = HRTIMER_NORESTART;  	int restart = ALARMTIMER_NORESTART;  	spin_lock_irqsave(&base->lock, flags); -	now = base->gettime(); -	while ((next = timerqueue_getnext(&base->timerqueue))) { -		struct alarm *alarm; -		ktime_t expired = next->expires; - -		if (expired.tv64 > now.tv64) -			break; - -		alarm = container_of(next, struct alarm, node); - -		timerqueue_del(&base->timerqueue, &alarm->node); -		alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; - -		alarm->state |= ALARMTIMER_STATE_CALLBACK; -		spin_unlock_irqrestore(&base->lock, flags); -		if (alarm->function) -			restart = alarm->function(alarm, now); -		spin_lock_irqsave(&base->lock, flags); -		alarm->state &= ~ALARMTIMER_STATE_CALLBACK; +	alarmtimer_dequeue(base, alarm); +	spin_unlock_irqrestore(&base->lock, flags); -		if (restart != ALARMTIMER_NORESTART) { -			timerqueue_add(&base->timerqueue, &alarm->node); -			alarm->state |= ALARMTIMER_STATE_ENQUEUED; -		} -	} +	if (alarm->function) +		restart = alarm->function(alarm, base->gettime()); -	if (next) { -		hrtimer_set_expires(&base->timer, next->expires); +	spin_lock_irqsave(&base->lock, flags); +	if (restart != ALARMTIMER_NORESTART) { +		hrtimer_set_expires(&alarm->timer, alarm->node.expires); +		alarmtimer_enqueue(base, alarm);  		ret = HRTIMER_RESTART;  	}  	spin_unlock_irqrestore(&base->lock, flags); @@ -250,6 +217,7 @@ static int alarmtimer_suspend(struct device *dev)  	unsigned long flags;  	struct rtc_device *rtc;  	int i; +	int ret;  	spin_lock_irqsave(&freezer_delta_lock, flags);  	min = freezer_delta; @@ -279,8 +247,10 @@ static int alarmtimer_suspend(struct device *dev)  	if (min.tv64 == 0)  		return 0; -	/* XXX - Should we enforce a minimum sleep time? */ -	WARN_ON(min.tv64 < NSEC_PER_SEC); +	if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { +		__pm_wakeup_event(ws, 2 * MSEC_PER_SEC); +		return -EBUSY; +	}  	/* Setup an rtc timer to fire that far in the future */  	rtc_timer_cancel(rtc, &rtctimer); @@ -288,9 +258,11 @@ static int alarmtimer_suspend(struct device *dev)  	now = rtc_tm_to_ktime(tm);  	now = ktime_add(now, min); -	rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); - -	return 0; +	/* Set alarm, if in the past reject suspend briefly to handle */ +	ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); +	if (ret < 0) +		__pm_wakeup_event(ws, MSEC_PER_SEC); +	return ret;  }  #else  static int alarmtimer_suspend(struct device *dev) @@ -324,6 +296,9 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,  		enum alarmtimer_restart (*function)(struct alarm *, ktime_t))  {  	timerqueue_init(&alarm->node); +	hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, +			HRTIMER_MODE_ABS); +	alarm->timer.function = alarmtimer_fired;  	alarm->function = function;  	alarm->type = type;  	alarm->state = ALARMTIMER_STATE_INACTIVE; @@ -334,17 +309,19 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,   * @alarm: ptr to alarm to set   * @start: time to run the alarm   */ -void alarm_start(struct alarm *alarm, ktime_t start) +int alarm_start(struct alarm *alarm, ktime_t start)  {  	struct alarm_base *base = &alarm_bases[alarm->type];  	unsigned long flags; +	int ret;  	spin_lock_irqsave(&base->lock, flags); -	if (alarmtimer_active(alarm)) -		alarmtimer_remove(base, alarm);  	alarm->node.expires = start;  	alarmtimer_enqueue(base, alarm); +	ret = hrtimer_start(&alarm->timer, alarm->node.expires, +				HRTIMER_MODE_ABS);  	spin_unlock_irqrestore(&base->lock, flags); +	return ret;  }  /** @@ -358,18 +335,12 @@ int alarm_try_to_cancel(struct alarm *alarm)  {  	struct alarm_base *base = &alarm_bases[alarm->type];  	unsigned long flags; -	int ret = -1; -	spin_lock_irqsave(&base->lock, flags); - -	if (alarmtimer_callback_running(alarm)) -		goto out; +	int ret; -	if (alarmtimer_is_queued(alarm)) { -		alarmtimer_remove(base, alarm); -		ret = 1; -	} else -		ret = 0; -out: +	spin_lock_irqsave(&base->lock, flags); +	ret = hrtimer_try_to_cancel(&alarm->timer); +	if (ret >= 0) +		alarmtimer_dequeue(base, alarm);  	spin_unlock_irqrestore(&base->lock, flags);  	return ret;  } @@ -802,10 +773,6 @@ static int __init alarmtimer_init(void)  	for (i = 0; i < ALARM_NUMTYPE; i++) {  		timerqueue_init_head(&alarm_bases[i].timerqueue);  		spin_lock_init(&alarm_bases[i].lock); -		hrtimer_init(&alarm_bases[i].timer, -				alarm_bases[i].base_clockid, -				HRTIMER_MODE_ABS); -		alarm_bases[i].timer.function = alarmtimer_fired;  	}  	error = alarmtimer_rtc_interface_setup(); @@ -821,6 +788,7 @@ static int __init alarmtimer_init(void)  		error = PTR_ERR(pdev);  		goto out_drv;  	} +	ws = wakeup_source_register("alarmtimer");  	return 0;  out_drv: diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 46da0537c10..6629bf7b528 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -37,7 +37,7 @@   * requested HZ value. It is also not recommended   * for "tick-less" systems.   */ -#define NSEC_PER_JIFFY	((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) +#define NSEC_PER_JIFFY	((NSEC_PER_SEC+HZ/2)/HZ)  /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier   * conversion, the .shift value could be zero. However @@ -95,3 +95,33 @@ struct clocksource * __init __weak clocksource_default_clock(void)  {  	return &clocksource_jiffies;  } + +struct clocksource refined_jiffies; + +int register_refined_jiffies(long cycles_per_second) +{ +	u64 nsec_per_tick, shift_hz; +	long cycles_per_tick; + + + +	refined_jiffies = clocksource_jiffies; +	refined_jiffies.name = "refined-jiffies"; +	refined_jiffies.rating++; + +	/* Calc cycles per tick */ +	cycles_per_tick = (cycles_per_second + HZ/2)/HZ; +	/* shift_hz stores hz<<8 for extra accuracy */ +	shift_hz = (u64)cycles_per_second << 8; +	shift_hz += cycles_per_tick/2; +	do_div(shift_hz, cycles_per_tick); +	/* Calculate nsec_per_tick using shift_hz */ +	nsec_per_tick = (u64)NSEC_PER_SEC << 8; +	nsec_per_tick += (u32)shift_hz/2; +	do_div(nsec_per_tick, (u32)shift_hz); + +	refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; + +	clocksource_register(&refined_jiffies); +	return 0; +} diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f423bdd035c..a4026088526 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -835,7 +835,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)  		 */  		if (ts->tick_stopped) {  			touch_softlockup_watchdog(); -			if (idle_cpu(cpu)) +			if (is_idle_task(current))  				ts->idle_jiffies++;  		}  		update_process_times(user_mode(regs)); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5ce06a3fa91..e424970bb56 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -8,6 +8,7 @@   *   */ +#include <linux/timekeeper_internal.h>  #include <linux/module.h>  #include <linux/interrupt.h>  #include <linux/percpu.h> @@ -21,61 +22,6 @@  #include <linux/tick.h>  #include <linux/stop_machine.h> -/* Structure holding internal timekeeping values. */ -struct timekeeper { -	/* Current clocksource used for timekeeping. */ -	struct clocksource	*clock; -	/* NTP adjusted clock multiplier */ -	u32			mult; -	/* The shift value of the current clocksource. */ -	u32			shift; -	/* Number of clock cycles in one NTP interval. */ -	cycle_t			cycle_interval; -	/* Number of clock shifted nano seconds in one NTP interval. */ -	u64			xtime_interval; -	/* shifted nano seconds left over when rounding cycle_interval */ -	s64			xtime_remainder; -	/* Raw nano seconds accumulated per NTP interval. */ -	u32			raw_interval; - -	/* Current CLOCK_REALTIME time in seconds */ -	u64			xtime_sec; -	/* Clock shifted nano seconds */ -	u64			xtime_nsec; - -	/* Difference between accumulated time and NTP time in ntp -	 * shifted nano seconds. */ -	s64			ntp_error; -	/* Shift conversion between clock shifted nano seconds and -	 * ntp shifted nano seconds. */ -	u32			ntp_error_shift; - -	/* -	 * wall_to_monotonic is what we need to add to xtime (or xtime corrected -	 * for sub jiffie times) to get to monotonic time.  Monotonic is pegged -	 * at zero at system boot time, so wall_to_monotonic will be negative, -	 * however, we will ALWAYS keep the tv_nsec part positive so we can use -	 * the usual normalization. -	 * -	 * wall_to_monotonic is moved after resume from suspend for the -	 * monotonic time not to jump. We need to add total_sleep_time to -	 * wall_to_monotonic to get the real boot based time offset. -	 * -	 * - wall_to_monotonic is no longer the boot time, getboottime must be -	 * used instead. -	 */ -	struct timespec		wall_to_monotonic; -	/* Offset clock monotonic -> clock realtime */ -	ktime_t			offs_real; -	/* time spent in suspend */ -	struct timespec		total_sleep_time; -	/* Offset clock monotonic -> clock boottime */ -	ktime_t			offs_boot; -	/* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ -	struct timespec		raw_time; -	/* Seqlock for all timekeeper values */ -	seqlock_t		lock; -};  static struct timekeeper timekeeper; @@ -96,15 +42,6 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)  	}  } -static struct timespec tk_xtime(struct timekeeper *tk) -{ -	struct timespec ts; - -	ts.tv_sec = tk->xtime_sec; -	ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); -	return ts; -} -  static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)  {  	tk->xtime_sec = ts->tv_sec; @@ -246,14 +183,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)  /* must hold write on timekeeper.lock */  static void timekeeping_update(struct timekeeper *tk, bool clearntp)  { -	struct timespec xt; -  	if (clearntp) {  		tk->ntp_error = 0;  		ntp_clear();  	} -	xt = tk_xtime(tk); -	update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); +	update_vsyscall(tk);  }  /** @@ -1113,7 +1047,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,  	accumulate_nsecs_to_secs(tk);  	/* Accumulate raw time */ -	raw_nsecs = tk->raw_interval << shift; +	raw_nsecs = (u64)tk->raw_interval << shift;  	raw_nsecs += tk->raw_time.tv_nsec;  	if (raw_nsecs >= NSEC_PER_SEC) {  		u64 raw_secs = raw_nsecs; @@ -1130,6 +1064,33 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,  	return offset;  } +#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD +static inline void old_vsyscall_fixup(struct timekeeper *tk) +{ +	s64 remainder; + +	/* +	* Store only full nanoseconds into xtime_nsec after rounding +	* it up and add the remainder to the error difference. +	* XXX - This is necessary to avoid small 1ns inconsistnecies caused +	* by truncating the remainder in vsyscalls. However, it causes +	* additional work to be done in timekeeping_adjust(). Once +	* the vsyscall implementations are converted to use xtime_nsec +	* (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD +	* users are removed, this can be killed. +	*/ +	remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); +	tk->xtime_nsec -= remainder; +	tk->xtime_nsec += 1ULL << tk->shift; +	tk->ntp_error += remainder << tk->ntp_error_shift; + +} +#else +#define old_vsyscall_fixup(tk) +#endif + + +  /**   * update_wall_time - Uses the current clocksource to increment the wall time   * @@ -1141,7 +1102,6 @@ static void update_wall_time(void)  	cycle_t offset;  	int shift = 0, maxshift;  	unsigned long flags; -	s64 remainder;  	write_seqlock_irqsave(&tk->lock, flags); @@ -1183,20 +1143,11 @@ static void update_wall_time(void)  	/* correct the clock when NTP error is too big */  	timekeeping_adjust(tk, offset); -  	/* -	* Store only full nanoseconds into xtime_nsec after rounding -	* it up and add the remainder to the error difference. -	* XXX - This is necessary to avoid small 1ns inconsistnecies caused -	* by truncating the remainder in vsyscalls. However, it causes -	* additional work to be done in timekeeping_adjust(). Once -	* the vsyscall implementations are converted to use xtime_nsec -	* (shifted nanoseconds), this can be killed. -	*/ -	remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); -	tk->xtime_nsec -= remainder; -	tk->xtime_nsec += 1ULL << tk->shift; -	tk->ntp_error += remainder << tk->ntp_error_shift; +	 * XXX This can be killed once everyone converts +	 * to the new update_vsyscall. +	 */ +	old_vsyscall_fixup(tk);  	/*  	 * Finally, make sure that after the rounding diff --git a/kernel/timer.c b/kernel/timer.c index d5de1b2292a..367d0085848 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64);  #define TVR_SIZE (1 << TVR_BITS)  #define TVN_MASK (TVN_SIZE - 1)  #define TVR_MASK (TVR_SIZE - 1) +#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))  struct tvec {  	struct list_head vec[TVN_SIZE]; @@ -359,11 +360,12 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)  		vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);  	} else {  		int i; -		/* If the timeout is larger than 0xffffffff on 64-bit -		 * architectures then we use the maximum timeout: +		/* If the timeout is larger than MAX_TVAL (on 64-bit +		 * architectures or with CONFIG_BASE_SMALL=1) then we +		 * use the maximum timeout.  		 */ -		if (idx > 0xffffffffUL) { -			idx = 0xffffffffUL; +		if (idx > MAX_TVAL) { +			idx = MAX_TVAL;  			expires = idx + base->timer_jiffies;  		}  		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;  |