Diffstat (limited to 'kernel')
65 files changed, 3662 insertions, 1949 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 5404911eaee..86e3285ae7e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
@@ -130,3 +131,77 @@ quiet_cmd_timeconst  = TIMEC   $@
 targets += timeconst.h
 $(obj)/timeconst.h: $(src)/timeconst.pl FORCE
 	$(call if_changed,timeconst)
+
+ifeq ($(CONFIG_MODULE_SIG),y)
+#
+# Pull the signing certificate and any extra certificates into the kernel
+#
+extra_certificates:
+	touch $@
+
+kernel/modsign_pubkey.o: signing_key.x509 extra_certificates
+
+###############################################################################
+#
+# If module signing is requested, say by allyesconfig, but a key has not been
+# supplied, then one will need to be generated to make sure the build does not
+# fail and that the kernel may be used afterwards.
+#
+###############################################################################
+sign_key_with_hash :=
+ifeq ($(CONFIG_MODULE_SIG_SHA1),y)
+sign_key_with_hash := -sha1
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA224),y)
+sign_key_with_hash := -sha224
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA256),y)
+sign_key_with_hash := -sha256
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA384),y)
+sign_key_with_hash := -sha384
+endif
+ifeq ($(CONFIG_MODULE_SIG_SHA512),y)
+sign_key_with_hash := -sha512
+endif
+ifeq ($(sign_key_with_hash),)
+$(error Could not determine digest type to use from kernel config)
+endif
+
+signing_key.priv signing_key.x509: x509.genkey
+	@echo "###"
+	@echo "### Now generating an X.509 key pair to be used for signing modules."
+	@echo "###"
+	@echo "### If this takes a long time, you might wish to run rngd in the"
+	@echo "### background to keep the supply of entropy topped up.  It"
+	@echo "### needs to be run as root, and uses a hardware random"
+	@echo "### number generator if one is available."
+	@echo "###"
+	openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \
+		-x509 -config x509.genkey \
+		-outform DER -out signing_key.x509 \
+		-keyout signing_key.priv
+	@echo "###"
+	@echo "### Key pair generated."
+	@echo "###"
+
+x509.genkey:
+	@echo Generating X.509 key generation config
+	@echo  >x509.genkey "[ req ]"
+	@echo >>x509.genkey "default_bits = 4096"
+	@echo >>x509.genkey "distinguished_name = req_distinguished_name"
+	@echo >>x509.genkey "prompt = no"
+	@echo >>x509.genkey "string_mask = utf8only"
+	@echo >>x509.genkey "x509_extensions = myexts"
+	@echo >>x509.genkey
+	@echo >>x509.genkey "[ req_distinguished_name ]"
+	@echo >>x509.genkey "O = Magrathea"
+	@echo >>x509.genkey "CN = Glacier signing key"
+	@echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
+	@echo >>x509.genkey
+	@echo >>x509.genkey "[ myexts ]"
+	@echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
+	@echo >>x509.genkey "keyUsage=digitalSignature"
+	@echo >>x509.genkey "subjectKeyIdentifier=hash"
+	@echo >>x509.genkey "authorityKeyIdentifier=keyid"
+endif
diff --git a/kernel/acct.c b/kernel/acct.c
index 02e6167a53b..051e071a06e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -193,7 +193,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
 	}
 }
 
-static int acct_on(char *name)
+static int acct_on(struct filename *pathname)
 {
 	struct file *file;
 	struct vfsmount *mnt;
@@ -201,7 +201,7 @@ static int acct_on(char *name)
 	struct bsd_acct_struct *acct = NULL;
 
 	/* Difference from BSD - they don't do O_APPEND */
-	file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+	file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
@@ -260,7 +260,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 		return -EPERM;
 
 	if (name) {
-		char *tmp = getname(name);
+		struct filename *tmp = getname(name);
 		if (IS_ERR(tmp))
 			return (PTR_ERR(tmp));
 		error = acct_on(tmp);
@@ -507,8 +507,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	do_div(elapsed, AHZ);
 	ac.ac_btime = get_seconds() - elapsed;
 	/* we really need to bite the bullet and change layout */
-	ac.ac_uid = orig_cred->uid;
-	ac.ac_gid = orig_cred->gid;
+	ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
+	ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
 #if ACCT_VERSION==2
 	ac.ac_ahz = AHZ;
 #endif
diff --git a/kernel/audit.c b/kernel/audit.c
index ea3b7b6191c..40414e9143d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -61,6 +61,7 @@
 #include <linux/netlink.h>
 #include <linux/freezer.h>
 #include <linux/tty.h>
+#include <linux/pid_namespace.h>
 
 #include "audit.h"
 
@@ -87,11 +88,11 @@ static int	audit_failure = AUDIT_FAIL_PRINTK;
 
 /*
  * If audit records are to be written to the netlink socket, audit_pid
- * contains the pid of the auditd process and audit_nlk_pid contains
- * the pid to use to send netlink messages to that process.
+ * contains the pid of the auditd process and audit_nlk_portid contains
+ * the portid to use to send netlink messages to that process.
  */
 int		audit_pid;
-static int	audit_nlk_pid;
+static int	audit_nlk_portid;
 
 /* If audit_rate_limit is non-zero, limit the rate of sending audit records
  * to that number per second.  This prevents DoS attacks, but results in
@@ -104,7 +105,7 @@ static int	audit_backlog_wait_time = 60 * HZ;
 static int	audit_backlog_wait_overflow = 0;
 
 /* The identity of the user shutting down the audit system.
*/ -uid_t		audit_sig_uid = -1; +kuid_t		audit_sig_uid = INVALID_UID;  pid_t		audit_sig_pid = -1;  u32		audit_sig_sid = 0; @@ -264,7 +265,7 @@ void audit_log_lost(const char *message)  }  static int audit_log_config_change(char *function_name, int new, int old, -				   uid_t loginuid, u32 sessionid, u32 sid, +				   kuid_t loginuid, u32 sessionid, u32 sid,  				   int allow_changes)  {  	struct audit_buffer *ab; @@ -272,7 +273,7 @@ static int audit_log_config_change(char *function_name, int new, int old,  	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);  	audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, -			 old, loginuid, sessionid); +			 old, from_kuid(&init_user_ns, loginuid), sessionid);  	if (sid) {  		char *ctx = NULL;  		u32 len; @@ -292,7 +293,7 @@ static int audit_log_config_change(char *function_name, int new, int old,  }  static int audit_do_config_change(char *function_name, int *to_change, -				  int new, uid_t loginuid, u32 sessionid, +				  int new, kuid_t loginuid, u32 sessionid,  				  u32 sid)  {  	int allow_changes, rc = 0, old = *to_change; @@ -319,21 +320,21 @@ static int audit_do_config_change(char *function_name, int *to_change,  	return rc;  } -static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid, +static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid,  				u32 sid)  {  	return audit_do_config_change("audit_rate_limit", &audit_rate_limit,  				      limit, loginuid, sessionid, sid);  } -static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid, +static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid,  				   u32 sid)  {  	return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,  				      limit, loginuid, sessionid, sid);  } -static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid)  {  	int rc;  	if (state < AUDIT_OFF || state > AUDIT_LOCKED) @@ -348,7 +349,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)  	return rc;  } -static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid) +static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid)  {  	if (state != AUDIT_FAIL_SILENT  	    && state != AUDIT_FAIL_PRINTK @@ -401,7 +402,7 @@ static void kauditd_send_skb(struct sk_buff *skb)  	int err;  	/* take a reference in case we can't send it and we want to hold it */  	skb_get(skb); -	err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); +	err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);  	if (err < 0) {  		BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */  		printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); @@ -467,24 +468,6 @@ static int kauditd_thread(void *dummy)  	return 0;  } -static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) -{ -	struct task_struct *tsk; -	int err; - -	rcu_read_lock(); -	tsk = find_task_by_vpid(pid); -	if (!tsk) { -		rcu_read_unlock(); -		return -ESRCH; -	} -	get_task_struct(tsk); -	rcu_read_unlock(); -	err = tty_audit_push_task(tsk, loginuid, sessionid); -	put_task_struct(tsk); -	return err; -} -  int audit_send_list(void *_dest)  {  	struct audit_netlink_list *dest = _dest; @@ -588,6 +571,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)  {  	int err = 0; +	/* Only support the initial namespaces for now. 
*/ +	if ((current_user_ns() != &init_user_ns) || +	    (task_active_pid_ns(current) != &init_pid_ns)) +		return -EPERM; +  	switch (msg_type) {  	case AUDIT_GET:  	case AUDIT_LIST: @@ -619,8 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)  }  static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, -				     u32 pid, u32 uid, uid_t auid, u32 ses, -				     u32 sid) +				     kuid_t auid, u32 ses, u32 sid)  {  	int rc = 0;  	char *ctx = NULL; @@ -633,7 +620,9 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,  	*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);  	audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", -			 pid, uid, auid, ses); +			 task_tgid_vnr(current), +			 from_kuid(&init_user_ns, current_uid()), +			 from_kuid(&init_user_ns, auid), ses);  	if (sid) {  		rc = security_secid_to_secctx(sid, &ctx, &len);  		if (rc) @@ -649,13 +638,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,  static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  { -	u32			uid, pid, seq, sid; +	u32			seq, sid;  	void			*data;  	struct audit_status	*status_get, status_set;  	int			err;  	struct audit_buffer	*ab;  	u16			msg_type = nlh->nlmsg_type; -	uid_t			loginuid; /* loginuid of sender */ +	kuid_t			loginuid; /* loginuid of sender */  	u32			sessionid;  	struct audit_sig_info   *sig_data;  	char			*ctx = NULL; @@ -675,8 +664,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  		return err;  	} -	pid  = NETLINK_CREDS(skb)->pid; -	uid  = NETLINK_CREDS(skb)->uid;  	loginuid = audit_get_loginuid(current);  	sessionid = audit_get_sessionid(current);  	security_task_getsecid(current, &sid); @@ -692,7 +679,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  		status_set.backlog_limit = audit_backlog_limit;  		status_set.lost		 = atomic_read(&audit_lost);  		status_set.backlog	 = skb_queue_len(&audit_skb_queue); -		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, +		audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,  				 &status_set, sizeof(status_set));  		break;  	case AUDIT_SET: @@ -720,7 +707,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  							sessionid, sid, 1);  			audit_pid = new_pid; -			audit_nlk_pid = NETLINK_CB(skb).pid; +			audit_nlk_portid = NETLINK_CB(skb).portid;  		}  		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {  			err = audit_set_rate_limit(status_get->rate_limit, @@ -738,16 +725,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  		if (!audit_enabled && msg_type != AUDIT_USER_AVC)  			return 0; -		err = audit_filter_user(&NETLINK_CB(skb)); +		err = audit_filter_user();  		if (err == 1) {  			err = 0;  			if (msg_type == AUDIT_USER_TTY) { -				err = audit_prepare_user_tty(pid, loginuid, +				err = tty_audit_push_task(current, loginuid,  							     sessionid);  				if (err)  					break;  			} -			audit_log_common_recv_msg(&ab, msg_type, pid, uid, +			audit_log_common_recv_msg(&ab, msg_type,  						  loginuid, sessionid, sid);  			if (msg_type != AUDIT_USER_TTY) @@ -763,7 +750,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  					size--;  				audit_log_n_untrustedstring(ab, data, size);  			} -			audit_set_pid(ab, pid); +			audit_set_pid(ab, NETLINK_CB(skb).portid);  			audit_log_end(ab);  		}  		break; @@ -772,8 +759,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  		if 
(nlmsg_len(nlh) < sizeof(struct audit_rule))  			return -EINVAL;  		if (audit_enabled == AUDIT_LOCKED) { -			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, -						  uid, loginuid, sessionid, sid); +			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, +						  loginuid, sessionid, sid);  			audit_log_format(ab, " audit_enabled=%d res=0",  					 audit_enabled); @@ -782,8 +769,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  		}  		/* fallthrough */  	case AUDIT_LIST: -		err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, -					   uid, seq, data, nlmsg_len(nlh), +		err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, +					   seq, data, nlmsg_len(nlh),  					   loginuid, sessionid, sid);  		break;  	case AUDIT_ADD_RULE: @@ -791,8 +778,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  		if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))  			return -EINVAL;  		if (audit_enabled == AUDIT_LOCKED) { -			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, -						  uid, loginuid, sessionid, sid); +			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, +						  loginuid, sessionid, sid);  			audit_log_format(ab, " audit_enabled=%d res=0",  					 audit_enabled); @@ -801,15 +788,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  		}  		/* fallthrough */  	case AUDIT_LIST_RULES: -		err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, -					   uid, seq, data, nlmsg_len(nlh), +		err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, +					   seq, data, nlmsg_len(nlh),  					   loginuid, sessionid, sid);  		break;  	case AUDIT_TRIM:  		audit_trim_trees(); -		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, -					  uid, loginuid, sessionid, sid); +		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, +					  loginuid, sessionid, sid);  		audit_log_format(ab, " op=trim res=1");  		audit_log_end(ab); @@ -840,8 +827,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  		/* OK, here comes... 
*/  		err = audit_tag_tree(old, new); -		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, -					  uid, loginuid, sessionid, sid); +		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, +					  loginuid, sessionid, sid);  		audit_log_format(ab, " op=make_equiv old=");  		audit_log_untrustedstring(ab, old); @@ -866,53 +853,41 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  				security_release_secctx(ctx, len);  			return -ENOMEM;  		} -		sig_data->uid = audit_sig_uid; +		sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);  		sig_data->pid = audit_sig_pid;  		if (audit_sig_sid) {  			memcpy(sig_data->ctx, ctx, len);  			security_release_secctx(ctx, len);  		} -		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, +		audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO,  				0, 0, sig_data, sizeof(*sig_data) + len);  		kfree(sig_data);  		break;  	case AUDIT_TTY_GET: {  		struct audit_tty_status s; -		struct task_struct *tsk; -		unsigned long flags; +		struct task_struct *tsk = current; -		rcu_read_lock(); -		tsk = find_task_by_vpid(pid); -		if (tsk && lock_task_sighand(tsk, &flags)) { -			s.enabled = tsk->signal->audit_tty != 0; -			unlock_task_sighand(tsk, &flags); -		} else -			err = -ESRCH; -		rcu_read_unlock(); +		spin_lock_irq(&tsk->sighand->siglock); +		s.enabled = tsk->signal->audit_tty != 0; +		spin_unlock_irq(&tsk->sighand->siglock); -		if (!err) -			audit_send_reply(NETLINK_CB(skb).pid, seq, -					 AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); +		audit_send_reply(NETLINK_CB(skb).portid, seq, +				 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));  		break;  	}  	case AUDIT_TTY_SET: {  		struct audit_tty_status *s; -		struct task_struct *tsk; -		unsigned long flags; +		struct task_struct *tsk = current;  		if (nlh->nlmsg_len < sizeof(struct audit_tty_status))  			return -EINVAL;  		s = data;  		if (s->enabled != 0 && s->enabled != 1)  			return -EINVAL; -		rcu_read_lock(); -		tsk = find_task_by_vpid(pid); -		if (tsk && lock_task_sighand(tsk, &flags)) { -			tsk->signal->audit_tty = s->enabled != 0; -			unlock_task_sighand(tsk, &flags); -		} else -			err = -ESRCH; -		rcu_read_unlock(); + +		spin_lock_irq(&tsk->sighand->siglock); +		tsk->signal->audit_tty = s->enabled != 0; +		spin_unlock_irq(&tsk->sighand->siglock);  		break;  	}  	default: @@ -971,8 +946,7 @@ static int __init audit_init(void)  	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",  	       audit_default ? "enabled" : "disabled"); -	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, -					   THIS_MODULE, &cfg); +	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg);  	if (!audit_sock)  		audit_panic("cannot initialize netlink socket");  	else @@ -1466,6 +1440,8 @@ void audit_log_link_denied(const char *operation, struct path *link)  	ab = audit_log_start(current->audit_context, GFP_KERNEL,  			     AUDIT_ANOM_LINK); +	if (!ab) +		return;  	audit_log_format(ab, "op=%s action=denied", operation);  	audit_log_format(ab, " pid=%d comm=", current->pid);  	audit_log_untrustedstring(ab, current->comm); diff --git a/kernel/audit.h b/kernel/audit.h index 81676680337..d51cba868e1 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -74,10 +74,15 @@ static inline int audit_hash_ino(u32 ino)  	return (ino & (AUDIT_INODE_BUCKETS-1));  } +/* Indicates that audit should log the full pathname. 
*/ +#define AUDIT_NAME_FULL -1 +  extern int audit_match_class(int class, unsigned syscall);  extern int audit_comparator(const u32 left, const u32 op, const u32 right); -extern int audit_compare_dname_path(const char *dname, const char *path, -				    int *dirlen); +extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); +extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); +extern int parent_len(const char *path); +extern int audit_compare_dname_path(const char *dname, const char *path, int plen);  extern struct sk_buff *	    audit_make_reply(int pid, int seq, int type,  					     int done, int multi,  					     const void *payload, int size); @@ -144,7 +149,7 @@ extern void audit_kill_trees(struct list_head *);  extern char *audit_unpack_string(void **, size_t *, size_t);  extern pid_t audit_sig_pid; -extern uid_t audit_sig_uid; +extern kuid_t audit_sig_uid;  extern u32 audit_sig_sid;  #ifdef CONFIG_AUDITSYSCALL diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 3823281401b..9a9ae6e3d29 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -241,7 +241,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc  		struct audit_buffer *ab;  		ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);  		audit_log_format(ab, "auid=%u ses=%u op=", -				 audit_get_loginuid(current), +				 from_kuid(&init_user_ns, audit_get_loginuid(current)),  				 audit_get_sessionid(current));  		audit_log_string(ab, op);  		audit_log_format(ab, " path="); @@ -265,7 +265,8 @@ static void audit_update_watch(struct audit_parent *parent,  	/* Run all of the watches on this parent looking for the one that  	 * matches the given dname */  	list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { -		if (audit_compare_dname_path(dname, owatch->path, NULL)) +		if (audit_compare_dname_path(dname, owatch->path, +					     AUDIT_NAME_FULL))  			continue;  		/* If the update involves invalidating rules, do the inode-based diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a6c3f1abd20..7f19f23d38a 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -342,6 +342,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)  		f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);  		f->val = rule->values[i]; +		f->uid = INVALID_UID; +		f->gid = INVALID_GID;  		err = -EINVAL;  		if (f->op == Audit_bad) @@ -350,16 +352,32 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)  		switch(f->type) {  		default:  			goto exit_free; -		case AUDIT_PID:  		case AUDIT_UID:  		case AUDIT_EUID:  		case AUDIT_SUID:  		case AUDIT_FSUID: +		case AUDIT_LOGINUID: +			/* bit ops not implemented for uid comparisons */ +			if (f->op == Audit_bitmask || f->op == Audit_bittest) +				goto exit_free; + +			f->uid = make_kuid(current_user_ns(), f->val); +			if (!uid_valid(f->uid)) +				goto exit_free; +			break;  		case AUDIT_GID:  		case AUDIT_EGID:  		case AUDIT_SGID:  		case AUDIT_FSGID: -		case AUDIT_LOGINUID: +			/* bit ops not implemented for gid comparisons */ +			if (f->op == Audit_bitmask || f->op == Audit_bittest) +				goto exit_free; + +			f->gid = make_kgid(current_user_ns(), f->val); +			if (!gid_valid(f->gid)) +				goto exit_free; +			break; +		case AUDIT_PID:  		case AUDIT_PERS:  		case AUDIT_MSGTYPE:  		case AUDIT_PPID: @@ -437,19 +455,39 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,  		f->type = data->fields[i];  		f->val = data->values[i]; +		
f->uid = INVALID_UID; +		f->gid = INVALID_GID;  		f->lsm_str = NULL;  		f->lsm_rule = NULL;  		switch(f->type) { -		case AUDIT_PID:  		case AUDIT_UID:  		case AUDIT_EUID:  		case AUDIT_SUID:  		case AUDIT_FSUID: +		case AUDIT_LOGINUID: +		case AUDIT_OBJ_UID: +			/* bit ops not implemented for uid comparisons */ +			if (f->op == Audit_bitmask || f->op == Audit_bittest) +				goto exit_free; + +			f->uid = make_kuid(current_user_ns(), f->val); +			if (!uid_valid(f->uid)) +				goto exit_free; +			break;  		case AUDIT_GID:  		case AUDIT_EGID:  		case AUDIT_SGID:  		case AUDIT_FSGID: -		case AUDIT_LOGINUID: +		case AUDIT_OBJ_GID: +			/* bit ops not implemented for gid comparisons */ +			if (f->op == Audit_bitmask || f->op == Audit_bittest) +				goto exit_free; + +			f->gid = make_kgid(current_user_ns(), f->val); +			if (!gid_valid(f->gid)) +				goto exit_free; +			break; +		case AUDIT_PID:  		case AUDIT_PERS:  		case AUDIT_MSGTYPE:  		case AUDIT_PPID: @@ -461,8 +499,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,  		case AUDIT_ARG1:  		case AUDIT_ARG2:  		case AUDIT_ARG3: -		case AUDIT_OBJ_UID: -		case AUDIT_OBJ_GID:  			break;  		case AUDIT_ARCH:  			entry->rule.arch_f = f; @@ -707,6 +743,23 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)  			if (strcmp(a->filterkey, b->filterkey))  				return 1;  			break; +		case AUDIT_UID: +		case AUDIT_EUID: +		case AUDIT_SUID: +		case AUDIT_FSUID: +		case AUDIT_LOGINUID: +		case AUDIT_OBJ_UID: +			if (!uid_eq(a->fields[i].uid, b->fields[i].uid)) +				return 1; +			break; +		case AUDIT_GID: +		case AUDIT_EGID: +		case AUDIT_SGID: +		case AUDIT_FSGID: +		case AUDIT_OBJ_GID: +			if (!gid_eq(a->fields[i].gid, b->fields[i].gid)) +				return 1; +			break;  		default:  			if (a->fields[i].val != b->fields[i].val)  				return 1; @@ -1056,7 +1109,7 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)  }  /* Log rule additions and removals */ -static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, +static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,  				  char *action, struct audit_krule *rule,  				  int res)  { @@ -1068,7 +1121,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,  	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);  	if (!ab)  		return; -	audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); +	audit_log_format(ab, "auid=%u ses=%u", +			 from_kuid(&init_user_ns, loginuid), sessionid);  	if (sid) {  		char *ctx = NULL;  		u32 len; @@ -1098,8 +1152,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,   * @sessionid: sessionid for netlink audit message   * @sid: SE Linux Security ID of sender   */ -int audit_receive_filter(int type, int pid, int uid, int seq, void *data, -			 size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) +int audit_receive_filter(int type, int pid, int seq, void *data, +			 size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid)  {  	struct task_struct *tsk;  	struct audit_netlink_list *dest; @@ -1198,46 +1252,110 @@ int audit_comparator(u32 left, u32 op, u32 right)  	}  } -/* Compare given dentry name with last component in given path, - * return of 0 indicates a match. 
*/ -int audit_compare_dname_path(const char *dname, const char *path, -			     int *dirlen) +int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)  { -	int dlen, plen; -	const char *p; +	switch (op) { +	case Audit_equal: +		return uid_eq(left, right); +	case Audit_not_equal: +		return !uid_eq(left, right); +	case Audit_lt: +		return uid_lt(left, right); +	case Audit_le: +		return uid_lte(left, right); +	case Audit_gt: +		return uid_gt(left, right); +	case Audit_ge: +		return uid_gte(left, right); +	case Audit_bitmask: +	case Audit_bittest: +	default: +		BUG(); +		return 0; +	} +} -	if (!dname || !path) -		return 1; +int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) +{ +	switch (op) { +	case Audit_equal: +		return gid_eq(left, right); +	case Audit_not_equal: +		return !gid_eq(left, right); +	case Audit_lt: +		return gid_lt(left, right); +	case Audit_le: +		return gid_lte(left, right); +	case Audit_gt: +		return gid_gt(left, right); +	case Audit_ge: +		return gid_gte(left, right); +	case Audit_bitmask: +	case Audit_bittest: +	default: +		BUG(); +		return 0; +	} +} + +/** + * parent_len - find the length of the parent portion of a pathname + * @path: pathname of which to determine length + */ +int parent_len(const char *path) +{ +	int plen; +	const char *p; -	dlen = strlen(dname);  	plen = strlen(path); -	if (plen < dlen) -		return 1; + +	if (plen == 0) +		return plen;  	/* disregard trailing slashes */  	p = path + plen - 1;  	while ((*p == '/') && (p > path))  		p--; -	/* find last path component */ -	p = p - dlen + 1; -	if (p < path) +	/* walk backward until we find the next slash or hit beginning */ +	while ((*p != '/') && (p > path)) +		p--; + +	/* did we find a slash? Then increment to include it in path */ +	if (*p == '/') +		p++; + +	return p - path; +} + +/** + * audit_compare_dname_path - compare given dentry name with last component in + * 			      given path. Return of 0 indicates a match. + * @dname:	dentry name that we're comparing + * @path:	full pathname that we're comparing + * @parentlen:	length of the parent if known. Passing in AUDIT_NAME_FULL + * 		here indicates that we must compute this value. + */ +int audit_compare_dname_path(const char *dname, const char *path, int parentlen) +{ +	int dlen, pathlen; +	const char *p; + +	dlen = strlen(dname); +	pathlen = strlen(path); +	if (pathlen < dlen)  		return 1; -	else if (p > path) { -		if (*--p != '/') -			return 1; -		else -			p++; -	} -	/* return length of path's directory component */ -	if (dirlen) -		*dirlen = p - path; +	parentlen = parentlen == AUDIT_NAME_FULL ? 
parent_len(path) : parentlen; +	if (pathlen - parentlen != dlen) +		return 1; + +	p = path + parentlen; +  	return strncmp(p, dname, dlen);  } -static int audit_filter_user_rules(struct netlink_skb_parms *cb, -				   struct audit_krule *rule, +static int audit_filter_user_rules(struct audit_krule *rule,  				   enum audit_state *state)  {  	int i; @@ -1249,17 +1367,17 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,  		switch (f->type) {  		case AUDIT_PID: -			result = audit_comparator(cb->creds.pid, f->op, f->val); +			result = audit_comparator(task_pid_vnr(current), f->op, f->val);  			break;  		case AUDIT_UID: -			result = audit_comparator(cb->creds.uid, f->op, f->val); +			result = audit_uid_comparator(current_uid(), f->op, f->uid);  			break;  		case AUDIT_GID: -			result = audit_comparator(cb->creds.gid, f->op, f->val); +			result = audit_gid_comparator(current_gid(), f->op, f->gid);  			break;  		case AUDIT_LOGINUID: -			result = audit_comparator(audit_get_loginuid(current), -						  f->op, f->val); +			result = audit_uid_comparator(audit_get_loginuid(current), +						  f->op, f->uid);  			break;  		case AUDIT_SUBJ_USER:  		case AUDIT_SUBJ_ROLE: @@ -1287,7 +1405,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,  	return 1;  } -int audit_filter_user(struct netlink_skb_parms *cb) +int audit_filter_user(void)  {  	enum audit_state state = AUDIT_DISABLED;  	struct audit_entry *e; @@ -1295,7 +1413,7 @@ int audit_filter_user(struct netlink_skb_parms *cb)  	rcu_read_lock();  	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { -		if (audit_filter_user_rules(cb, &e->rule, &state)) { +		if (audit_filter_user_rules(&e->rule, &state)) {  			if (state == AUDIT_DISABLED)  				ret = 0;  			break; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4b96415527b..2f186ed80c4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -81,9 +81,6 @@   * a name dynamically and also add those to the list anchored by names_list. */  #define AUDIT_NAMES	5 -/* Indicates that audit should log the full pathname. */ -#define AUDIT_NAME_FULL -1 -  /* no execve audit message should be longer than this (userspace limits) */  #define MAX_EXECVE_AUDIT_LEN 7500 @@ -106,27 +103,29 @@ struct audit_cap_data {   * we don't let putname() free it (instead we free all of the saved   * pointers at syscall exit time).   * - * Further, in fs/namei.c:path_lookup() we store the inode and device. */ + * Further, in fs/namei.c:path_lookup() we store the inode and device. + */  struct audit_names { -	struct list_head list;		/* audit_context->names_list */ -	const char	*name; -	unsigned long	ino; -	dev_t		dev; -	umode_t		mode; -	uid_t		uid; -	gid_t		gid; -	dev_t		rdev; -	u32		osid; -	struct audit_cap_data fcap; -	unsigned int	fcap_ver; -	int		name_len;	/* number of name's characters to log */ -	bool		name_put;	/* call __putname() for this name */ +	struct list_head	list;		/* audit_context->names_list */ +	struct filename	*name; +	unsigned long		ino; +	dev_t			dev; +	umode_t			mode; +	kuid_t			uid; +	kgid_t			gid; +	dev_t			rdev; +	u32			osid; +	struct audit_cap_data	 fcap; +	unsigned int		fcap_ver; +	int			name_len;	/* number of name's characters to log */ +	unsigned char		type;		/* record type */ +	bool			name_put;	/* call __putname() for this name */  	/*  	 * This was an allocated audit_names and not from the array of  	 * names allocated in the task audit context.  
Thus this name  	 * should be freed on syscall exit  	 */ -	bool		should_free; +	bool			should_free;  };  struct audit_aux_data { @@ -149,8 +148,8 @@ struct audit_aux_data_execve {  struct audit_aux_data_pids {  	struct audit_aux_data	d;  	pid_t			target_pid[AUDIT_AUX_PIDS]; -	uid_t			target_auid[AUDIT_AUX_PIDS]; -	uid_t			target_uid[AUDIT_AUX_PIDS]; +	kuid_t			target_auid[AUDIT_AUX_PIDS]; +	kuid_t			target_uid[AUDIT_AUX_PIDS];  	unsigned int		target_sessionid[AUDIT_AUX_PIDS];  	u32			target_sid[AUDIT_AUX_PIDS];  	char 			target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; @@ -208,14 +207,14 @@ struct audit_context {  	size_t sockaddr_len;  				/* Save things to print about task_struct */  	pid_t		    pid, ppid; -	uid_t		    uid, euid, suid, fsuid; -	gid_t		    gid, egid, sgid, fsgid; +	kuid_t		    uid, euid, suid, fsuid; +	kgid_t		    gid, egid, sgid, fsgid;  	unsigned long	    personality;  	int		    arch;  	pid_t		    target_pid; -	uid_t		    target_auid; -	uid_t		    target_uid; +	kuid_t		    target_auid; +	kuid_t		    target_uid;  	unsigned int	    target_sessionid;  	u32		    target_sid;  	char		    target_comm[TASK_COMM_LEN]; @@ -231,8 +230,8 @@ struct audit_context {  			long args[6];  		} socketcall;  		struct { -			uid_t			uid; -			gid_t			gid; +			kuid_t			uid; +			kgid_t			gid;  			umode_t			mode;  			u32			osid;  			int			has_perm; @@ -464,37 +463,47 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)  	return 0;  } -static int audit_compare_id(uid_t uid1, -			    struct audit_names *name, -			    unsigned long name_offset, -			    struct audit_field *f, -			    struct audit_context *ctx) +static int audit_compare_uid(kuid_t uid, +			     struct audit_names *name, +			     struct audit_field *f, +			     struct audit_context *ctx)  {  	struct audit_names *n; -	unsigned long addr; -	uid_t uid2;  	int rc; - -	BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); - +   	if (name) { -		addr = (unsigned long)name; -		addr += name_offset; - -		uid2 = *(uid_t *)addr; -		rc = audit_comparator(uid1, f->op, uid2); +		rc = audit_uid_comparator(uid, f->op, name->uid);  		if (rc)  			return rc;  	} - +   	if (ctx) {  		list_for_each_entry(n, &ctx->names_list, list) { -			addr = (unsigned long)n; -			addr += name_offset; - -			uid2 = *(uid_t *)addr; +			rc = audit_uid_comparator(uid, f->op, n->uid); +			if (rc) +				return rc; +		} +	} +	return 0; +} -			rc = audit_comparator(uid1, f->op, uid2); +static int audit_compare_gid(kgid_t gid, +			     struct audit_names *name, +			     struct audit_field *f, +			     struct audit_context *ctx) +{ +	struct audit_names *n; +	int rc; +  +	if (name) { +		rc = audit_gid_comparator(gid, f->op, name->gid); +		if (rc) +			return rc; +	} +  +	if (ctx) { +		list_for_each_entry(n, &ctx->names_list, list) { +			rc = audit_gid_comparator(gid, f->op, n->gid);  			if (rc)  				return rc;  		} @@ -511,80 +520,62 @@ static int audit_field_compare(struct task_struct *tsk,  	switch (f->val) {  	/* process to file object comparisons */  	case AUDIT_COMPARE_UID_TO_OBJ_UID: -		return audit_compare_id(cred->uid, -					name, offsetof(struct audit_names, uid), -					f, ctx); +		return audit_compare_uid(cred->uid, name, f, ctx);  	case AUDIT_COMPARE_GID_TO_OBJ_GID: -		return audit_compare_id(cred->gid, -					name, offsetof(struct audit_names, gid), -					f, ctx); +		return audit_compare_gid(cred->gid, name, f, ctx);  	case AUDIT_COMPARE_EUID_TO_OBJ_UID: -		return audit_compare_id(cred->euid, -					name, offsetof(struct audit_names, uid), -					f, ctx); +		
return audit_compare_uid(cred->euid, name, f, ctx);  	case AUDIT_COMPARE_EGID_TO_OBJ_GID: -		return audit_compare_id(cred->egid, -					name, offsetof(struct audit_names, gid), -					f, ctx); +		return audit_compare_gid(cred->egid, name, f, ctx);  	case AUDIT_COMPARE_AUID_TO_OBJ_UID: -		return audit_compare_id(tsk->loginuid, -					name, offsetof(struct audit_names, uid), -					f, ctx); +		return audit_compare_uid(tsk->loginuid, name, f, ctx);  	case AUDIT_COMPARE_SUID_TO_OBJ_UID: -		return audit_compare_id(cred->suid, -					name, offsetof(struct audit_names, uid), -					f, ctx); +		return audit_compare_uid(cred->suid, name, f, ctx);  	case AUDIT_COMPARE_SGID_TO_OBJ_GID: -		return audit_compare_id(cred->sgid, -					name, offsetof(struct audit_names, gid), -					f, ctx); +		return audit_compare_gid(cred->sgid, name, f, ctx);  	case AUDIT_COMPARE_FSUID_TO_OBJ_UID: -		return audit_compare_id(cred->fsuid, -					name, offsetof(struct audit_names, uid), -					f, ctx); +		return audit_compare_uid(cred->fsuid, name, f, ctx);  	case AUDIT_COMPARE_FSGID_TO_OBJ_GID: -		return audit_compare_id(cred->fsgid, -					name, offsetof(struct audit_names, gid), -					f, ctx); +		return audit_compare_gid(cred->fsgid, name, f, ctx);  	/* uid comparisons */  	case AUDIT_COMPARE_UID_TO_AUID: -		return audit_comparator(cred->uid, f->op, tsk->loginuid); +		return audit_uid_comparator(cred->uid, f->op, tsk->loginuid);  	case AUDIT_COMPARE_UID_TO_EUID: -		return audit_comparator(cred->uid, f->op, cred->euid); +		return audit_uid_comparator(cred->uid, f->op, cred->euid);  	case AUDIT_COMPARE_UID_TO_SUID: -		return audit_comparator(cred->uid, f->op, cred->suid); +		return audit_uid_comparator(cred->uid, f->op, cred->suid);  	case AUDIT_COMPARE_UID_TO_FSUID: -		return audit_comparator(cred->uid, f->op, cred->fsuid); +		return audit_uid_comparator(cred->uid, f->op, cred->fsuid);  	/* auid comparisons */  	case AUDIT_COMPARE_AUID_TO_EUID: -		return audit_comparator(tsk->loginuid, f->op, cred->euid); +		return audit_uid_comparator(tsk->loginuid, f->op, cred->euid);  	case AUDIT_COMPARE_AUID_TO_SUID: -		return audit_comparator(tsk->loginuid, f->op, cred->suid); +		return audit_uid_comparator(tsk->loginuid, f->op, cred->suid);  	case AUDIT_COMPARE_AUID_TO_FSUID: -		return audit_comparator(tsk->loginuid, f->op, cred->fsuid); +		return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid);  	/* euid comparisons */  	case AUDIT_COMPARE_EUID_TO_SUID: -		return audit_comparator(cred->euid, f->op, cred->suid); +		return audit_uid_comparator(cred->euid, f->op, cred->suid);  	case AUDIT_COMPARE_EUID_TO_FSUID: -		return audit_comparator(cred->euid, f->op, cred->fsuid); +		return audit_uid_comparator(cred->euid, f->op, cred->fsuid);  	/* suid comparisons */  	case AUDIT_COMPARE_SUID_TO_FSUID: -		return audit_comparator(cred->suid, f->op, cred->fsuid); +		return audit_uid_comparator(cred->suid, f->op, cred->fsuid);  	/* gid comparisons */  	case AUDIT_COMPARE_GID_TO_EGID: -		return audit_comparator(cred->gid, f->op, cred->egid); +		return audit_gid_comparator(cred->gid, f->op, cred->egid);  	case AUDIT_COMPARE_GID_TO_SGID: -		return audit_comparator(cred->gid, f->op, cred->sgid); +		return audit_gid_comparator(cred->gid, f->op, cred->sgid);  	case AUDIT_COMPARE_GID_TO_FSGID: -		return audit_comparator(cred->gid, f->op, cred->fsgid); +		return audit_gid_comparator(cred->gid, f->op, cred->fsgid);  	/* egid comparisons */  	case AUDIT_COMPARE_EGID_TO_SGID: -		return audit_comparator(cred->egid, f->op, cred->sgid); +		return 
audit_gid_comparator(cred->egid, f->op, cred->sgid);  	case AUDIT_COMPARE_EGID_TO_FSGID: -		return audit_comparator(cred->egid, f->op, cred->fsgid); +		return audit_gid_comparator(cred->egid, f->op, cred->fsgid);  	/* sgid comparison */  	case AUDIT_COMPARE_SGID_TO_FSGID: -		return audit_comparator(cred->sgid, f->op, cred->fsgid); +		return audit_gid_comparator(cred->sgid, f->op, cred->fsgid);  	default:  		WARN(1, "Missing AUDIT_COMPARE define.  Report as a bug\n");  		return 0; @@ -630,28 +621,28 @@ static int audit_filter_rules(struct task_struct *tsk,  			}  			break;  		case AUDIT_UID: -			result = audit_comparator(cred->uid, f->op, f->val); +			result = audit_uid_comparator(cred->uid, f->op, f->uid);  			break;  		case AUDIT_EUID: -			result = audit_comparator(cred->euid, f->op, f->val); +			result = audit_uid_comparator(cred->euid, f->op, f->uid);  			break;  		case AUDIT_SUID: -			result = audit_comparator(cred->suid, f->op, f->val); +			result = audit_uid_comparator(cred->suid, f->op, f->uid);  			break;  		case AUDIT_FSUID: -			result = audit_comparator(cred->fsuid, f->op, f->val); +			result = audit_uid_comparator(cred->fsuid, f->op, f->uid);  			break;  		case AUDIT_GID: -			result = audit_comparator(cred->gid, f->op, f->val); +			result = audit_gid_comparator(cred->gid, f->op, f->gid);  			break;  		case AUDIT_EGID: -			result = audit_comparator(cred->egid, f->op, f->val); +			result = audit_gid_comparator(cred->egid, f->op, f->gid);  			break;  		case AUDIT_SGID: -			result = audit_comparator(cred->sgid, f->op, f->val); +			result = audit_gid_comparator(cred->sgid, f->op, f->gid);  			break;  		case AUDIT_FSGID: -			result = audit_comparator(cred->fsgid, f->op, f->val); +			result = audit_gid_comparator(cred->fsgid, f->op, f->gid);  			break;  		case AUDIT_PERS:  			result = audit_comparator(tsk->personality, f->op, f->val); @@ -717,10 +708,10 @@ static int audit_filter_rules(struct task_struct *tsk,  			break;  		case AUDIT_OBJ_UID:  			if (name) { -				result = audit_comparator(name->uid, f->op, f->val); +				result = audit_uid_comparator(name->uid, f->op, f->uid);  			} else if (ctx) {  				list_for_each_entry(n, &ctx->names_list, list) { -					if (audit_comparator(n->uid, f->op, f->val)) { +					if (audit_uid_comparator(n->uid, f->op, f->uid)) {  						++result;  						break;  					} @@ -729,10 +720,10 @@ static int audit_filter_rules(struct task_struct *tsk,  			break;  		case AUDIT_OBJ_GID:  			if (name) { -				result = audit_comparator(name->gid, f->op, f->val); +				result = audit_gid_comparator(name->gid, f->op, f->gid);  			} else if (ctx) {  				list_for_each_entry(n, &ctx->names_list, list) { -					if (audit_comparator(n->gid, f->op, f->val)) { +					if (audit_gid_comparator(n->gid, f->op, f->gid)) {  						++result;  						break;  					} @@ -750,7 +741,7 @@ static int audit_filter_rules(struct task_struct *tsk,  		case AUDIT_LOGINUID:  			result = 0;  			if (ctx) -				result = audit_comparator(tsk->loginuid, f->op, f->val); +				result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);  			break;  		case AUDIT_SUBJ_USER:  		case AUDIT_SUBJ_ROLE: @@ -1006,7 +997,7 @@ static inline void audit_free_names(struct audit_context *context)  		       context->ino_count);  		list_for_each_entry(n, &context->names_list, list) {  			printk(KERN_ERR "names[%d] = %p = %s\n", i, -			       n->name, n->name ?: "(null)"); +			       n->name, n->name->name ?: "(null)");  		}  		dump_stack();  		return; @@ -1154,13 +1145,43 @@ error_path:  EXPORT_SYMBOL(audit_log_task_context); 
-static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) +void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)  { +	const struct cred *cred;  	char name[sizeof(tsk->comm)];  	struct mm_struct *mm = tsk->mm; -	struct vm_area_struct *vma; +	char *tty; + +	if (!ab) +		return;  	/* tsk == current */ +	cred = current_cred(); + +	spin_lock_irq(&tsk->sighand->siglock); +	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) +		tty = tsk->signal->tty->name; +	else +		tty = "(none)"; +	spin_unlock_irq(&tsk->sighand->siglock); + + +	audit_log_format(ab, +			 " ppid=%ld pid=%d auid=%u uid=%u gid=%u" +			 " euid=%u suid=%u fsuid=%u" +			 " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", +			 sys_getppid(), +			 tsk->pid, +			 from_kuid(&init_user_ns, tsk->loginuid), +			 from_kuid(&init_user_ns, cred->uid), +			 from_kgid(&init_user_ns, cred->gid), +			 from_kuid(&init_user_ns, cred->euid), +			 from_kuid(&init_user_ns, cred->suid), +			 from_kuid(&init_user_ns, cred->fsuid), +			 from_kgid(&init_user_ns, cred->egid), +			 from_kgid(&init_user_ns, cred->sgid), +			 from_kgid(&init_user_ns, cred->fsgid), +			 tsk->sessionid, tty);  	get_task_comm(name, tsk);  	audit_log_format(ab, " comm="); @@ -1168,23 +1189,17 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk  	if (mm) {  		down_read(&mm->mmap_sem); -		vma = mm->mmap; -		while (vma) { -			if ((vma->vm_flags & VM_EXECUTABLE) && -			    vma->vm_file) { -				audit_log_d_path(ab, " exe=", -						 &vma->vm_file->f_path); -				break; -			} -			vma = vma->vm_next; -		} +		if (mm->exe_file) +			audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);  		up_read(&mm->mmap_sem);  	}  	audit_log_task_context(ab);  } +EXPORT_SYMBOL(audit_log_task_info); +  static int audit_log_pid_context(struct audit_context *context, pid_t pid, -				 uid_t auid, uid_t uid, unsigned int sessionid, +				 kuid_t auid, kuid_t uid, unsigned int sessionid,  				 u32 sid, char *comm)  {  	struct audit_buffer *ab; @@ -1196,8 +1211,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,  	if (!ab)  		return rc; -	audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, -			 uid, sessionid); +	audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, +			 from_kuid(&init_user_ns, auid), +			 from_kuid(&init_user_ns, uid), sessionid);  	if (security_secid_to_secctx(sid, &ctx, &len)) {  		audit_log_format(ab, " obj=(none)");  		rc = 1; @@ -1447,7 +1463,9 @@ static void show_special(struct audit_context *context, int *call_panic)  		u32 osid = context->ipc.osid;  		audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", -			 context->ipc.uid, context->ipc.gid, context->ipc.mode); +				 from_kuid(&init_user_ns, context->ipc.uid), +				 from_kgid(&init_user_ns, context->ipc.gid), +				 context->ipc.mode);  		if (osid) {  			char *ctx = NULL;  			u32 len; @@ -1536,7 +1554,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,  		case AUDIT_NAME_FULL:  			/* log the full path */  			audit_log_format(ab, " name="); -			audit_log_untrustedstring(ab, n->name); +			audit_log_untrustedstring(ab, n->name->name);  			break;  		case 0:  			/* name was specified as a relative path and the @@ -1546,7 +1564,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,  		default:  			/* log the name's directory component */  			audit_log_format(ab, " name="); -			audit_log_n_untrustedstring(ab, n->name, +			audit_log_n_untrustedstring(ab, 
n->name->name,  						    n->name_len);  		}  	} else @@ -1560,8 +1578,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,  				 MAJOR(n->dev),  				 MINOR(n->dev),  				 n->mode, -				 n->uid, -				 n->gid, +				 from_kuid(&init_user_ns, n->uid), +				 from_kgid(&init_user_ns, n->gid),  				 MAJOR(n->rdev),  				 MINOR(n->rdev));  	} @@ -1585,26 +1603,12 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,  static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)  { -	const struct cred *cred;  	int i, call_panic = 0;  	struct audit_buffer *ab;  	struct audit_aux_data *aux; -	const char *tty;  	struct audit_names *n;  	/* tsk == current */ -	context->pid = tsk->pid; -	if (!context->ppid) -		context->ppid = sys_getppid(); -	cred = current_cred(); -	context->uid   = cred->uid; -	context->gid   = cred->gid; -	context->euid  = cred->euid; -	context->suid  = cred->suid; -	context->fsuid = cred->fsuid; -	context->egid  = cred->egid; -	context->sgid  = cred->sgid; -	context->fsgid = cred->fsgid;  	context->personality = tsk->personality;  	ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); @@ -1619,32 +1623,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts  				 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",  				 context->return_code); -	spin_lock_irq(&tsk->sighand->siglock); -	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) -		tty = tsk->signal->tty->name; -	else -		tty = "(none)"; -	spin_unlock_irq(&tsk->sighand->siglock); -  	audit_log_format(ab, -		  " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" -		  " ppid=%d pid=%d auid=%u uid=%u gid=%u" -		  " euid=%u suid=%u fsuid=%u" -		  " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", -		  context->argv[0], -		  context->argv[1], -		  context->argv[2], -		  context->argv[3], -		  context->name_count, -		  context->ppid, -		  context->pid, -		  tsk->loginuid, -		  context->uid, -		  context->gid, -		  context->euid, context->suid, context->fsuid, -		  context->egid, context->sgid, context->fsgid, tty, -		  tsk->sessionid); - +			 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d", +			 context->argv[0], +			 context->argv[1], +			 context->argv[2], +			 context->argv[3], +			 context->name_count);  	audit_log_task_info(ab, tsk);  	audit_log_key(ab, context->filterkey); @@ -2009,7 +1994,8 @@ retry:  #endif  } -static struct audit_names *audit_alloc_name(struct audit_context *context) +static struct audit_names *audit_alloc_name(struct audit_context *context, +						unsigned char type)  {  	struct audit_names *aname; @@ -2024,6 +2010,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)  	}  	aname->ino = (unsigned long)-1; +	aname->type = type;  	list_add_tail(&aname->list, &context->names_list);  	context->name_count++; @@ -2034,13 +2021,36 @@ static struct audit_names *audit_alloc_name(struct audit_context *context)  }  /** + * audit_reusename - fill out filename with info from existing entry + * @uptr: userland ptr to pathname + * + * Search the audit_names list for the current audit context. If there is an + * existing entry with a matching "uptr" then return the filename + * associated with that audit_name. If not, return NULL. 
+ */ +struct filename * +__audit_reusename(const __user char *uptr) +{ +	struct audit_context *context = current->audit_context; +	struct audit_names *n; + +	list_for_each_entry(n, &context->names_list, list) { +		if (!n->name) +			continue; +		if (n->name->uptr == uptr) +			return n->name; +	} +	return NULL; +} + +/**   * audit_getname - add a name to the list   * @name: name to add   *   * Add a name to the list of audit names for this context.   * Called from fs/namei.c:getname().   */ -void __audit_getname(const char *name) +void __audit_getname(struct filename *name)  {  	struct audit_context *context = current->audit_context;  	struct audit_names *n; @@ -2054,13 +2064,19 @@ void __audit_getname(const char *name)  		return;  	} -	n = audit_alloc_name(context); +#if AUDIT_DEBUG +	/* The filename _must_ have a populated ->name */ +	BUG_ON(!name->name); +#endif + +	n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);  	if (!n)  		return;  	n->name = name;  	n->name_len = AUDIT_NAME_FULL;  	n->name_put = true; +	name->aname = n;  	if (!context->pwd.dentry)  		get_fs_pwd(current->fs, &context->pwd); @@ -2073,7 +2089,7 @@ void __audit_getname(const char *name)   * then we delay the putname until syscall exit.   * Called from include/linux/fs.h:putname().   */ -void audit_putname(const char *name) +void audit_putname(struct filename *name)  {  	struct audit_context *context = current->audit_context; @@ -2088,7 +2104,7 @@ void audit_putname(const char *name)  			list_for_each_entry(n, &context->names_list, list)  				printk(KERN_ERR "name[%d] = %p = %s\n", i, -				       n->name, n->name ?: "(null)"); +				       n->name, n->name->name ?: "(null)");  			}  #endif  		__putname(name); @@ -2102,8 +2118,8 @@ void audit_putname(const char *name)  			       " put_count=%d\n",  			       __FILE__, __LINE__,  			       context->serial, context->major, -			       context->in_syscall, name, context->name_count, -			       context->put_count); +			       context->in_syscall, name->name, +			       context->name_count, context->put_count);  			dump_stack();  		}  	} @@ -2146,13 +2162,13 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent  }  /** - * audit_inode - store the inode and device from a lookup + * __audit_inode - store the inode and device from a lookup   * @name: name being audited   * @dentry: dentry being audited - * - * Called from fs/namei.c:path_lookup(). + * @parent: does this dentry represent the parent?   */ -void __audit_inode(const char *name, const struct dentry *dentry) +void __audit_inode(struct filename *name, const struct dentry *dentry, +		   unsigned int parent)  {  	struct audit_context *context = current->audit_context;  	const struct inode *inode = dentry->d_inode; @@ -2161,24 +2177,69 @@ void __audit_inode(const char *name, const struct dentry *dentry)  	if (!context->in_syscall)  		return; +	if (!name) +		goto out_alloc; + +#if AUDIT_DEBUG +	/* The struct filename _must_ have a populated ->name */ +	BUG_ON(!name->name); +#endif +	/* +	 * If we have a pointer to an audit_names entry already, then we can +	 * just use it directly if the type is correct. +	 */ +	n = name->aname; +	if (n) { +		if (parent) { +			if (n->type == AUDIT_TYPE_PARENT || +			    n->type == AUDIT_TYPE_UNKNOWN) +				goto out; +		} else { +			if (n->type != AUDIT_TYPE_PARENT) +				goto out; +		} +	} +  	list_for_each_entry_reverse(n, &context->names_list, list) { -		if (n->name && (n->name == name)) -			goto out; +		/* does the name pointer match? 
*/ +		if (!n->name || n->name->name != name->name) +			continue; + +		/* match the correct record type */ +		if (parent) { +			if (n->type == AUDIT_TYPE_PARENT || +			    n->type == AUDIT_TYPE_UNKNOWN) +				goto out; +		} else { +			if (n->type != AUDIT_TYPE_PARENT) +				goto out; +		}  	} -	/* unable to find the name from a previous getname() */ -	n = audit_alloc_name(context); +out_alloc: +	/* unable to find the name from a previous getname(). Allocate a new +	 * anonymous entry. +	 */ +	n = audit_alloc_name(context, AUDIT_TYPE_NORMAL);  	if (!n)  		return;  out: +	if (parent) { +		n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; +		n->type = AUDIT_TYPE_PARENT; +	} else { +		n->name_len = AUDIT_NAME_FULL; +		n->type = AUDIT_TYPE_NORMAL; +	}  	handle_path(dentry);  	audit_copy_inode(n, dentry, inode);  }  /** - * audit_inode_child - collect inode info for created/removed objects - * @dentry: dentry being audited + * __audit_inode_child - collect inode info for created/removed objects   * @parent: inode of dentry parent + * @dentry: dentry being audited + * @type:   AUDIT_TYPE_* value that we're looking for   *   * For syscalls that create or remove filesystem objects, audit_inode   * can only collect information for the filesystem object's parent. @@ -2188,15 +2249,14 @@ out:   * must be hooked prior, in order to capture the target inode during   * unsuccessful attempts.   */ -void __audit_inode_child(const struct dentry *dentry, -			 const struct inode *parent) +void __audit_inode_child(const struct inode *parent, +			 const struct dentry *dentry, +			 const unsigned char type)  {  	struct audit_context *context = current->audit_context; -	const char *found_parent = NULL, *found_child = NULL;  	const struct inode *inode = dentry->d_inode;  	const char *dname = dentry->d_name.name; -	struct audit_names *n; -	int dirlen = 0; +	struct audit_names *n, *found_parent = NULL, *found_child = NULL;  	if (!context->in_syscall)  		return; @@ -2204,62 +2264,65 @@ void __audit_inode_child(const struct dentry *dentry,  	if (inode)  		handle_one(inode); -	/* parent is more likely, look for it first */ +	/* look for a parent entry first */  	list_for_each_entry(n, &context->names_list, list) { -		if (!n->name) +		if (!n->name || n->type != AUDIT_TYPE_PARENT)  			continue;  		if (n->ino == parent->i_ino && -		    !audit_compare_dname_path(dname, n->name, &dirlen)) { -			n->name_len = dirlen; /* update parent data in place */ -			found_parent = n->name; -			goto add_names; +		    !audit_compare_dname_path(dname, n->name->name, n->name_len)) { +			found_parent = n; +			break;  		}  	} -	/* no matching parent, look for matching child */ +	/* is there a matching child entry? */  	list_for_each_entry(n, &context->names_list, list) { -		if (!n->name) +		/* can only match entries that have a name */ +		if (!n->name || n->type != type)  			continue; -		/* strcmp() is the more likely scenario */ -		if (!strcmp(dname, n->name) || -		     !audit_compare_dname_path(dname, n->name, &dirlen)) { -			if (inode) -				audit_copy_inode(n, NULL, inode); -			else -				n->ino = (unsigned long)-1; -			found_child = n->name; -			goto add_names; +		/* if we found a parent, make sure this one is a child of it */ +		if (found_parent && (n->name != found_parent->name)) +			continue; + +		if (!strcmp(dname, n->name->name) || +		    !audit_compare_dname_path(dname, n->name->name, +						found_parent ? 
+						found_parent->name_len : +						AUDIT_NAME_FULL)) { +			found_child = n; +			break;  		}  	} -add_names:  	if (!found_parent) { -		n = audit_alloc_name(context); +		/* create a new, "anonymous" parent record */ +		n = audit_alloc_name(context, AUDIT_TYPE_PARENT);  		if (!n)  			return;  		audit_copy_inode(n, NULL, parent);  	}  	if (!found_child) { -		n = audit_alloc_name(context); -		if (!n) +		found_child = audit_alloc_name(context, type); +		if (!found_child)  			return;  		/* Re-use the name belonging to the slot for a matching parent  		 * directory. All names for this context are relinquished in  		 * audit_free_names() */  		if (found_parent) { -			n->name = found_parent; -			n->name_len = AUDIT_NAME_FULL; +			found_child->name = found_parent->name; +			found_child->name_len = AUDIT_NAME_FULL;  			/* don't call __putname() */ -			n->name_put = false; +			found_child->name_put = false;  		} - -		if (inode) -			audit_copy_inode(n, NULL, inode);  	} +	if (inode) +		audit_copy_inode(found_child, dentry, inode); +	else +		found_child->ino = (unsigned long)-1;  }  EXPORT_SYMBOL_GPL(__audit_inode_child); @@ -2299,14 +2362,14 @@ static atomic_t session_id = ATOMIC_INIT(0);   *   * Called (set) from fs/proc/base.c::proc_loginuid_write().   */ -int audit_set_loginuid(uid_t loginuid) +int audit_set_loginuid(kuid_t loginuid)  {  	struct task_struct *task = current;  	struct audit_context *context = task->audit_context;  	unsigned int sessionid;  #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE -	if (task->loginuid != -1) +	if (uid_valid(task->loginuid))  		return -EPERM;  #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */  	if (!capable(CAP_AUDIT_CONTROL)) @@ -2322,8 +2385,10 @@ int audit_set_loginuid(uid_t loginuid)  			audit_log_format(ab, "login pid=%d uid=%u "  				"old auid=%u new auid=%u"  				" old ses=%u new ses=%u", -				task->pid, task_uid(task), -				task->loginuid, loginuid, +				task->pid, +				from_kuid(&init_user_ns, task_uid(task)), +				from_kuid(&init_user_ns, task->loginuid), +				from_kuid(&init_user_ns, loginuid),  				task->sessionid, sessionid);  			audit_log_end(ab);  		} @@ -2546,12 +2611,12 @@ int __audit_signal_info(int sig, struct task_struct *t)  	struct audit_aux_data_pids *axp;  	struct task_struct *tsk = current;  	struct audit_context *ctx = tsk->audit_context; -	uid_t uid = current_uid(), t_uid = task_uid(t); +	kuid_t uid = current_uid(), t_uid = task_uid(t);  	if (audit_pid && t->tgid == audit_pid) {  		if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {  			audit_sig_pid = tsk->pid; -			if (tsk->loginuid != -1) +			if (uid_valid(tsk->loginuid))  				audit_sig_uid = tsk->loginuid;  			else  				audit_sig_uid = uid; @@ -2672,8 +2737,8 @@ void __audit_mmap_fd(int fd, int flags)  static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)  { -	uid_t auid, uid; -	gid_t gid; +	kuid_t auid, uid; +	kgid_t gid;  	unsigned int sessionid;  	auid = audit_get_loginuid(current); @@ -2681,7 +2746,10 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)  	current_uid_gid(&uid, &gid);  	audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", -			 auid, uid, gid, sessionid); +			 from_kuid(&init_user_ns, auid), +			 from_kuid(&init_user_ns, uid), +			 from_kgid(&init_user_ns, gid), +			 sessionid);  	audit_log_task_context(ab);  	audit_log_format(ab, " pid=%d comm=", current->pid);  	audit_log_untrustedstring(ab, current->comm); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 79818507e44..f24f724620d 100644 --- 
a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -88,11 +88,12 @@ static DEFINE_MUTEX(cgroup_root_mutex);  /*   * Generate an array of cgroup subsystem pointers. At boot time, this is - * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are + * populated with the built in subsystems, and modular subsystems are   * registered after that. The mutable section of this array is protected by   * cgroup_mutex.   */ -#define SUBSYS(_x) &_x ## _subsys, +#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, +#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)  static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {  #include <linux/cgroup_subsys.h>  }; @@ -111,13 +112,13 @@ struct cgroupfs_root {  	 * The bitmask of subsystems intended to be attached to this  	 * hierarchy  	 */ -	unsigned long subsys_bits; +	unsigned long subsys_mask;  	/* Unique id for this hierarchy. */  	int hierarchy_id;  	/* The bitmask of subsystems currently attached to this hierarchy */ -	unsigned long actual_subsys_bits; +	unsigned long actual_subsys_mask;  	/* A list running through the attached subsystems */  	struct list_head subsys_list; @@ -276,7 +277,8 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)  /* bits in struct cgroupfs_root flags field */  enum { -	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ +	ROOT_NOPREFIX,	/* mounted subsystems have no named prefix */ +	ROOT_XATTR,	/* supports extended attributes */  };  static int cgroup_is_releasable(const struct cgroup *cgrp) @@ -556,7 +558,7 @@ static struct css_set *find_existing_css_set(  	 * won't change, so no need for locking.  	 */  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { -		if (root->subsys_bits & (1UL << i)) { +		if (root->subsys_mask & (1UL << i)) {  			/* Subsystem is in this hierarchy. 
So we want  			 * the subsystem state from the new  			 * cgroup */ @@ -824,7 +826,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);  static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); -static int cgroup_populate_dir(struct cgroup *cgrp); +static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, +			       unsigned long subsys_mask);  static const struct inode_operations cgroup_dir_inode_operations;  static const struct file_operations proc_cgroupstats_operations; @@ -912,15 +915,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)  		 */  		BUG_ON(!list_empty(&cgrp->pidlists)); +		simple_xattrs_free(&cgrp->xattrs); +  		kfree_rcu(cgrp, rcu_head);  	} else {  		struct cfent *cfe = __d_cfe(dentry);  		struct cgroup *cgrp = dentry->d_parent->d_fsdata; +		struct cftype *cft = cfe->type;  		WARN_ONCE(!list_empty(&cfe->node) &&  			  cgrp != &cgrp->root->top_cgroup,  			  "cfe still linked for %s\n", cfe->type->name);  		kfree(cfe); +		simple_xattrs_free(&cft->xattrs);  	}  	iput(inode);  } @@ -963,12 +970,29 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)  	return -ENOENT;  } -static void cgroup_clear_directory(struct dentry *dir) +/** + * cgroup_clear_directory - selective removal of base and subsystem files + * @dir: directory containing the files + * @base_files: true if the base files should be removed + * @subsys_mask: mask of the subsystem ids whose files should be removed + */ +static void cgroup_clear_directory(struct dentry *dir, bool base_files, +				   unsigned long subsys_mask)  {  	struct cgroup *cgrp = __d_cgrp(dir); +	struct cgroup_subsys *ss; -	while (!list_empty(&cgrp->files)) -		cgroup_rm_file(cgrp, NULL); +	for_each_subsys(cgrp->root, ss) { +		struct cftype_set *set; +		if (!test_bit(ss->subsys_id, &subsys_mask)) +			continue; +		list_for_each_entry(set, &ss->cftsets, node) +			cgroup_rm_file(cgrp, set->cfts); +	} +	if (base_files) { +		while (!list_empty(&cgrp->files)) +			cgroup_rm_file(cgrp, NULL); +	}  }  /* @@ -977,8 +1001,9 @@ static void cgroup_clear_directory(struct dentry *dir)  static void cgroup_d_remove_dir(struct dentry *dentry)  {  	struct dentry *parent; +	struct cgroupfs_root *root = dentry->d_sb->s_fs_info; -	cgroup_clear_directory(dentry); +	cgroup_clear_directory(dentry, true, root->subsys_mask);  	parent = dentry->d_parent;  	spin_lock(&parent->d_lock); @@ -1022,22 +1047,22 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)   * returns an error, no reference counts are touched.   
*/  static int rebind_subsystems(struct cgroupfs_root *root, -			      unsigned long final_bits) +			      unsigned long final_subsys_mask)  { -	unsigned long added_bits, removed_bits; +	unsigned long added_mask, removed_mask;  	struct cgroup *cgrp = &root->top_cgroup;  	int i;  	BUG_ON(!mutex_is_locked(&cgroup_mutex));  	BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); -	removed_bits = root->actual_subsys_bits & ~final_bits; -	added_bits = final_bits & ~root->actual_subsys_bits; +	removed_mask = root->actual_subsys_mask & ~final_subsys_mask; +	added_mask = final_subsys_mask & ~root->actual_subsys_mask;  	/* Check that any added subsystems are currently free */  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {  		unsigned long bit = 1UL << i;  		struct cgroup_subsys *ss = subsys[i]; -		if (!(bit & added_bits)) +		if (!(bit & added_mask))  			continue;  		/*  		 * Nobody should tell us to do a subsys that doesn't exist: @@ -1062,7 +1087,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,  	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {  		struct cgroup_subsys *ss = subsys[i];  		unsigned long bit = 1UL << i; -		if (bit & added_bits) { +		if (bit & added_mask) {  			/* We're binding this subsystem to this hierarchy */  			BUG_ON(ss == NULL);  			BUG_ON(cgrp->subsys[i]); @@ -1075,7 +1100,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,  			if (ss->bind)  				ss->bind(cgrp);  			/* refcount was already taken, and we're keeping it */ -		} else if (bit & removed_bits) { +		} else if (bit & removed_mask) {  			/* We're removing this subsystem */  			BUG_ON(ss == NULL);  			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); @@ -1088,7 +1113,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,  			list_move(&ss->sibling, &rootnode.subsys_list);  			/* subsystem is now free - drop reference on module */  			module_put(ss->module); -		} else if (bit & final_bits) { +		} else if (bit & final_subsys_mask) {  			/* Subsystem state should already exist */  			BUG_ON(ss == NULL);  			BUG_ON(!cgrp->subsys[i]); @@ -1105,7 +1130,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,  			BUG_ON(cgrp->subsys[i]);  		}  	} -	root->subsys_bits = root->actual_subsys_bits = final_bits; +	root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;  	synchronize_rcu();  	return 0; @@ -1121,6 +1146,8 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)  		seq_printf(seq, ",%s", ss->name);  	if (test_bit(ROOT_NOPREFIX, &root->flags))  		seq_puts(seq, ",noprefix"); +	if (test_bit(ROOT_XATTR, &root->flags)) +		seq_puts(seq, ",xattr");  	if (strlen(root->release_agent_path))  		seq_printf(seq, ",release_agent=%s", root->release_agent_path);  	if (clone_children(&root->top_cgroup)) @@ -1132,7 +1159,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)  }  struct cgroup_sb_opts { -	unsigned long subsys_bits; +	unsigned long subsys_mask;  	unsigned long flags;  	char *release_agent;  	bool clone_children; @@ -1189,6 +1216,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  			opts->clone_children = true;  			continue;  		} +		if (!strcmp(token, "xattr")) { +			set_bit(ROOT_XATTR, &opts->flags); +			continue; +		}  		if (!strncmp(token, "release_agent=", 14)) {  			/* Specifying two release agents is forbidden */  			if (opts->release_agent) @@ -1237,7 +1268,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  			/* Mutually exclusive option 'all' + subsystem name */  			if (all_ss)  	
			return -EINVAL; -			set_bit(i, &opts->subsys_bits); +			set_bit(i, &opts->subsys_mask);  			one_ss = true;  			break; @@ -1258,7 +1289,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  				continue;  			if (ss->disabled)  				continue; -			set_bit(i, &opts->subsys_bits); +			set_bit(i, &opts->subsys_mask);  		}  	} @@ -1270,19 +1301,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  	 * the cpuset subsystem.  	 */  	if (test_bit(ROOT_NOPREFIX, &opts->flags) && -	    (opts->subsys_bits & mask)) +	    (opts->subsys_mask & mask))  		return -EINVAL;  	/* Can't specify "none" and some subsystems */ -	if (opts->subsys_bits && opts->none) +	if (opts->subsys_mask && opts->none)  		return -EINVAL;  	/*  	 * We either have to specify by name or by subsystems. (So all  	 * empty hierarchies must have a name).  	 */ -	if (!opts->subsys_bits && !opts->name) +	if (!opts->subsys_mask && !opts->name)  		return -EINVAL;  	/* @@ -1291,10 +1322,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  	 * take duplicate reference counts on a subsystem that's already used,  	 * but rebind_subsystems handles this case.  	 */ -	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { +	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {  		unsigned long bit = 1UL << i; -		if (!(bit & opts->subsys_bits)) +		if (!(bit & opts->subsys_mask))  			continue;  		if (!try_module_get(subsys[i]->module)) {  			module_pin_failed = true; @@ -1307,11 +1338,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  		 * raced with a module_delete call, and to the user this is  		 * essentially a "subsystem doesn't exist" case.  		 */ -		for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { +		for (i--; i >= 0; i--) {  			/* drop refcounts only on the ones we took */  			unsigned long bit = 1UL << i; -			if (!(bit & opts->subsys_bits)) +			if (!(bit & opts->subsys_mask))  				continue;  			module_put(subsys[i]->module);  		} @@ -1321,13 +1352,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  	return 0;  } -static void drop_parsed_module_refcounts(unsigned long subsys_bits) +static void drop_parsed_module_refcounts(unsigned long subsys_mask)  {  	int i; -	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { +	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {  		unsigned long bit = 1UL << i; -		if (!(bit & subsys_bits)) +		if (!(bit & subsys_mask))  			continue;  		module_put(subsys[i]->module);  	} @@ -1339,6 +1370,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)  	struct cgroupfs_root *root = sb->s_fs_info;  	struct cgroup *cgrp = &root->top_cgroup;  	struct cgroup_sb_opts opts; +	unsigned long added_mask, removed_mask;  	mutex_lock(&cgrp->dentry->d_inode->i_mutex);  	mutex_lock(&cgroup_mutex); @@ -1350,27 +1382,31 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)  		goto out_unlock;  	/* See feature-removal-schedule.txt */ -	if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) +	if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)  		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",  			   task_tgid_nr(current), current->comm); +	added_mask = opts.subsys_mask & ~root->subsys_mask; +	removed_mask = root->subsys_mask & ~opts.subsys_mask; +  	/* Don't allow flags or name to change at remount */  	if (opts.flags != root->flags ||  	    (opts.name && 
strcmp(opts.name, root->name))) {  		ret = -EINVAL; -		drop_parsed_module_refcounts(opts.subsys_bits); +		drop_parsed_module_refcounts(opts.subsys_mask);  		goto out_unlock;  	} -	ret = rebind_subsystems(root, opts.subsys_bits); +	ret = rebind_subsystems(root, opts.subsys_mask);  	if (ret) { -		drop_parsed_module_refcounts(opts.subsys_bits); +		drop_parsed_module_refcounts(opts.subsys_mask);  		goto out_unlock;  	}  	/* clear out any existing files and repopulate subsystem files */ -	cgroup_clear_directory(cgrp->dentry); -	cgroup_populate_dir(cgrp); +	cgroup_clear_directory(cgrp->dentry, false, removed_mask); +	/* re-populate subsystem files */ +	cgroup_populate_dir(cgrp, false, added_mask);  	if (opts.release_agent)  		strcpy(root->release_agent_path, opts.release_agent); @@ -1401,6 +1437,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)  	mutex_init(&cgrp->pidlist_mutex);  	INIT_LIST_HEAD(&cgrp->event_list);  	spin_lock_init(&cgrp->event_list_lock); +	simple_xattrs_init(&cgrp->xattrs);  }  static void init_cgroup_root(struct cgroupfs_root *root) @@ -1455,8 +1492,8 @@ static int cgroup_test_super(struct super_block *sb, void *data)  	 * If we asked for subsystems (or explicitly for no  	 * subsystems) then they must match  	 */ -	if ((opts->subsys_bits || opts->none) -	    && (opts->subsys_bits != root->subsys_bits)) +	if ((opts->subsys_mask || opts->none) +	    && (opts->subsys_mask != root->subsys_mask))  		return 0;  	return 1; @@ -1466,7 +1503,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)  {  	struct cgroupfs_root *root; -	if (!opts->subsys_bits && !opts->none) +	if (!opts->subsys_mask && !opts->none)  		return NULL;  	root = kzalloc(sizeof(*root), GFP_KERNEL); @@ -1479,7 +1516,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)  	}  	init_cgroup_root(root); -	root->subsys_bits = opts->subsys_bits; +	root->subsys_mask = opts->subsys_mask;  	root->flags = opts->flags;  	if (opts->release_agent)  		strcpy(root->release_agent_path, opts->release_agent); @@ -1511,7 +1548,7 @@ static int cgroup_set_super(struct super_block *sb, void *data)  	if (!opts->new_root)  		return -EINVAL; -	BUG_ON(!opts->subsys_bits && !opts->none); +	BUG_ON(!opts->subsys_mask && !opts->none);  	ret = set_anon_super(sb, NULL);  	if (ret) @@ -1629,7 +1666,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		if (ret)  			goto unlock_drop; -		ret = rebind_subsystems(root, root->subsys_bits); +		ret = rebind_subsystems(root, root->subsys_mask);  		if (ret == -EBUSY) {  			free_cg_links(&tmp_cg_links);  			goto unlock_drop; @@ -1669,7 +1706,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		BUG_ON(root->number_of_cgroups != 1);  		cred = override_creds(&init_cred); -		cgroup_populate_dir(root_cgrp); +		cgroup_populate_dir(root_cgrp, true, root->subsys_mask);  		revert_creds(cred);  		mutex_unlock(&cgroup_root_mutex);  		mutex_unlock(&cgroup_mutex); @@ -1681,7 +1718,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		 */  		cgroup_drop_root(opts.new_root);  		/* no subsys rebinding, so refcounts don't change */ -		drop_parsed_module_refcounts(opts.subsys_bits); +		drop_parsed_module_refcounts(opts.subsys_mask);  	}  	kfree(opts.release_agent); @@ -1695,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,   drop_new_super:  	deactivate_locked_super(sb);   drop_modules: -	drop_parsed_module_refcounts(opts.subsys_bits); +	
drop_parsed_module_refcounts(opts.subsys_mask);   out_err:  	kfree(opts.release_agent);  	kfree(opts.name); @@ -1745,6 +1782,8 @@ static void cgroup_kill_sb(struct super_block *sb) {  	mutex_unlock(&cgroup_root_mutex);  	mutex_unlock(&cgroup_mutex); +	simple_xattrs_free(&cgrp->xattrs); +  	kill_litter_super(sb);  	cgroup_drop_root(root);  } @@ -1923,9 +1962,8 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,  	 * trading it for newcg is protected by cgroup_mutex, we're safe to drop  	 * it here; it will be freed under RCU.  	 */ -	put_css_set(oldcg); -  	set_bit(CGRP_RELEASABLE, &oldcgrp->flags); +	put_css_set(oldcg);  }  /** @@ -2551,6 +2589,64 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,  	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);  } +static struct simple_xattrs *__d_xattrs(struct dentry *dentry) +{ +	if (S_ISDIR(dentry->d_inode->i_mode)) +		return &__d_cgrp(dentry)->xattrs; +	else +		return &__d_cft(dentry)->xattrs; +} + +static inline int xattr_enabled(struct dentry *dentry) +{ +	struct cgroupfs_root *root = dentry->d_sb->s_fs_info; +	return test_bit(ROOT_XATTR, &root->flags); +} + +static bool is_valid_xattr(const char *name) +{ +	if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || +	    !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) +		return true; +	return false; +} + +static int cgroup_setxattr(struct dentry *dentry, const char *name, +			   const void *val, size_t size, int flags) +{ +	if (!xattr_enabled(dentry)) +		return -EOPNOTSUPP; +	if (!is_valid_xattr(name)) +		return -EINVAL; +	return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags); +} + +static int cgroup_removexattr(struct dentry *dentry, const char *name) +{ +	if (!xattr_enabled(dentry)) +		return -EOPNOTSUPP; +	if (!is_valid_xattr(name)) +		return -EINVAL; +	return simple_xattr_remove(__d_xattrs(dentry), name); +} + +static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name, +			       void *buf, size_t size) +{ +	if (!xattr_enabled(dentry)) +		return -EOPNOTSUPP; +	if (!is_valid_xattr(name)) +		return -EINVAL; +	return simple_xattr_get(__d_xattrs(dentry), name, buf, size); +} + +static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) +{ +	if (!xattr_enabled(dentry)) +		return -EOPNOTSUPP; +	return simple_xattr_list(__d_xattrs(dentry), buf, size); +} +  static const struct file_operations cgroup_file_operations = {  	.read = cgroup_file_read,  	.write = cgroup_file_write, @@ -2559,11 +2655,22 @@ static const struct file_operations cgroup_file_operations = {  	.release = cgroup_file_release,  }; +static const struct inode_operations cgroup_file_inode_operations = { +	.setxattr = cgroup_setxattr, +	.getxattr = cgroup_getxattr, +	.listxattr = cgroup_listxattr, +	.removexattr = cgroup_removexattr, +}; +  static const struct inode_operations cgroup_dir_inode_operations = {  	.lookup = cgroup_lookup,  	.mkdir = cgroup_mkdir,  	.rmdir = cgroup_rmdir,  	.rename = cgroup_rename, +	.setxattr = cgroup_setxattr, +	.getxattr = cgroup_getxattr, +	.listxattr = cgroup_listxattr, +	.removexattr = cgroup_removexattr,  };  static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -2611,6 +2718,7 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode,  	} else if (S_ISREG(mode)) {  		inode->i_size = 0;  		inode->i_fop = &cgroup_file_operations; +		inode->i_op = &cgroup_file_inode_operations;  	}  	
d_instantiate(dentry, inode);  	dget(dentry);	/* Extra count - pin the dentry in core */ @@ -2671,7 +2779,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft)  }  static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, -			   const struct cftype *cft) +			   struct cftype *cft)  {  	struct dentry *dir = cgrp->dentry;  	struct cgroup *parent = __d_cgrp(dir); @@ -2681,6 +2789,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,  	umode_t mode;  	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; +	simple_xattrs_init(&cft->xattrs); +  	/* does @cft->flags tell us to skip creation on @cgrp? */  	if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)  		return 0; @@ -2721,9 +2831,9 @@ out:  }  static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, -			      const struct cftype cfts[], bool is_add) +			      struct cftype cfts[], bool is_add)  { -	const struct cftype *cft; +	struct cftype *cft;  	int err, ret = 0;  	for (cft = cfts; cft->name[0] != '\0'; cft++) { @@ -2757,7 +2867,7 @@ static void cgroup_cfts_prepare(void)  }  static void cgroup_cfts_commit(struct cgroup_subsys *ss, -			       const struct cftype *cfts, bool is_add) +			       struct cftype *cfts, bool is_add)  	__releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)  {  	LIST_HEAD(pending); @@ -2808,7 +2918,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,   * function currently returns 0 as long as @cfts registration is successful   * even if some file creation attempts on existing cgroups fail.   */ -int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)  {  	struct cftype_set *set; @@ -2838,7 +2948,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes);   * Returns 0 on successful unregistration, -ENOENT if @cfts is not   * registered with @ss.   
*/ -int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) +int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)  {  	struct cftype_set *set; @@ -3843,18 +3953,29 @@ static struct cftype files[] = {  	{ }	/* terminate */  }; -static int cgroup_populate_dir(struct cgroup *cgrp) +/** + * cgroup_populate_dir - selectively creation of files in a directory + * @cgrp: target cgroup + * @base_files: true if the base files should be added + * @subsys_mask: mask of the subsystem ids whose files should be added + */ +static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, +			       unsigned long subsys_mask)  {  	int err;  	struct cgroup_subsys *ss; -	err = cgroup_addrm_files(cgrp, NULL, files, true); -	if (err < 0) -		return err; +	if (base_files) { +		err = cgroup_addrm_files(cgrp, NULL, files, true); +		if (err < 0) +			return err; +	}  	/* process cftsets of each subsystem */  	for_each_subsys(cgrp->root, ss) {  		struct cftype_set *set; +		if (!test_bit(ss->subsys_id, &subsys_mask)) +			continue;  		list_for_each_entry(set, &ss->cftsets, node)  			cgroup_addrm_files(cgrp, ss, set->cfts, true); @@ -3954,8 +4075,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);  	for_each_subsys(root, ss) { -		struct cgroup_subsys_state *css = ss->create(cgrp); +		struct cgroup_subsys_state *css; +		css = ss->create(cgrp);  		if (IS_ERR(css)) {  			err = PTR_ERR(css);  			goto err_destroy; @@ -3969,6 +4091,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  		/* At error, ->destroy() callback has to free assigned ID. */  		if (clone_children(parent) && ss->post_clone)  			ss->post_clone(cgrp); + +		if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && +		    parent->parent) { +			pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", +				   current->comm, current->pid, ss->name); +			if (!strcmp(ss->name, "memory")) +				pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); +			ss->warned_broken_hierarchy = true; +		}  	}  	list_add(&cgrp->sibling, &cgrp->parent->children); @@ -3988,7 +4119,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  	list_add_tail(&cgrp->allcg_node, &root->allcg_list); -	err = cgroup_populate_dir(cgrp); +	err = cgroup_populate_dir(cgrp, true, root->subsys_mask);  	/* If err < 0, we have a half-filled directory - oh well ;) */  	mutex_unlock(&cgroup_mutex); @@ -4321,8 +4452,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  	 * since cgroup_init_subsys will have already taken care of it.  	 */  	if (ss->module == NULL) { -		/* a few sanity checks */ -		BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); +		/* a sanity check */  		BUG_ON(subsys[ss->subsys_id] != ss);  		return 0;  	} @@ -4330,24 +4460,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  	/* init base cftset */  	cgroup_init_cftsets(ss); -	/* -	 * need to register a subsys id before anything else - for example, -	 * init_cgroup_css needs it. -	 */  	mutex_lock(&cgroup_mutex); -	/* find the first empty slot in the array */ -	for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { -		if (subsys[i] == NULL) -			break; -	} -	if (i == CGROUP_SUBSYS_COUNT) { -		/* maximum number of subsystems already registered! 
*/ -		mutex_unlock(&cgroup_mutex); -		return -EBUSY; -	} -	/* assign ourselves the subsys_id */ -	ss->subsys_id = i; -	subsys[i] = ss; +	subsys[ss->subsys_id] = ss;  	/*  	 * no ss->create seems to need anything important in the ss struct, so @@ -4356,7 +4470,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  	css = ss->create(dummytop);  	if (IS_ERR(css)) {  		/* failure case - need to deassign the subsys[] slot. */ -		subsys[i] = NULL; +		subsys[ss->subsys_id] = NULL;  		mutex_unlock(&cgroup_mutex);  		return PTR_ERR(css);  	} @@ -4372,7 +4486,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  		if (ret) {  			dummytop->subsys[ss->subsys_id] = NULL;  			ss->destroy(dummytop); -			subsys[i] = NULL; +			subsys[ss->subsys_id] = NULL;  			mutex_unlock(&cgroup_mutex);  			return ret;  		} @@ -4439,7 +4553,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)  	mutex_lock(&cgroup_mutex);  	/* deassign the subsys_id */ -	BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT);  	subsys[ss->subsys_id] = NULL;  	/* remove subsystem from rootnode's list of subsystems */ @@ -4502,10 +4615,13 @@ int __init cgroup_init_early(void)  	for (i = 0; i < CSS_SET_TABLE_SIZE; i++)  		INIT_HLIST_HEAD(&css_set_table[i]); -	/* at bootup time, we don't worry about modular subsystems */ -	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { +	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {  		struct cgroup_subsys *ss = subsys[i]; +		/* at bootup time, we don't worry about modular subsystems */ +		if (!ss || ss->module) +			continue; +  		BUG_ON(!ss->name);  		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);  		BUG_ON(!ss->create); @@ -4538,9 +4654,12 @@ int __init cgroup_init(void)  	if (err)  		return err; -	/* at bootup time, we don't worry about modular subsystems */ -	for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { +	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {  		struct cgroup_subsys *ss = subsys[i]; + +		/* at bootup time, we don't worry about modular subsystems */ +		if (!ss || ss->module) +			continue;  		if (!ss->early_init)  			cgroup_init_subsys(ss);  		if (ss->use_id) @@ -4695,31 +4814,20 @@ static const struct file_operations proc_cgroupstats_operations = {   *   * A pointer to the shared css_set was automatically copied in   * fork.c by dup_task_struct().  However, we ignore that copy, since - * it was not made under the protection of RCU, cgroup_mutex or - * threadgroup_change_begin(), so it might no longer be a valid - * cgroup pointer.  cgroup_attach_task() might have already changed - * current->cgroups, allowing the previously referenced cgroup - * group to be removed and freed. - * - * Outside the pointer validity we also need to process the css_set - * inheritance between threadgoup_change_begin() and - * threadgoup_change_end(), this way there is no leak in any process - * wide migration performed by cgroup_attach_proc() that could otherwise - * miss a thread because it is too early or too late in the fork stage. + * it was not made under the protection of RCU or cgroup_mutex, so + * might no longer be a valid cgroup pointer.  cgroup_attach_task() might + * have already changed current->cgroups, allowing the previously + * referenced cgroup group to be removed and freed.   *   * At the point that cgroup_fork() is called, 'current' is the parent   * task, and the passed argument 'child' points to the child task.   
*/  void cgroup_fork(struct task_struct *child)  { -	/* -	 * We don't need to task_lock() current because current->cgroups -	 * can't be changed concurrently here. The parent obviously hasn't -	 * exited and called cgroup_exit(), and we are synchronized against -	 * cgroup migration through threadgroup_change_begin(). -	 */ +	task_lock(current);  	child->cgroups = current->cgroups;  	get_css_set(child->cgroups); +	task_unlock(current);  	INIT_LIST_HEAD(&child->cg_list);  } @@ -4735,13 +4843,16 @@ void cgroup_fork_callbacks(struct task_struct *child)  {  	if (need_forkexit_callback) {  		int i; -		/* -		 * forkexit callbacks are only supported for builtin -		 * subsystems, and the builtin section of the subsys array is -		 * immutable, so we don't need to lock the subsys array here. -		 */ -		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { +		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {  			struct cgroup_subsys *ss = subsys[i]; + +			/* +			 * forkexit callbacks are only supported for +			 * builtin subsystems. +			 */ +			if (!ss || ss->module) +				continue; +  			if (ss->fork)  				ss->fork(child);  		} @@ -4772,19 +4883,10 @@ void cgroup_post_fork(struct task_struct *child)  	 */  	if (use_task_css_set_links) {  		write_lock(&css_set_lock); -		if (list_empty(&child->cg_list)) { -			/* -			 * It's safe to use child->cgroups without task_lock() -			 * here because we are protected through -			 * threadgroup_change_begin() against concurrent -			 * css_set change in cgroup_task_migrate(). Also -			 * the task can't exit at that point until -			 * wake_up_new_task() is called, so we are protected -			 * against cgroup_exit() setting child->cgroup to -			 * init_css_set. -			 */ +		task_lock(child); +		if (list_empty(&child->cg_list))  			list_add(&child->cg_list, &child->cgroups->tasks); -		} +		task_unlock(child);  		write_unlock(&css_set_lock);  	}  } @@ -4846,12 +4948,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)  	tsk->cgroups = &init_css_set;  	if (run_callbacks && need_forkexit_callback) { -		/* -		 * modular subsystems can't use callbacks, so no need to lock -		 * the subsys array -		 */ -		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { +		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {  			struct cgroup_subsys *ss = subsys[i]; + +			/* modular subsystems can't use callbacks */ +			if (!ss || ss->module) +				continue; +  			if (ss->exit) {  				struct cgroup *old_cgrp =  					rcu_dereference_raw(cg->subsys[i])->cgroup; @@ -5037,13 +5140,17 @@ static int __init cgroup_disable(char *str)  	while ((token = strsep(&str, ",")) != NULL) {  		if (!*token)  			continue; -		/* -		 * cgroup_disable, being at boot time, can't know about module -		 * subsystems, so we don't worry about them. -		 */ -		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { +		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {  			struct cgroup_subsys *ss = subsys[i]; +			/* +			 * cgroup_disable, being at boot time, can't +			 * know about module subsystems, so we don't +			 * worry about them. 
+			 */ +			if (!ss || ss->module) +				continue; +  			if (!strcmp(token, ss->name)) {  				ss->disabled = 1;  				printk(KERN_INFO "Disabling %s control group" diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 3649fc6b3ea..b1724ce9898 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -373,4 +373,12 @@ struct cgroup_subsys freezer_subsys = {  	.can_attach	= freezer_can_attach,  	.fork		= freezer_fork,  	.base_cftypes	= files, + +	/* +	 * freezer subsys doesn't handle hierarchy at all.  Frozen state +	 * should be inherited through the hierarchy - if a parent is +	 * frozen, all its children should be frozen.  Fix it and remove +	 * the following. +	 */ +	.broken_hierarchy = true,  }; diff --git a/kernel/cpu.c b/kernel/cpu.c index f560598807c..42bd331ee0a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -80,6 +80,10 @@ void put_online_cpus(void)  	if (cpu_hotplug.active_writer == current)  		return;  	mutex_lock(&cpu_hotplug.lock); + +	if (WARN_ON(!cpu_hotplug.refcount)) +		cpu_hotplug.refcount++; /* try to fix things up */ +  	if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))  		wake_up_process(cpu_hotplug.active_writer);  	mutex_unlock(&cpu_hotplug.lock); diff --git a/kernel/cred.c b/kernel/cred.c index de728ac50d8..48cea3da6d0 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -799,9 +799,15 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,  	       atomic_read(&cred->usage),  	       read_cred_subscribers(cred));  	printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", -	       cred->uid, cred->euid, cred->suid, cred->fsuid); +		from_kuid_munged(&init_user_ns, cred->uid), +		from_kuid_munged(&init_user_ns, cred->euid), +		from_kuid_munged(&init_user_ns, cred->suid), +		from_kuid_munged(&init_user_ns, cred->fsuid));  	printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", -	       cred->gid, cred->egid, cred->sgid, cred->fsgid); +		from_kgid_munged(&init_user_ns, cred->gid), +		from_kgid_munged(&init_user_ns, cred->egid), +		from_kgid_munged(&init_user_ns, cred->sgid), +		from_kgid_munged(&init_user_ns, cred->fsgid));  #ifdef CONFIG_SECURITY  	printk(KERN_ERR "CRED: ->security is %p\n", cred->security);  	if ((unsigned long) cred->security >= PAGE_SIZE && diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0557f24c6bc..9a61738cefc 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -672,6 +672,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)  {  	struct kgdb_state kgdb_var;  	struct kgdb_state *ks = &kgdb_var; +	int ret = 0; + +	if (arch_kgdb_ops.enable_nmi) +		arch_kgdb_ops.enable_nmi(0);  	ks->cpu			= raw_smp_processor_id();  	ks->ex_vector		= evector; @@ -681,13 +685,33 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)  	ks->linux_regs		= regs;  	if (kgdb_reenter_check(ks)) -		return 0; /* Ouch, double exception ! */ +		goto out; /* Ouch, double exception ! */  	if (kgdb_info[ks->cpu].enter_kgdb != 0) -		return 0; +		goto out; -	return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); +	ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); +out: +	if (arch_kgdb_ops.enable_nmi) +		arch_kgdb_ops.enable_nmi(1); +	return ret;  } +/* + * GDB places a breakpoint at this function to know dynamically + * loaded objects. It's not defined static so that only one instance with this + * name exists in the kernel. 
+ */ + +static int module_event(struct notifier_block *self, unsigned long val, +	void *data) +{ +	return 0; +} + +static struct notifier_block dbg_module_load_nb = { +	.notifier_call	= module_event, +}; +  int kgdb_nmicallback(int cpu, void *regs)  {  #ifdef CONFIG_SMP @@ -816,6 +840,7 @@ static void kgdb_register_callbacks(void)  		kgdb_arch_init();  		if (!dbg_is_early)  			kgdb_arch_late(); +		register_module_notifier(&dbg_module_load_nb);  		register_reboot_notifier(&dbg_reboot_notifier);  		atomic_notifier_chain_register(&panic_notifier_list,  					       &kgdb_panic_event_nb); @@ -839,6 +864,7 @@ static void kgdb_unregister_callbacks(void)  	if (kgdb_io_module_registered) {  		kgdb_io_module_registered = 0;  		unregister_reboot_notifier(&dbg_reboot_notifier); +		unregister_module_notifier(&dbg_module_load_nb);  		atomic_notifier_chain_unregister(&panic_notifier_list,  					       &kgdb_panic_event_nb);  		kgdb_arch_exit(); diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 07c9bbb94a0..b03e0e814e4 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -129,6 +129,8 @@ kdb_bt(int argc, const char **argv)  		}  		/* Now the inactive tasks */  		kdb_do_each_thread(g, p) { +			if (KDB_FLAG(CMD_INTERRUPT)) +				return 0;  			if (task_curr(p))  				continue;  			if (kdb_bt1(p, mask, argcount, btaprompt)) diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 0a69d2adc4f..14ff4849262 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -552,6 +552,7 @@ int vkdb_printf(const char *fmt, va_list ap)  {  	int diag;  	int linecount; +	int colcount;  	int logging, saved_loglevel = 0;  	int saved_trap_printk;  	int got_printf_lock = 0; @@ -584,6 +585,10 @@ int vkdb_printf(const char *fmt, va_list ap)  	if (diag || linecount <= 1)  		linecount = 24; +	diag = kdbgetintenv("COLUMNS", &colcount); +	if (diag || colcount <= 1) +		colcount = 80; +  	diag = kdbgetintenv("LOGGING", &logging);  	if (diag)  		logging = 0; @@ -690,7 +695,7 @@ kdb_printit:  		gdbstub_msg_write(kdb_buffer, retlen);  	} else {  		if (dbg_io_ops && !dbg_io_ops->is_console) { -			len = strlen(kdb_buffer); +			len = retlen;  			cp = kdb_buffer;  			while (len--) {  				dbg_io_ops->write_char(*cp); @@ -709,11 +714,29 @@ kdb_printit:  		printk(KERN_INFO "%s", kdb_buffer);  	} -	if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n')) -		kdb_nextline++; +	if (KDB_STATE(PAGER)) { +		/* +		 * Check printed string to decide how to bump the +		 * kdb_nextline to control when the more prompt should +		 * show up. +		 */ +		int got = 0; +		len = retlen; +		while (len--) { +			if (kdb_buffer[len] == '\n') { +				kdb_nextline++; +				got = 0; +			} else if (kdb_buffer[len] == '\r') { +				got = 0; +			} else { +				got++; +			} +		} +		kdb_nextline += got / (colcount + 1); +	}  	/* check for having reached the LINES number of printed lines */ -	if (kdb_nextline == linecount) { +	if (kdb_nextline >= linecount) {  		char buf1[16] = "";  		/* Watch out for recursion here.  
Any routine that calls @@ -765,7 +788,7 @@ kdb_printit:  			kdb_grepping_flag = 0;  			kdb_printf("\n");  		} else if (buf1[0] == ' ') { -			kdb_printf("\n"); +			kdb_printf("\r");  			suspend_grep = 1; /* for this recursion */  		} else if (buf1[0] == '\n') {  			kdb_nextline = linecount - 1; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 31df1706b9a..4d5f8d5612f 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -21,6 +21,7 @@  #include <linux/smp.h>  #include <linux/utsname.h>  #include <linux/vmalloc.h> +#include <linux/atomic.h>  #include <linux/module.h>  #include <linux/mm.h>  #include <linux/init.h> @@ -2100,6 +2101,8 @@ static int kdb_dmesg(int argc, const char **argv)  		}  		if (!lines--)  			break; +		if (KDB_FLAG(CMD_INTERRUPT)) +			return 0;  		kdb_printf("%.*s\n", (int)len - 1, buf);  	} @@ -2107,6 +2110,32 @@ static int kdb_dmesg(int argc, const char **argv)  	return 0;  }  #endif /* CONFIG_PRINTK */ + +/* Make sure we balance enable/disable calls, must disable first. */ +static atomic_t kdb_nmi_disabled; + +static int kdb_disable_nmi(int argc, const char *argv[]) +{ +	if (atomic_read(&kdb_nmi_disabled)) +		return 0; +	atomic_set(&kdb_nmi_disabled, 1); +	arch_kgdb_ops.enable_nmi(0); +	return 0; +} + +static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp) +{ +	if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0)) +		return -EINVAL; +	arch_kgdb_ops.enable_nmi(1); +	return 0; +} + +static const struct kernel_param_ops kdb_param_ops_enable_nmi = { +	.set = kdb_param_enable_nmi, +}; +module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600); +  /*   * kdb_cpu - This function implements the 'cpu' command.   *	cpu	[<cpunum>] @@ -2851,6 +2880,10 @@ static void __init kdb_inittab(void)  	kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",  	  "Display syslog buffer", 0, KDB_REPEAT_NONE);  #endif +	if (arch_kgdb_ops.enable_nmi) { +		kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", +		  "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); +	}  	kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",  	  "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);  	kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", diff --git a/kernel/events/core.c b/kernel/events/core.c index 7b9df353ba1..dbccf83c134 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -372,6 +372,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)  	list_for_each_entry_rcu(pmu, &pmus, entry) {  		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); +		if (cpuctx->unique_pmu != pmu) +			continue; /* ensure we process each cpuctx once */  		/*  		 * perf_cgroup_events says at least one @@ -395,9 +397,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode)  			if (mode & PERF_CGROUP_SWIN) {  				WARN_ON_ONCE(cpuctx->cgrp); -				/* set cgrp before ctxsw in to -				 * allow event_filter_match() to not -				 * have to pass task around +				/* +				 * set cgrp before ctxsw in to allow +				 * event_filter_match() to not have to pass +				 * task around  				 */  				cpuctx->cgrp = perf_cgroup_from_task(task);  				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); @@ -468,14 +471,13 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,  {  	struct perf_cgroup *cgrp;  	struct cgroup_subsys_state *css; -	struct file *file; -	int ret = 0, fput_needed; +	struct fd f = fdget(fd); +	int ret = 0; -	file = fget_light(fd, &fput_needed); -	if (!file) +	if (!f.file)  		return -EBADF; 
-	css = cgroup_css_from_dir(file, perf_subsys_id); +	css = cgroup_css_from_dir(f.file, perf_subsys_id);  	if (IS_ERR(css)) {  		ret = PTR_ERR(css);  		goto out; @@ -501,7 +503,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,  		ret = -EINVAL;  	}  out: -	fput_light(file, fput_needed); +	fdput(f);  	return ret;  } @@ -3234,21 +3236,18 @@ unlock:  static const struct file_operations perf_fops; -static struct file *perf_fget_light(int fd, int *fput_needed) +static inline int perf_fget_light(int fd, struct fd *p)  { -	struct file *file; - -	file = fget_light(fd, fput_needed); -	if (!file) -		return ERR_PTR(-EBADF); +	struct fd f = fdget(fd); +	if (!f.file) +		return -EBADF; -	if (file->f_op != &perf_fops) { -		fput_light(file, *fput_needed); -		*fput_needed = 0; -		return ERR_PTR(-EBADF); +	if (f.file->f_op != &perf_fops) { +		fdput(f); +		return -EBADF;  	} - -	return file; +	*p = f; +	return 0;  }  static int perf_event_set_output(struct perf_event *event, @@ -3280,22 +3279,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)  	case PERF_EVENT_IOC_SET_OUTPUT:  	{ -		struct file *output_file = NULL; -		struct perf_event *output_event = NULL; -		int fput_needed = 0;  		int ret; -  		if (arg != -1) { -			output_file = perf_fget_light(arg, &fput_needed); -			if (IS_ERR(output_file)) -				return PTR_ERR(output_file); -			output_event = output_file->private_data; +			struct perf_event *output_event; +			struct fd output; +			ret = perf_fget_light(arg, &output); +			if (ret) +				return ret; +			output_event = output.file->private_data; +			ret = perf_event_set_output(event, output_event); +			fdput(output); +		} else { +			ret = perf_event_set_output(event, NULL);  		} - -		ret = perf_event_set_output(event, output_event); -		if (output_event) -			fput_light(output_file, fput_needed); -  		return ret;  	} @@ -3678,7 +3674,7 @@ unlock:  		atomic_inc(&event->mmap_count);  	mutex_unlock(&event->mmap_mutex); -	vma->vm_flags |= VM_RESERVED; +	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;  	vma->vm_ops = &perf_mmap_vmops;  	return ret; @@ -4419,7 +4415,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)  	rcu_read_lock();  	list_for_each_entry_rcu(pmu, &pmus, entry) {  		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); -		if (cpuctx->active_pmu != pmu) +		if (cpuctx->unique_pmu != pmu)  			goto next;  		perf_event_task_ctx(&cpuctx->ctx, task_event); @@ -4565,7 +4561,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)  	rcu_read_lock();  	list_for_each_entry_rcu(pmu, &pmus, entry) {  		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); -		if (cpuctx->active_pmu != pmu) +		if (cpuctx->unique_pmu != pmu)  			goto next;  		perf_event_comm_ctx(&cpuctx->ctx, comm_event); @@ -4761,7 +4757,7 @@ got_name:  	rcu_read_lock();  	list_for_each_entry_rcu(pmu, &pmus, entry) {  		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); -		if (cpuctx->active_pmu != pmu) +		if (cpuctx->unique_pmu != pmu)  			goto next;  		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,  					vma->vm_flags & VM_EXEC); @@ -5862,8 +5858,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)  		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); -		if (cpuctx->active_pmu == old_pmu) -			cpuctx->active_pmu = pmu; +		if (cpuctx->unique_pmu == old_pmu) +			cpuctx->unique_pmu = pmu;  	}  } @@ -5998,7 +5994,7 @@ skip_type:  		cpuctx->ctx.pmu = pmu;  		cpuctx->jiffies_interval = 1;  		INIT_LIST_HEAD(&cpuctx->rotation_list); -		cpuctx->active_pmu = pmu; +	
	cpuctx->unique_pmu = pmu;  	}  got_cpu_context: @@ -6443,12 +6439,11 @@ SYSCALL_DEFINE5(perf_event_open,  	struct perf_event_attr attr;  	struct perf_event_context *ctx;  	struct file *event_file = NULL; -	struct file *group_file = NULL; +	struct fd group = {NULL, 0};  	struct task_struct *task = NULL;  	struct pmu *pmu;  	int event_fd;  	int move_group = 0; -	int fput_needed = 0;  	int err;  	/* for future expandability... */ @@ -6478,17 +6473,15 @@ SYSCALL_DEFINE5(perf_event_open,  	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))  		return -EINVAL; -	event_fd = get_unused_fd_flags(O_RDWR); +	event_fd = get_unused_fd();  	if (event_fd < 0)  		return event_fd;  	if (group_fd != -1) { -		group_file = perf_fget_light(group_fd, &fput_needed); -		if (IS_ERR(group_file)) { -			err = PTR_ERR(group_file); +		err = perf_fget_light(group_fd, &group); +		if (err)  			goto err_fd; -		} -		group_leader = group_file->private_data; +		group_leader = group.file->private_data;  		if (flags & PERF_FLAG_FD_OUTPUT)  			output_event = group_leader;  		if (flags & PERF_FLAG_FD_NO_GROUP) @@ -6664,7 +6657,7 @@ SYSCALL_DEFINE5(perf_event_open,  	 * of the group leader will find the pointer to itself in  	 * perf_group_detach().  	 */ -	fput_light(group_file, fput_needed); +	fdput(group);  	fd_install(event_fd, event_file);  	return event_fd; @@ -6678,7 +6671,7 @@ err_task:  	if (task)  		put_task_struct(task);  err_group_fd: -	fput_light(group_file, fput_needed); +	fdput(group);  err_fd:  	put_unused_fd(event_fd);  	return err; @@ -7503,5 +7496,12 @@ struct cgroup_subsys perf_subsys = {  	.destroy	= perf_cgroup_destroy,  	.exit		= perf_cgroup_exit,  	.attach		= perf_cgroup_attach, + +	/* +	 * perf_event cgroup doesn't handle nesting correctly. +	 * ctx->nr_cgroups adjustments should be propagated through the +	 * cgroup hierarchy.  Fix it and remove the following. 
+	 */ +	.broken_hierarchy = true,  };  #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 912ef48d28a..5cc4e7e42e6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -78,15 +78,23 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];   */  static atomic_t uprobe_events = ATOMIC_INIT(0); +/* Have a copy of original instruction */ +#define UPROBE_COPY_INSN	0 +/* Dont run handlers when first register/ last unregister in progress*/ +#define UPROBE_RUN_HANDLER	1 +/* Can skip singlestep */ +#define UPROBE_SKIP_SSTEP	2 +  struct uprobe {  	struct rb_node		rb_node;	/* node in the rb tree */  	atomic_t		ref;  	struct rw_semaphore	consumer_rwsem; +	struct mutex		copy_mutex;	/* TODO: kill me and UPROBE_COPY_INSN */  	struct list_head	pending_list;  	struct uprobe_consumer	*consumers;  	struct inode		*inode;		/* Also hold a ref to inode */  	loff_t			offset; -	int			flags; +	unsigned long		flags;  	struct arch_uprobe	arch;  }; @@ -100,17 +108,12 @@ struct uprobe {   */  static bool valid_vma(struct vm_area_struct *vma, bool is_register)  { -	if (!vma->vm_file) -		return false; - -	if (!is_register) -		return true; +	vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; -	if ((vma->vm_flags & (VM_HUGETLB|VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)) -				== (VM_READ|VM_EXEC)) -		return true; +	if (is_register) +		flags |= VM_WRITE; -	return false; +	return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;  }  static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) @@ -141,10 +144,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,  	spinlock_t *ptl;  	pte_t *ptep;  	int err; +	/* For mmu_notifiers */ +	const unsigned long mmun_start = addr; +	const unsigned long mmun_end   = addr + PAGE_SIZE;  	/* For try_to_free_swap() and munlock_vma_page() below */  	lock_page(page); +	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);  	err = -EAGAIN;  	ptep = page_check_address(page, mm, addr, &ptl, 0);  	if (!ptep) @@ -173,6 +180,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,  	err = 0;   unlock: +	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);  	unlock_page(page);  	return err;  } @@ -188,19 +196,44 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)  	return *insn == UPROBE_SWBP_INSN;  } +static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) +{ +	void *kaddr = kmap_atomic(page); +	memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); +	kunmap_atomic(kaddr); +} + +static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) +{ +	uprobe_opcode_t old_opcode; +	bool is_swbp; + +	copy_opcode(page, vaddr, &old_opcode); +	is_swbp = is_swbp_insn(&old_opcode); + +	if (is_swbp_insn(new_opcode)) { +		if (is_swbp)		/* register: already installed? */ +			return 0; +	} else { +		if (!is_swbp)		/* unregister: was it changed by us? */ +			return 0; +	} + +	return 1; +} +  /*   * NOTE:   * Expect the breakpoint instruction to be the smallest size instruction for   * the architecture. If an arch has variable length instruction and the   * breakpoint instruction is not of the smallest length instruction - * supported by that architecture then we need to modify read_opcode / + * supported by that architecture then we need to modify is_swbp_at_addr and   * write_opcode accordingly. This would never be a problem for archs that   * have fixed length instructions.   
*/  /*   * write_opcode - write the opcode at a given virtual address. - * @auprobe: arch breakpointing information.   * @mm: the probed process address space.   * @vaddr: the virtual address to store the opcode.   * @opcode: opcode to be written at @vaddr. @@ -211,8 +244,8 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)   * For mm @mm, write the opcode at @vaddr.   * Return 0 (success) or a negative errno.   */ -static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, -			unsigned long vaddr, uprobe_opcode_t opcode) +static int write_opcode(struct mm_struct *mm, unsigned long vaddr, +			uprobe_opcode_t opcode)  {  	struct page *old_page, *new_page;  	void *vaddr_old, *vaddr_new; @@ -221,10 +254,14 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,  retry:  	/* Read the page with vaddr into memory */ -	ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); +	ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);  	if (ret <= 0)  		return ret; +	ret = verify_opcode(old_page, vaddr, &opcode); +	if (ret <= 0) +		goto put_old; +  	ret = -ENOMEM;  	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);  	if (!new_page) @@ -259,63 +296,6 @@ put_old:  }  /** - * read_opcode - read the opcode at a given virtual address. - * @mm: the probed process address space. - * @vaddr: the virtual address to read the opcode. - * @opcode: location to store the read opcode. - * - * Called with mm->mmap_sem held (for read and with a reference to - * mm. - * - * For mm @mm, read the opcode at @vaddr and store it in @opcode. - * Return 0 (success) or a negative errno. - */ -static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t *opcode) -{ -	struct page *page; -	void *vaddr_new; -	int ret; - -	ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); -	if (ret <= 0) -		return ret; - -	vaddr_new = kmap_atomic(page); -	vaddr &= ~PAGE_MASK; -	memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); -	kunmap_atomic(vaddr_new); - -	put_page(page); - -	return 0; -} - -static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) -{ -	uprobe_opcode_t opcode; -	int result; - -	if (current->mm == mm) { -		pagefault_disable(); -		result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, -								sizeof(opcode)); -		pagefault_enable(); - -		if (likely(result == 0)) -			goto out; -	} - -	result = read_opcode(mm, vaddr, &opcode); -	if (result) -		return result; -out: -	if (is_swbp_insn(&opcode)) -		return 1; - -	return 0; -} - -/**   * set_swbp - store breakpoint at a given address.   * @auprobe: arch specific probepoint information.   * @mm: the probed process address space. @@ -326,18 +306,7 @@ out:   */  int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)  { -	int result; -	/* -	 * See the comment near uprobes_hash(). 
-	 */ -	result = is_swbp_at_addr(mm, vaddr); -	if (result == 1) -		return 0; - -	if (result) -		return result; - -	return write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN); +	return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);  }  /** @@ -352,16 +321,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned  int __weak  set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)  { -	int result; - -	result = is_swbp_at_addr(mm, vaddr); -	if (!result) -		return -EINVAL; - -	if (result != 1) -		return result; - -	return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); +	return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);  }  static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -468,7 +428,7 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)  	spin_unlock(&uprobes_treelock);  	/* For now assume that the instruction need not be single-stepped */ -	uprobe->flags |= UPROBE_SKIP_SSTEP; +	__set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);  	return u;  } @@ -490,6 +450,7 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)  	uprobe->inode = igrab(inode);  	uprobe->offset = offset;  	init_rwsem(&uprobe->consumer_rwsem); +	mutex_init(&uprobe->copy_mutex);  	/* add to uprobes_tree, sorted on inode:offset */  	cur_uprobe = insert_uprobe(uprobe); @@ -510,7 +471,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)  {  	struct uprobe_consumer *uc; -	if (!(uprobe->flags & UPROBE_RUN_HANDLER)) +	if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))  		return;  	down_read(&uprobe->consumer_rwsem); @@ -616,29 +577,43 @@ static int copy_insn(struct uprobe *uprobe, struct file *filp)  	return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);  } -/* - * How mm->uprobes_state.count gets updated - * uprobe_mmap() increments the count if - * 	- it successfully adds a breakpoint. - * 	- it cannot add a breakpoint, but sees that there is a underlying - * 	  breakpoint (via a is_swbp_at_addr()). - * - * uprobe_munmap() decrements the count if - * 	- it sees a underlying breakpoint, (via is_swbp_at_addr) - * 	  (Subsequent uprobe_unregister wouldnt find the breakpoint - * 	  unless a uprobe_mmap kicks in, since the old vma would be - * 	  dropped just after uprobe_munmap.) - * - * uprobe_register increments the count if: - * 	- it successfully adds a breakpoint. - * - * uprobe_unregister decrements the count if: - * 	- it sees a underlying breakpoint and removes successfully. - * 	  (via is_swbp_at_addr) - * 	  (Subsequent uprobe_munmap wouldnt find the breakpoint - * 	  since there is no underlying breakpoint after the - * 	  breakpoint removal.) 
- */ +static int prepare_uprobe(struct uprobe *uprobe, struct file *file, +				struct mm_struct *mm, unsigned long vaddr) +{ +	int ret = 0; + +	if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) +		return ret; + +	mutex_lock(&uprobe->copy_mutex); +	if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) +		goto out; + +	ret = copy_insn(uprobe, file); +	if (ret) +		goto out; + +	ret = -ENOTSUPP; +	if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) +		goto out; + +	ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); +	if (ret) +		goto out; + +	/* write_opcode() assumes we don't cross page boundary */ +	BUG_ON((uprobe->offset & ~PAGE_MASK) + +			UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); + +	smp_wmb(); /* pairs with rmb() in find_active_uprobe() */ +	set_bit(UPROBE_COPY_INSN, &uprobe->flags); + + out: +	mutex_unlock(&uprobe->copy_mutex); + +	return ret; +} +  static int  install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,  			struct vm_area_struct *vma, unsigned long vaddr) @@ -656,24 +631,9 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,  	if (!uprobe->consumers)  		return 0; -	if (!(uprobe->flags & UPROBE_COPY_INSN)) { -		ret = copy_insn(uprobe, vma->vm_file); -		if (ret) -			return ret; - -		if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) -			return -ENOTSUPP; - -		ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); -		if (ret) -			return ret; - -		/* write_opcode() assumes we don't cross page boundary */ -		BUG_ON((uprobe->offset & ~PAGE_MASK) + -				UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); - -		uprobe->flags |= UPROBE_COPY_INSN; -	} +	ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); +	if (ret) +		return ret;  	/*  	 * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), @@ -692,15 +652,15 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,  	return ret;  } -static void +static int  remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)  {  	/* can happen if uprobe_register() fails */  	if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) -		return; +		return 0;  	set_bit(MMF_RECALC_UPROBES, &mm->flags); -	set_orig_insn(&uprobe->arch, mm, vaddr); +	return set_orig_insn(&uprobe->arch, mm, vaddr);  }  /* @@ -735,7 +695,6 @@ static struct map_info *  build_map_info(struct address_space *mapping, loff_t offset, bool is_register)  {  	unsigned long pgoff = offset >> PAGE_SHIFT; -	struct prio_tree_iter iter;  	struct vm_area_struct *vma;  	struct map_info *curr = NULL;  	struct map_info *prev = NULL; @@ -744,7 +703,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)   again:  	mutex_lock(&mapping->i_mmap_mutex); -	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { +	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {  		if (!valid_vma(vma, is_register))  			continue; @@ -816,7 +775,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)  		struct mm_struct *mm = info->mm;  		struct vm_area_struct *vma; -		if (err) +		if (err && is_register)  			goto free;  		down_write(&mm->mmap_sem); @@ -832,7 +791,7 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)  		if (is_register)  			err = install_breakpoint(uprobe, mm, vma, info->vaddr);  		else -			remove_breakpoint(uprobe, mm, info->vaddr); +			err |= remove_breakpoint(uprobe, mm, info->vaddr);   unlock:  		up_write(&mm->mmap_sem); @@ -889,13 +848,15 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *  	
mutex_lock(uprobes_hash(inode));  	uprobe = alloc_uprobe(inode, offset); -	if (uprobe && !consumer_add(uprobe, uc)) { +	if (!uprobe) { +		ret = -ENOMEM; +	} else if (!consumer_add(uprobe, uc)) {  		ret = __uprobe_register(uprobe);  		if (ret) {  			uprobe->consumers = NULL;  			__uprobe_unregister(uprobe);  		} else { -			uprobe->flags |= UPROBE_RUN_HANDLER; +			set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);  		}  	} @@ -928,7 +889,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume  	if (consumer_del(uprobe, uc)) {  		if (!uprobe->consumers) {  			__uprobe_unregister(uprobe); -			uprobe->flags &= ~UPROBE_RUN_HANDLER; +			clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);  		}  	} @@ -1389,10 +1350,11 @@ bool uprobe_deny_signal(void)   */  static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)  { -	if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) -		return true; - -	uprobe->flags &= ~UPROBE_SKIP_SSTEP; +	if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) { +		if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) +			return true; +		clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); +	}  	return false;  } @@ -1415,6 +1377,30 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)  	clear_bit(MMF_HAS_UPROBES, &mm->flags);  } +static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) +{ +	struct page *page; +	uprobe_opcode_t opcode; +	int result; + +	pagefault_disable(); +	result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, +							sizeof(opcode)); +	pagefault_enable(); + +	if (likely(result == 0)) +		goto out; + +	result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); +	if (result < 0) +		return result; + +	copy_opcode(page, vaddr, &opcode); +	put_page(page); + out: +	return is_swbp_insn(&opcode); +} +  static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)  {  	struct mm_struct *mm = current->mm; @@ -1485,38 +1471,41 @@ static void handle_swbp(struct pt_regs *regs)  		}  		return;  	} +	/* +	 * TODO: move copy_insn/etc into _register and remove this hack. +	 * After we hit the bp, _unregister + _register can install the +	 * new and not-yet-analyzed uprobe at the same address, restart. +	 */ +	smp_rmb(); /* pairs with wmb() in install_breakpoint() */ +	if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) +		goto restart;  	utask = current->utask;  	if (!utask) {  		utask = add_utask();  		/* Cannot allocate; re-execute the instruction. */  		if (!utask) -			goto cleanup_ret; +			goto restart;  	} -	utask->active_uprobe = uprobe; +  	handler_chain(uprobe, regs); -	if (uprobe->flags & UPROBE_SKIP_SSTEP && can_skip_sstep(uprobe, regs)) -		goto cleanup_ret; +	if (can_skip_sstep(uprobe, regs)) +		goto out; -	utask->state = UTASK_SSTEP;  	if (!pre_ssout(uprobe, regs, bp_vaddr)) {  		arch_uprobe_enable_step(&uprobe->arch); +		utask->active_uprobe = uprobe; +		utask->state = UTASK_SSTEP;  		return;  	} -cleanup_ret: -	if (utask) { -		utask->active_uprobe = NULL; -		utask->state = UTASK_RUNNING; -	} -	if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) - -		/* -		 * cannot singlestep; cannot skip instruction; -		 * re-execute the instruction. -		 */ -		instruction_pointer_set(regs, bp_vaddr); - +restart: +	/* +	 * cannot singlestep; cannot skip instruction; +	 * re-execute the instruction. 
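+	 *
+	 * (Clarifying note, not part of the original patch: resetting the
+	 * instruction pointer back to bp_vaddr makes the task hit the
+	 * breakpoint again and re-enter handle_swbp(), which is how the
+	 * restart paths above simply retry the probe hit, e.g. once
+	 * UPROBE_COPY_INSN has finally been set.)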
+	 */ +	instruction_pointer_set(regs, bp_vaddr); +out:  	put_uprobe(uprobe);  } @@ -1548,13 +1537,12 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)  }  /* - * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag.  (and on - * subsequent probe hits on the thread sets the state to UTASK_BP_HIT) and - * allows the thread to return from interrupt. + * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and + * allows the thread to return from interrupt. After that handle_swbp() + * sets utask->active_uprobe.   * - * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag and - * also sets the state to UTASK_SSTEP_ACK and allows the thread to return from - * interrupt. + * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag + * and allows the thread to return from interrupt.   *   * While returning to userspace, thread notices the TIF_UPROBE flag and calls   * uprobe_notify_resume(). @@ -1563,11 +1551,13 @@ void uprobe_notify_resume(struct pt_regs *regs)  {  	struct uprobe_task *utask; +	clear_thread_flag(TIF_UPROBE); +  	utask = current->utask; -	if (!utask || utask->state == UTASK_BP_HIT) -		handle_swbp(regs); -	else +	if (utask && utask->active_uprobe)  		handle_singlestep(utask, regs); +	else +		handle_swbp(regs);  }  /* @@ -1576,17 +1566,10 @@ void uprobe_notify_resume(struct pt_regs *regs)   */  int uprobe_pre_sstep_notifier(struct pt_regs *regs)  { -	struct uprobe_task *utask; -  	if (!current->mm || !test_bit(MMF_HAS_UPROBES, ¤t->mm->flags))  		return 0; -	utask = current->utask; -	if (utask) -		utask->state = UTASK_BP_HIT; -  	set_thread_flag(TIF_UPROBE); -  	return 1;  } diff --git a/kernel/exit.c b/kernel/exit.c index f65345f9e5b..346616c0092 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -457,108 +457,13 @@ void daemonize(const char *name, ...)  	/* Become as one with the init task */  	daemonize_fs_struct(); -	exit_files(current); -	current->files = init_task.files; -	atomic_inc(¤t->files->count); +	daemonize_descriptors();  	reparent_to_kthreadd();  }  EXPORT_SYMBOL(daemonize); -static void close_files(struct files_struct * files) -{ -	int i, j; -	struct fdtable *fdt; - -	j = 0; - -	/* -	 * It is safe to dereference the fd table without RCU or -	 * ->file_lock because this is the last reference to the -	 * files structure.  But use RCU to shut RCU-lockdep up. -	 */ -	rcu_read_lock(); -	fdt = files_fdtable(files); -	rcu_read_unlock(); -	for (;;) { -		unsigned long set; -		i = j * BITS_PER_LONG; -		if (i >= fdt->max_fds) -			break; -		set = fdt->open_fds[j++]; -		while (set) { -			if (set & 1) { -				struct file * file = xchg(&fdt->fd[i], NULL); -				if (file) { -					filp_close(file, files); -					cond_resched(); -				} -			} -			i++; -			set >>= 1; -		} -	} -} - -struct files_struct *get_files_struct(struct task_struct *task) -{ -	struct files_struct *files; - -	task_lock(task); -	files = task->files; -	if (files) -		atomic_inc(&files->count); -	task_unlock(task); - -	return files; -} - -void put_files_struct(struct files_struct *files) -{ -	struct fdtable *fdt; - -	if (atomic_dec_and_test(&files->count)) { -		close_files(files); -		/* -		 * Free the fd and fdset arrays if we expanded them. -		 * If the fdtable was embedded, pass files for freeing -		 * at the end of the RCU grace period. Otherwise, -		 * you can free files immediately. 
-		 */ -		rcu_read_lock(); -		fdt = files_fdtable(files); -		if (fdt != &files->fdtab) -			kmem_cache_free(files_cachep, files); -		free_fdtable(fdt); -		rcu_read_unlock(); -	} -} - -void reset_files_struct(struct files_struct *files) -{ -	struct task_struct *tsk = current; -	struct files_struct *old; - -	old = tsk->files; -	task_lock(tsk); -	tsk->files = files; -	task_unlock(tsk); -	put_files_struct(old); -} - -void exit_files(struct task_struct *tsk) -{ -	struct files_struct * files = tsk->files; - -	if (files) { -		task_lock(tsk); -		tsk->files = NULL; -		task_unlock(tsk); -		put_files_struct(files); -	} -} -  #ifdef CONFIG_MM_OWNER  /*   * A task is exiting.   If it owned this mm, find a new owner for the mm. @@ -1046,6 +951,9 @@ void do_exit(long code)  	if (tsk->splice_pipe)  		__free_pipe_info(tsk->splice_pipe); +	if (tsk->task_frag.page) +		put_page(tsk->task_frag.page); +  	validate_creds_for_do_exit(tsk);  	preempt_disable(); diff --git a/kernel/fork.c b/kernel/fork.c index 5a0e74d89a5..8b20ab7d3aa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -330,6 +330,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)  	tsk->btrace_seq = 0;  #endif  	tsk->splice_pipe = NULL; +	tsk->task_frag.page = NULL;  	account_kernel_stack(ti, 1); @@ -422,7 +423,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)  				mapping->i_mmap_writable++;  			flush_dcache_mmap_lock(mapping);  			/* insert tmp into the share list, just after mpnt */ -			vma_prio_tree_add(tmp, mpnt); +			if (unlikely(tmp->vm_flags & VM_NONLINEAR)) +				vma_nonlinear_insert(tmp, +						&mapping->i_mmap_nonlinear); +			else +				vma_interval_tree_insert_after(tmp, mpnt, +							&mapping->i_mmap);  			flush_dcache_mmap_unlock(mapping);  			mutex_unlock(&mapping->i_mmap_mutex);  		} @@ -621,26 +627,6 @@ void mmput(struct mm_struct *mm)  }  EXPORT_SYMBOL_GPL(mmput); -/* - * We added or removed a vma mapping the executable. The vmas are only mapped - * during exec and are not mapped with the mmap system call. - * Callers must hold down_write() on the mm's mmap_sem for these - */ -void added_exe_file_vma(struct mm_struct *mm) -{ -	mm->num_exe_file_vmas++; -} - -void removed_exe_file_vma(struct mm_struct *mm) -{ -	mm->num_exe_file_vmas--; -	if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { -		fput(mm->exe_file); -		mm->exe_file = NULL; -	} - -} -  void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)  {  	if (new_exe_file) @@ -648,15 +634,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)  	if (mm->exe_file)  		fput(mm->exe_file);  	mm->exe_file = new_exe_file; -	mm->num_exe_file_vmas = 0;  }  struct file *get_mm_exe_file(struct mm_struct *mm)  {  	struct file *exe_file; -	/* We need mmap_sem to protect against races with removal of -	 * VM_EXECUTABLE vmas */ +	/* We need mmap_sem to protect against races with removal of exe_file */  	down_read(&mm->mmap_sem);  	exe_file = mm->exe_file;  	if (exe_file) @@ -1077,7 +1061,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)  	init_rwsem(&sig->group_rwsem);  #endif -	sig->oom_adj = current->signal->oom_adj;  	sig->oom_score_adj = current->signal->oom_score_adj;  	sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -1601,7 +1584,7 @@ long do_fork(unsigned long clone_flags,  	 * requested, no event is reported; otherwise, report if the event  	 * for the type of forking is enabled.  	 
*/ -	if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { +	if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) {  		if (clone_flags & CLONE_VFORK)  			trace = PTRACE_EVENT_VFORK;  		else if ((clone_flags & CSIGNAL) != SIGCHLD) @@ -1651,6 +1634,17 @@ long do_fork(unsigned long clone_flags,  	return nr;  } +#ifdef CONFIG_GENERIC_KERNEL_THREAD +/* + * Create a kernel thread. + */ +pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ +	return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL, +		(unsigned long)arg, NULL, NULL); +} +#endif +  #ifndef ARCH_MIN_MMSTRUCT_ALIGN  #define ARCH_MIN_MMSTRUCT_ALIGN 0  #endif diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 49a77727db4..4e69e24d3d7 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -148,7 +148,8 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,   * @host_data: Controller private data pointer   *   * Allocates a legacy irq_domain if irq_base is positive or a linear - * domain otherwise. + * domain otherwise. For the legacy domain, IRQ descriptors will also + * be allocated.   *   * This is intended to implement the expected behaviour for most   * interrupt controllers which is that a linear mapping should @@ -162,11 +163,33 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,  					 const struct irq_domain_ops *ops,  					 void *host_data)  { -	if (first_irq > 0) -		return irq_domain_add_legacy(of_node, size, first_irq, 0, +	if (first_irq > 0) { +		int irq_base; + +		if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { +			/* +			 * Set the descriptor allocator to search for a +			 * 1-to-1 mapping, such as irq_alloc_desc_at(). +			 * Use of_node_to_nid() which is defined to +			 * numa_node_id() on platforms that have no custom +			 * implementation. 
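+			 *
+			 * Illustrative example (hypothetical driver, values
+			 * not from this patch): a controller with 16 legacy
+			 * interrupts anchored at Linux IRQ 32 would call
+			 * irq_domain_add_simple(np, 16, 32, ops, priv); with
+			 * SPARSE_IRQ this reserves descriptors 32..47 here
+			 * via irq_alloc_descs() before the legacy domain is
+			 * created below.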
+			 */ +			irq_base = irq_alloc_descs(first_irq, first_irq, size, +						   of_node_to_nid(of_node)); +			if (irq_base < 0) { +				WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", +				     first_irq); +				irq_base = first_irq; +			} +		} else +			irq_base = first_irq; + +		return irq_domain_add_legacy(of_node, size, irq_base, 0,  					     ops, host_data); -	else -		return irq_domain_add_linear(of_node, size, ops, host_data); +	} + +	/* A linear domain is the default */ +	return irq_domain_add_linear(of_node, size, ops, host_data);  }  /** diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 43049192b5e..60f48fa0fd0 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -118,6 +118,7 @@ void jump_label_rate_limit(struct static_key_deferred *key,  	key->timeout = rl;  	INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);  } +EXPORT_SYMBOL_GPL(jump_label_rate_limit);  static int addr_conflict(struct jump_entry *entry, void *start, void *end)  { diff --git a/kernel/kexec.c b/kernel/kexec.c index 0668d58d641..5e4bd7864c5 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -21,7 +21,6 @@  #include <linux/hardirq.h>  #include <linux/elf.h>  #include <linux/elfcore.h> -#include <generated/utsrelease.h>  #include <linux/utsname.h>  #include <linux/numa.h>  #include <linux/suspend.h> diff --git a/kernel/kmod.c b/kernel/kmod.c index 6f99aead66c..1c317e38683 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -37,6 +37,7 @@  #include <linux/notifier.h>  #include <linux/suspend.h>  #include <linux/rwsem.h> +#include <linux/ptrace.h>  #include <asm/uaccess.h>  #include <trace/events/module.h> @@ -221,11 +222,13 @@ static int ____call_usermodehelper(void *data)  	retval = kernel_execve(sub_info->path,  			       (const char *const *)sub_info->argv,  			       (const char *const *)sub_info->envp); +	if (!retval) +		return 0;  	/* Exec failed? */  fail:  	sub_info->retval = retval; -	return 0; +	do_exit(0);  }  static int call_helper(void *data) @@ -292,7 +295,7 @@ static int wait_for_helper(void *data)  	}  	umh_complete(sub_info); -	return 0; +	do_exit(0);  }  /* This is run by khelper thread  */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 146a6fa9682..29fb60caecb 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -16,6 +16,7 @@  #include <linux/mutex.h>  #include <linux/slab.h>  #include <linux/freezer.h> +#include <linux/ptrace.h>  #include <trace/events/sched.h>  static DEFINE_SPINLOCK(kthread_create_lock); diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c new file mode 100644 index 00000000000..4646eb2c382 --- /dev/null +++ b/kernel/modsign_pubkey.c @@ -0,0 +1,113 @@ +/* Public keys for module signature verification + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/cred.h> +#include <linux/err.h> +#include <keys/asymmetric-type.h> +#include "module-internal.h" + +struct key *modsign_keyring; + +extern __initdata const u8 modsign_certificate_list[]; +extern __initdata const u8 modsign_certificate_list_end[]; +asm(".section .init.data,\"aw\"\n" +    "modsign_certificate_list:\n" +    ".incbin \"signing_key.x509\"\n" +    ".incbin \"extra_certificates\"\n" +    "modsign_certificate_list_end:" +    ); + +/* + * We need to make sure ccache doesn't cache the .o file as it doesn't notice + * if modsign.pub changes. + */ +static __initdata const char annoy_ccache[] = __TIME__ "foo"; + +/* + * Load the compiled-in keys + */ +static __init int module_verify_init(void) +{ +	pr_notice("Initialise module verification\n"); + +	modsign_keyring = key_alloc(&key_type_keyring, ".module_sign", +				    KUIDT_INIT(0), KGIDT_INIT(0), +				    current_cred(), +				    (KEY_POS_ALL & ~KEY_POS_SETATTR) | +				    KEY_USR_VIEW | KEY_USR_READ, +				    KEY_ALLOC_NOT_IN_QUOTA); +	if (IS_ERR(modsign_keyring)) +		panic("Can't allocate module signing keyring\n"); + +	if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0) +		panic("Can't instantiate module signing keyring\n"); + +	return 0; +} + +/* + * Must be initialised before we try and load the keys into the keyring. + */ +device_initcall(module_verify_init); + +/* + * Load the compiled-in keys + */ +static __init int load_module_signing_keys(void) +{ +	key_ref_t key; +	const u8 *p, *end; +	size_t plen; + +	pr_notice("Loading module verification certificates\n"); + +	end = modsign_certificate_list_end; +	p = modsign_certificate_list; +	while (p < end) { +		/* Each cert begins with an ASN.1 SEQUENCE tag and must be more +		 * than 256 bytes in size. +		 */ +		if (end - p < 4) +			goto dodgy_cert; +		if (p[0] != 0x30 && +		    p[1] != 0x82) +			goto dodgy_cert; +		plen = (p[2] << 8) | p[3]; +		plen += 4; +		if (plen > end - p) +			goto dodgy_cert; + +		key = key_create_or_update(make_key_ref(modsign_keyring, 1), +					   "asymmetric", +					   NULL, +					   p, +					   plen, +					   (KEY_POS_ALL & ~KEY_POS_SETATTR) | +					   KEY_USR_VIEW, +					   KEY_ALLOC_NOT_IN_QUOTA); +		if (IS_ERR(key)) +			pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n", +			       PTR_ERR(key)); +		else +			pr_notice("MODSIGN: Loaded cert '%s'\n", +				  key_ref_to_ptr(key)->description); +		p += plen; +	} + +	return 0; + +dodgy_cert: +	pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n"); +	return 0; +} +late_initcall(load_module_signing_keys); diff --git a/kernel/module-internal.h b/kernel/module-internal.h new file mode 100644 index 00000000000..24f9247b7d0 --- /dev/null +++ b/kernel/module-internal.h @@ -0,0 +1,14 @@ +/* Module internals + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +extern struct key *modsign_keyring; + +extern int mod_verify_sig(const void *mod, unsigned long *_modlen); diff --git a/kernel/module.c b/kernel/module.c index 4edbd9c11ac..6e48c3a4359 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -58,6 +58,8 @@  #include <linux/jump_label.h>  #include <linux/pfn.h>  #include <linux/bsearch.h> +#include <linux/fips.h> +#include "module-internal.h"  #define CREATE_TRACE_POINTS  #include <trace/events/module.h> @@ -102,6 +104,43 @@ static LIST_HEAD(modules);  struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */  #endif /* CONFIG_KGDB_KDB */ +#ifdef CONFIG_MODULE_SIG +#ifdef CONFIG_MODULE_SIG_FORCE +static bool sig_enforce = true; +#else +static bool sig_enforce = false; + +static int param_set_bool_enable_only(const char *val, +				      const struct kernel_param *kp) +{ +	int err; +	bool test; +	struct kernel_param dummy_kp = *kp; + +	dummy_kp.arg = &test; + +	err = param_set_bool(val, &dummy_kp); +	if (err) +		return err; + +	/* Don't let them unset it once it's set! */ +	if (!test && sig_enforce) +		return -EROFS; + +	if (test) +		sig_enforce = true; +	return 0; +} + +static const struct kernel_param_ops param_ops_bool_enable_only = { +	.set = param_set_bool_enable_only, +	.get = param_get_bool, +}; +#define param_check_bool_enable_only param_check_bool + +module_param(sig_enforce, bool_enable_only, 0644); +#endif /* !CONFIG_MODULE_SIG_FORCE */ +#endif /* CONFIG_MODULE_SIG */  /* Block module loading/unloading? */  int modules_disabled = 0; @@ -136,6 +175,7 @@ struct load_info {  	unsigned long symoffs, stroffs;  	struct _ddebug *debug;  	unsigned int num_debug; +	bool sig_ok;  	struct {  		unsigned int sym, str, mod, vers, info, pcpu;  	} index; @@ -1949,26 +1989,6 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)  	return ret;  } -int __weak apply_relocate(Elf_Shdr *sechdrs, -			  const char *strtab, -			  unsigned int symindex, -			  unsigned int relsec, -			  struct module *me) -{ -	pr_err("module %s: REL relocation unsupported\n", me->name); -	return -ENOEXEC; -} - -int __weak apply_relocate_add(Elf_Shdr *sechdrs, -			      const char *strtab, -			      unsigned int symindex, -			      unsigned int relsec, -			      struct module *me) -{ -	pr_err("module %s: RELA relocation unsupported\n", me->name); -	return -ENOEXEC; -} -  static int apply_relocations(struct module *mod, const struct load_info *info)  {  	unsigned int i; @@ -2273,12 +2293,17 @@ static void layout_symtab(struct module *mod, struct load_info *info)  	src = (void *)info->hdr + symsect->sh_offset;  	nsrc = symsect->sh_size / sizeof(*src); +	/* strtab always starts with a nul, so offset 0 is the empty string. */ +	strtab_size = 1; +  	/* Compute total space required for the core symbols' strtab. */ -	for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) -		if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { -			strtab_size += strlen(&info->strtab[src->st_name]) + 1; +	for (ndst = i = 0; i < nsrc; i++) { +		if (i == 0 || +		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { +			strtab_size += strlen(&info->strtab[src[i].st_name])+1;  			ndst++;  		} +	}  	/* Append room for core symbols at end of core part. 
*/  	info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); @@ -2312,15 +2337,15 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)  	mod->core_symtab = dst = mod->module_core + info->symoffs;  	mod->core_strtab = s = mod->module_core + info->stroffs;  	src = mod->symtab; -	*dst = *src;  	*s++ = 0; -	for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { -		if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) -			continue; - -		dst[ndst] = *src; -		dst[ndst++].st_name = s - mod->core_strtab; -		s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; +	for (ndst = i = 0; i < mod->num_symtab; i++) { +		if (i == 0 || +		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { +			dst[ndst] = src[i]; +			dst[ndst++].st_name = s - mod->core_strtab; +			s += strlcpy(s, &mod->strtab[src[i].st_name], +				     KSYM_NAME_LEN) + 1; +		}  	}  	mod->core_num_syms = ndst;  } @@ -2399,7 +2424,44 @@ static inline void kmemleak_load_module(const struct module *mod,  }  #endif -/* Sets info->hdr and info->len. */ +#ifdef CONFIG_MODULE_SIG +static int module_sig_check(struct load_info *info, +			    const void *mod, unsigned long *_len) +{ +	int err = -ENOKEY; +	unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; +	unsigned long len = *_len; + +	if (len > markerlen && +	    memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { +		/* We truncate the module to discard the signature */ +		*_len -= markerlen; +		err = mod_verify_sig(mod, _len); +	} + +	if (!err) { +		info->sig_ok = true; +		return 0; +	} + +	/* Not having a signature is only an error if we're strict. */ +	if (err < 0 && fips_enabled) +		panic("Module verification failed with error %d in FIPS mode\n", +		      err); +	if (err == -ENOKEY && !sig_enforce) +		err = 0; + +	return err; +} +#else /* !CONFIG_MODULE_SIG */ +static int module_sig_check(struct load_info *info, +			    void *mod, unsigned long *len) +{ +	return 0; +} +#endif /* !CONFIG_MODULE_SIG */ + +/* Sets info->hdr, info->len and info->sig_ok. */  static int copy_and_check(struct load_info *info,  			  const void __user *umod, unsigned long len,  			  const char __user *uargs) @@ -2419,6 +2481,10 @@ static int copy_and_check(struct load_info *info,  		goto free_hdr;  	} +	err = module_sig_check(info, hdr, &len); +	if (err) +		goto free_hdr; +  	/* Sanity checks against insmoding binaries or wrong arch,  	   weird elf version */  	if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 @@ -2730,6 +2796,10 @@ static int check_module_license_and_versions(struct module *mod)  	if (strcmp(mod->name, "driverloader") == 0)  		add_taint_module(mod, TAINT_PROPRIETARY_MODULE); +	/* lve claims to be GPL but upstream won't provide source */ +	if (strcmp(mod->name, "lve") == 0) +		add_taint_module(mod, TAINT_PROPRIETARY_MODULE); +  #ifdef CONFIG_MODVERSIONS  	if ((mod->num_syms && !mod->crcs)  	    || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2861,6 +2931,20 @@ static int post_relocation(struct module *mod, const struct load_info *info)  	return module_finalize(info->hdr, info->sechdrs, mod);  } +/* Is this module of this name done loading?  No locks held. */ +static bool finished_loading(const char *name) +{ +	struct module *mod; +	bool ret; + +	mutex_lock(&module_mutex); +	mod = find_module(name); +	ret = !mod || mod->state != MODULE_STATE_COMING; +	mutex_unlock(&module_mutex); + +	return ret; +} +  /* Allocate and load the module: note that size of section 0 is always     zero, and we rely on this for optional sections. 
*/  static struct module *load_module(void __user *umod, @@ -2868,7 +2952,7 @@ static struct module *load_module(void __user *umod,  				  const char __user *uargs)  {  	struct load_info info = { NULL, }; -	struct module *mod; +	struct module *mod, *old;  	long err;  	pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2886,6 +2970,12 @@ static struct module *load_module(void __user *umod,  		goto free_copy;  	} +#ifdef CONFIG_MODULE_SIG +	mod->sig_ok = info.sig_ok; +	if (!mod->sig_ok) +		add_taint_module(mod, TAINT_FORCED_MODULE); +#endif +  	/* Now module is in final location, initialize linked lists, etc. */  	err = module_unload_init(mod);  	if (err) @@ -2934,8 +3024,18 @@ static struct module *load_module(void __user *umod,  	 * function to insert in a way safe to concurrent readers.  	 * The mutex protects against concurrent writers.  	 */ +again:  	mutex_lock(&module_mutex); -	if (find_module(mod->name)) { +	if ((old = find_module(mod->name)) != NULL) { +		if (old->state == MODULE_STATE_COMING) { +			/* Wait in case it fails to load. */ +			mutex_unlock(&module_mutex); +			err = wait_event_interruptible(module_wq, +					       finished_loading(mod->name)); +			if (err) +				goto free_arch_cleanup; +			goto again; +		}  		err = -EEXIST;  		goto unlock;  	} @@ -2975,7 +3075,7 @@ static struct module *load_module(void __user *umod,  	/* Unlink carefully: kallsyms could be walking list. */  	list_del_rcu(&mod->list);  	module_bug_cleanup(mod); - +	wake_up_all(&module_wq);   ddebug:  	dynamic_debug_remove(info.debug);   unlock: @@ -3050,7 +3150,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,  		blocking_notifier_call_chain(&module_notify_list,  					     MODULE_STATE_GOING, mod);  		free_module(mod); -		wake_up(&module_wq); +		wake_up_all(&module_wq);  		return ret;  	}  	if (ret > 0) { @@ -3062,9 +3162,8 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,  		dump_stack();  	} -	/* Now it's a first class citizen!  Wake up anyone waiting for it. */ +	/* Now it's a first class citizen! */  	mod->state = MODULE_STATE_LIVE; -	wake_up(&module_wq);  	blocking_notifier_call_chain(&module_notify_list,  				     MODULE_STATE_LIVE, mod); @@ -3087,6 +3186,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,  	mod->init_ro_size = 0;  	mod->init_text_size = 0;  	mutex_unlock(&module_mutex); +	wake_up_all(&module_wq);  	return 0;  } diff --git a/kernel/module_signing.c b/kernel/module_signing.c new file mode 100644 index 00000000000..ea1b1df5dbb --- /dev/null +++ b/kernel/module_signing.c @@ -0,0 +1,249 @@ +/* Module signature checker + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/err.h> +#include <crypto/public_key.h> +#include <crypto/hash.h> +#include <keys/asymmetric-type.h> +#include "module-internal.h" + +/* + * Module signature information block. 
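+ *
+ * (Illustrative note, inferred from module_sig_check() and mod_verify_sig()
+ * rather than spelled out in the patch: this whole section is appended to
+ * the module image and followed by the MODULE_SIG_STRING magic, so the
+ * verifier locates this struct at the very end of the file and parses the
+ * section backwards from it.)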
+ * + * The constituents of the signature section are, in order: + * + *	- Signer's name + *	- Key identifier + *	- Signature data + *	- Information block + */ +struct module_signature { +	enum pkey_algo		algo : 8;	/* Public-key crypto algorithm */ +	enum pkey_hash_algo	hash : 8;	/* Digest algorithm */ +	enum pkey_id_type	id_type : 8;	/* Key identifier type */ +	u8			signer_len;	/* Length of signer's name */ +	u8			key_id_len;	/* Length of key identifier */ +	u8			__pad[3]; +	__be32			sig_len;	/* Length of signature data */ +}; + +/* + * Digest the module contents. + */ +static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, +						    const void *mod, +						    unsigned long modlen) +{ +	struct public_key_signature *pks; +	struct crypto_shash *tfm; +	struct shash_desc *desc; +	size_t digest_size, desc_size; +	int ret; + +	pr_devel("==>%s()\n", __func__); +	 +	/* Allocate the hashing algorithm we're going to need and find out how +	 * big the hash operational data will be. +	 */ +	tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); +	if (IS_ERR(tfm)) +		return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); + +	desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); +	digest_size = crypto_shash_digestsize(tfm); + +	/* We allocate the hash operational data storage on the end of our +	 * context data and the digest output buffer on the end of that. +	 */ +	ret = -ENOMEM; +	pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL); +	if (!pks) +		goto error_no_pks; + +	pks->pkey_hash_algo	= hash; +	pks->digest		= (u8 *)pks + sizeof(*pks) + desc_size; +	pks->digest_size	= digest_size; + +	desc = (void *)pks + sizeof(*pks); +	desc->tfm   = tfm; +	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + +	ret = crypto_shash_init(desc); +	if (ret < 0) +		goto error; + +	ret = crypto_shash_finup(desc, mod, modlen, pks->digest); +	if (ret < 0) +		goto error; + +	crypto_free_shash(tfm); +	pr_devel("<==%s() = ok\n", __func__); +	return pks; + +error: +	kfree(pks); +error_no_pks: +	crypto_free_shash(tfm); +	pr_devel("<==%s() = %d\n", __func__, ret); +	return ERR_PTR(ret); +} + +/* + * Extract an MPI array from the signature data.  This represents the actual + * signature.  Each raw MPI is prefaced by a BE 2-byte value indicating the + * size of the MPI in bytes. + * + * RSA signatures only have one MPI, so currently we only read one. + */ +static int mod_extract_mpi_array(struct public_key_signature *pks, +				 const void *data, size_t len) +{ +	size_t nbytes; +	MPI mpi; + +	if (len < 3) +		return -EBADMSG; +	nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1]; +	data += 2; +	len -= 2; +	if (len != nbytes) +		return -EBADMSG; + +	mpi = mpi_read_raw_data(data, nbytes); +	if (!mpi) +		return -ENOMEM; +	pks->mpi[0] = mpi; +	pks->nr_mpi = 1; +	return 0; +} + +/* + * Request an asymmetric key. + */ +static struct key *request_asymmetric_key(const char *signer, size_t signer_len, +					  const u8 *key_id, size_t key_id_len) +{ +	key_ref_t key; +	size_t i; +	char *id, *q; + +	pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len); + +	/* Construct an identifier. 
*/ +	id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL); +	if (!id) +		return ERR_PTR(-ENOKEY); + +	memcpy(id, signer, signer_len); + +	q = id + signer_len; +	*q++ = ':'; +	*q++ = ' '; +	for (i = 0; i < key_id_len; i++) { +		*q++ = hex_asc[*key_id >> 4]; +		*q++ = hex_asc[*key_id++ & 0x0f]; +	} + +	*q = 0; + +	pr_debug("Look up: \"%s\"\n", id); + +	key = keyring_search(make_key_ref(modsign_keyring, 1), +			     &key_type_asymmetric, id); +	if (IS_ERR(key)) +		pr_warn("Request for unknown module key '%s' err %ld\n", +			id, PTR_ERR(key)); +	kfree(id); + +	if (IS_ERR(key)) { +		switch (PTR_ERR(key)) { +			/* Hide some search errors */ +		case -EACCES: +		case -ENOTDIR: +		case -EAGAIN: +			return ERR_PTR(-ENOKEY); +		default: +			return ERR_CAST(key); +		} +	} + +	pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key))); +	return key_ref_to_ptr(key); +} + +/* + * Verify the signature on a module. + */ +int mod_verify_sig(const void *mod, unsigned long *_modlen) +{ +	struct public_key_signature *pks; +	struct module_signature ms; +	struct key *key; +	const void *sig; +	size_t modlen = *_modlen, sig_len; +	int ret; + +	pr_devel("==>%s(,%zu)\n", __func__, modlen); + +	if (modlen <= sizeof(ms)) +		return -EBADMSG; + +	memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms)); +	modlen -= sizeof(ms); + +	sig_len = be32_to_cpu(ms.sig_len); +	if (sig_len >= modlen) +		return -EBADMSG; +	modlen -= sig_len; +	if ((size_t)ms.signer_len + ms.key_id_len >= modlen) +		return -EBADMSG; +	modlen -= (size_t)ms.signer_len + ms.key_id_len; + +	*_modlen = modlen; +	sig = mod + modlen; + +	/* For the moment, only support RSA and X.509 identifiers */ +	if (ms.algo != PKEY_ALGO_RSA || +	    ms.id_type != PKEY_ID_X509) +		return -ENOPKG; + +	if (ms.hash >= PKEY_HASH__LAST || +	    !pkey_hash_algo[ms.hash]) +		return -ENOPKG; + +	key = request_asymmetric_key(sig, ms.signer_len, +				     sig + ms.signer_len, ms.key_id_len); +	if (IS_ERR(key)) +		return PTR_ERR(key); + +	pks = mod_make_digest(ms.hash, mod, modlen); +	if (IS_ERR(pks)) { +		ret = PTR_ERR(pks); +		goto error_put_key; +	} + +	ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len, +				    sig_len); +	if (ret < 0) +		goto error_free_pks; + +	ret = verify_signature(key, pks); +	pr_devel("verify_signature() = %d\n", ret); + +error_free_pks: +	mpi_free(pks->rsa.s); +	kfree(pks); +error_put_key: +	key_put(key); +	pr_devel("<==%s() = %d\n", __func__, ret); +	return ret;	 +} diff --git a/kernel/pid.c b/kernel/pid.c index e86b291ad83..aebd4f5aaf4 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -479,6 +479,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)  	}  	return nr;  } +EXPORT_SYMBOL_GPL(pid_nr_ns);  pid_t pid_vnr(struct pid *pid)  { diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6144bab8fd8..7b07cc0dfb7 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -16,6 +16,7 @@  #include <linux/slab.h>  #include <linux/proc_fs.h>  #include <linux/reboot.h> +#include <linux/export.h>  #define BITS_PER_PAGE		(PAGE_SIZE*8) @@ -70,12 +71,22 @@ err_alloc:  	return NULL;  } +/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ +#define MAX_PID_NS_LEVEL 32 +  static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)  {  	struct pid_namespace *ns;  	unsigned int level = parent_pid_ns->level + 1; -	int i, err = -ENOMEM; +	int i; +	int err; + +	if (level > MAX_PID_NS_LEVEL) { +		err = -EINVAL; +		goto out; +	} +	err = -ENOMEM;  	ns = 
kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);  	if (ns == NULL)  		goto out; @@ -132,18 +143,26 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old  	return create_pid_namespace(old_ns);  } -void free_pid_ns(struct kref *kref) +static void free_pid_ns(struct kref *kref)  { -	struct pid_namespace *ns, *parent; +	struct pid_namespace *ns;  	ns = container_of(kref, struct pid_namespace, kref); - -	parent = ns->parent;  	destroy_pid_namespace(ns); +} + +void put_pid_ns(struct pid_namespace *ns) +{ +	struct pid_namespace *parent; -	if (parent != NULL) -		put_pid_ns(parent); +	while (ns != &init_pid_ns) { +		parent = ns->parent; +		if (!kref_put(&ns->kref, free_pid_ns)) +			break; +		ns = parent; +	}  } +EXPORT_SYMBOL_GPL(put_pid_ns);  void zap_pid_ns_processes(struct pid_namespace *pid_ns)  { diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a70518c9d82..5dfdc9ea180 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -263,6 +263,10 @@ config PM_GENERIC_DOMAINS  	bool  	depends on PM +config PM_GENERIC_DOMAINS_SLEEP +	def_bool y +	depends on PM_SLEEP && PM_GENERIC_DOMAINS +  config PM_GENERIC_DOMAINS_RUNTIME  	def_bool y  	depends on PM_RUNTIME && PM_GENERIC_DOMAINS diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index d52359374e8..68197a4e8fc 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -37,7 +37,7 @@ static struct sysrq_key_op	sysrq_poweroff_op = {  	.enable_mask	= SYSRQ_ENABLE_BOOT,  }; -static int pm_sysrq_init(void) +static int __init pm_sysrq_init(void)  {  	register_sysrq_key('o', &sysrq_poweroff_op);  	return 0; diff --git a/kernel/power/process.c b/kernel/power/process.c index 19db29f6755..87da817f9e1 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -79,7 +79,7 @@ static int try_to_freeze_tasks(bool user_only)  		/*  		 * We need to retry, but first give the freezing tasks some -		 * time to enter the regrigerator. +		 * time to enter the refrigerator.  		 */  		msleep(10);  	} diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 6a031e68402..846bd42c7ed 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -139,6 +139,7 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)  	default:  		/* runtime check for not using enum */  		BUG(); +		return PM_QOS_DEFAULT_VALUE;  	}  } diff --git a/kernel/printk.c b/kernel/printk.c index 66a2ea37b57..2d607f4d179 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1890,7 +1890,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,  	switch (action) {  	case CPU_ONLINE:  	case CPU_DEAD: -	case CPU_DYING:  	case CPU_DOWN_FAILED:  	case CPU_UP_CANCELED:  		console_lock(); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a232bb59d93..1f5e55dda95 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -180,7 +180,8 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)  		return has_ns_capability(current, ns, CAP_SYS_PTRACE);  } -int __ptrace_may_access(struct task_struct *task, unsigned int mode) +/* Returns 0 on success, -errno on denial. 
*/ +static int __ptrace_may_access(struct task_struct *task, unsigned int mode)  {  	const struct cred *cred = current_cred(), *tcred; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4fb2376ddf0..74df86bd920 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -74,6 +74,7 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];  	.orphan_nxttail = &sname##_state.orphan_nxtlist, \  	.orphan_donetail = &sname##_state.orphan_donelist, \  	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ +	.onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \  	.name = #sname, \  } @@ -1197,7 +1198,7 @@ static int rcu_gp_init(struct rcu_state *rsp)  	raw_spin_unlock_irq(&rnp->lock);  	/* Exclude any concurrent CPU-hotplug operations. */ -	get_online_cpus(); +	mutex_lock(&rsp->onoff_mutex);  	/*  	 * Set the quiescent-state-needed bits in all the rcu_node @@ -1234,7 +1235,7 @@ static int rcu_gp_init(struct rcu_state *rsp)  		cond_resched();  	} -	put_online_cpus(); +	mutex_unlock(&rsp->onoff_mutex);  	return 1;  } @@ -1700,6 +1701,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)  	/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */  	/* Exclude any attempts to start a new grace period. */ +	mutex_lock(&rsp->onoff_mutex);  	raw_spin_lock_irqsave(&rsp->onofflock, flags);  	/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ @@ -1744,6 +1746,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)  	init_callback_list(rdp);  	/* Disallow further callbacks on this CPU. */  	rdp->nxttail[RCU_NEXT_TAIL] = NULL; +	mutex_unlock(&rsp->onoff_mutex);  }  #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -2648,6 +2651,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)  	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);  	struct rcu_node *rnp = rcu_get_root(rsp); +	/* Exclude new grace periods. */ +	mutex_lock(&rsp->onoff_mutex); +  	/* Set up local state, ensuring consistent view of global state. */  	raw_spin_lock_irqsave(&rnp->lock, flags);  	rdp->beenonline = 1;	 /* We have now been online. */ @@ -2662,14 +2668,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)  	rcu_prepare_for_idle_init(cpu);  	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */ -	/* -	 * A new grace period might start here.  If so, we won't be part -	 * of it, but that is OK, as we are currently in a quiescent state. -	 */ - -	/* Exclude any attempts to start a new GP on large systems. */ -	raw_spin_lock(&rsp->onofflock);		/* irqs already disabled. */ -  	/* Add CPU to rcu_node bitmasks. */  	rnp = rdp->mynode;  	mask = rdp->grpmask; @@ -2693,8 +2691,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)  		raw_spin_unlock(&rnp->lock); /* irqs already disabled. */  		rnp = rnp->parent;  	} while (rnp != NULL && !(rnp->qsmaskinit & mask)); +	local_irq_restore(flags); -	raw_spin_unlock_irqrestore(&rsp->onofflock, flags); +	mutex_unlock(&rsp->onoff_mutex);  }  static void __cpuinit rcu_prepare_cpu(int cpu) diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 5faf05d6832..a240f032848 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -394,11 +394,17 @@ struct rcu_state {  	struct rcu_head **orphan_donetail;	/* Tail of above. */  	long qlen_lazy;				/* Number of lazy callbacks. */  	long qlen;				/* Total number of callbacks. */ +	/* End of fields guarded by onofflock. */ + +	struct mutex onoff_mutex;		/* Coordinate hotplug & GPs. 
*/ +  	struct mutex barrier_mutex;		/* Guards barrier fields. */  	atomic_t barrier_cpu_count;		/* # CPUs waiting on. */  	struct completion barrier_completion;	/* Wake at barrier end. */  	unsigned long n_barrier_done;		/* ++ at start and end of */  						/*  _rcu_barrier(). */ +	/* End of fields guarded by barrier_mutex. */ +  	unsigned long jiffies_force_qs;		/* Time at which to invoke */  						/*  force_quiescent_state(). */  	unsigned long n_force_qs;		/* Number of calls to */ diff --git a/kernel/resource.c b/kernel/resource.c index 34d45886ee8..73f35d4b30b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct resource *root,  	struct resource *parent = root;  	struct resource *conflict;  	struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); +	struct resource *next_res = NULL;  	if (!res)  		return; @@ -772,21 +773,46 @@ static void __init __reserve_region_with_split(struct resource *root,  	res->end = end;  	res->flags = IORESOURCE_BUSY; -	conflict = __request_resource(parent, res); -	if (!conflict) -		return; +	while (1) { -	/* failed, split and try again */ -	kfree(res); +		conflict = __request_resource(parent, res); +		if (!conflict) { +			if (!next_res) +				break; +			res = next_res; +			next_res = NULL; +			continue; +		} -	/* conflict covered whole area */ -	if (conflict->start <= start && conflict->end >= end) -		return; +		/* conflict covered whole area */ +		if (conflict->start <= res->start && +				conflict->end >= res->end) { +			kfree(res); +			WARN_ON(next_res); +			break; +		} + +		/* failed, split and try again */ +		if (conflict->start > res->start) { +			end = res->end; +			res->end = conflict->start - 1; +			if (conflict->end < end) { +				next_res = kzalloc(sizeof(*next_res), +						GFP_ATOMIC); +				if (!next_res) { +					kfree(res); +					break; +				} +				next_res->name = name; +				next_res->start = conflict->end + 1; +				next_res->end = end; +				next_res->flags = IORESOURCE_BUSY; +			} +		} else { +			res->start = conflict->end + 1; +		} +	} -	if (conflict->start > start) -		__reserve_region_with_split(root, start, conflict->start-1, name); -	if (conflict->end < end) -		__reserve_region_with_split(root, conflict->end+1, end, name);  }  void __init reserve_region_with_split(struct resource *root, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8322d73b439..5dae0d252ff 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -505,7 +505,7 @@ static inline void init_hrtick(void)  #ifdef CONFIG_SMP  #ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#define tsk_is_polling(t) 0  #endif  void resched_task(struct task_struct *p) @@ -952,6 +952,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)  	trace_sched_migrate_task(p, new_cpu);  	if (task_cpu(p) != new_cpu) { +		if (p->sched_class->migrate_task_rq) +			p->sched_class->migrate_task_rq(p, new_cpu);  		p->se.nr_migrations++;  		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);  	} @@ -1524,6 +1526,15 @@ static void __sched_fork(struct task_struct *p)  	p->se.vruntime			= 0;  	INIT_LIST_HEAD(&p->se.group_node); +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). 
+ */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +	p->se.avg.runnable_avg_period = 0; +	p->se.avg.runnable_avg_sum = 0; +#endif  #ifdef CONFIG_SCHEDSTATS  	memset(&p->se.statistics, 0, sizeof(p->se.statistics));  #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 81b763ba58a..8d859dae5be 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);   * Called before incrementing preempt_count on {soft,}irq_enter   * and before decrementing preempt_count on {soft,}irq_exit.   */ -void vtime_account(struct task_struct *curr) +void irqtime_account_irq(struct task_struct *curr)  {  	unsigned long flags;  	s64 delta; @@ -73,7 +73,7 @@ void vtime_account(struct task_struct *curr)  	irq_time_write_end();  	local_irq_restore(flags);  } -EXPORT_SYMBOL_GPL(vtime_account); +EXPORT_SYMBOL_GPL(irqtime_account_irq);  static int irqtime_account_hi_update(void)  { @@ -433,10 +433,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)  	*st = cputime.stime;  } +void vtime_account_system(struct task_struct *tsk) +{ +	unsigned long flags; + +	local_irq_save(flags); +	__vtime_account_system(tsk); +	local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(vtime_account_system); +  /*   * Archs that account the whole time spent in the idle task   * (outside irq) as idle time can rely on this and just implement - * vtime_account_system() and vtime_account_idle(). Archs that + * __vtime_account_system() and __vtime_account_idle(). Archs that   * have other meaning of the idle time (s390 only includes the   * time spent by the CPU when it's in low power mode) must override   * vtime_account(). @@ -449,9 +459,9 @@ void vtime_account(struct task_struct *tsk)  	local_irq_save(flags);  	if (in_interrupt() || !is_idle_task(tsk)) -		vtime_account_system(tsk); +		__vtime_account_system(tsk);  	else -		vtime_account_idle(tsk); +		__vtime_account_idle(tsk);  	local_irq_restore(flags);  } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6f79596e0ea..2cd3c1b4e58 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -61,14 +61,20 @@ static unsigned long nsec_low(unsigned long long nsec)  static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)  {  	struct sched_entity *se = tg->se[cpu]; -	if (!se) -		return;  #define P(F) \  	SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)  #define PN(F) \  	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) +	if (!se) { +		struct sched_avg *avg = &cpu_rq(cpu)->avg; +		P(avg->runnable_avg_sum); +		P(avg->runnable_avg_period); +		return; +	} + +  	PN(se->exec_start);  	PN(se->vruntime);  	PN(se->sum_exec_runtime); @@ -85,6 +91,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group  	P(se->statistics.wait_count);  #endif  	P(se->load.weight); +#ifdef CONFIG_SMP +	P(se->avg.runnable_avg_sum); +	P(se->avg.runnable_avg_period); +	P(se->avg.load_avg_contrib); +	P(se->avg.decay_count); +#endif  #undef PN  #undef P  } @@ -206,14 +218,18 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)  	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);  #ifdef CONFIG_FAIR_GROUP_SCHED  #ifdef CONFIG_SMP -	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_avg", -			SPLIT_NS(cfs_rq->load_avg)); -	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "load_period", -			SPLIT_NS(cfs_rq->load_period)); -	SEQ_printf(m, "  .%-30s: %ld\n", "load_contrib", -			
cfs_rq->load_contribution); -	SEQ_printf(m, "  .%-30s: %d\n", "load_tg", -			atomic_read(&cfs_rq->tg->load_weight)); +	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg", +			cfs_rq->runnable_load_avg); +	SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg", +			cfs_rq->blocked_load_avg); +	SEQ_printf(m, "  .%-30s: %ld\n", "tg_load_avg", +			atomic64_read(&cfs_rq->tg->load_avg)); +	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_contrib", +			cfs_rq->tg_load_contrib); +	SEQ_printf(m, "  .%-30s: %d\n", "tg_runnable_contrib", +			cfs_rq->tg_runnable_contrib); +	SEQ_printf(m, "  .%-30s: %d\n", "tg->runnable_avg", +			atomic_read(&cfs_rq->tg->runnable_avg));  #endif  	print_cfs_group_stats(m, cpu, cfs_rq->tg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f936552b3db..59e072b2db9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -259,6 +259,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)  	return grp->my_q;  } +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, +				       int force_update); +  static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)  {  	if (!cfs_rq->on_list) { @@ -278,6 +281,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)  		}  		cfs_rq->on_list = 1; +		/* We should have no load, but we need to update last_decay. */ +		update_cfs_rq_blocked_load(cfs_rq, 0);  	}  } @@ -653,9 +658,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)  	return calc_delta_fair(sched_slice(cfs_rq, se), se);  } -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq); -  /*   * Update the current task's runtime statistics. Skip current tasks that   * are not in our scheduling class. 
@@ -675,10 +677,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,  	curr->vruntime += delta_exec_weighted;  	update_min_vruntime(cfs_rq); - -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -	cfs_rq->load_unacc_exec_time += delta_exec; -#endif  }  static void update_curr(struct cfs_rq *cfs_rq) @@ -801,72 +799,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)  }  #ifdef CONFIG_FAIR_GROUP_SCHED -/* we need this in update_cfs_load and load-balance functions below */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);  # ifdef CONFIG_SMP -static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, -					    int global_update) -{ -	struct task_group *tg = cfs_rq->tg; -	long load_avg; - -	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); -	load_avg -= cfs_rq->load_contribution; - -	if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { -		atomic_add(load_avg, &tg->load_weight); -		cfs_rq->load_contribution += load_avg; -	} -} - -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -	u64 period = sysctl_sched_shares_window; -	u64 now, delta; -	unsigned long load = cfs_rq->load.weight; - -	if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) -		return; - -	now = rq_of(cfs_rq)->clock_task; -	delta = now - cfs_rq->load_stamp; - -	/* truncate load history at 4 idle periods */ -	if (cfs_rq->load_stamp > cfs_rq->load_last && -	    now - cfs_rq->load_last > 4 * period) { -		cfs_rq->load_period = 0; -		cfs_rq->load_avg = 0; -		delta = period - 1; -	} - -	cfs_rq->load_stamp = now; -	cfs_rq->load_unacc_exec_time = 0; -	cfs_rq->load_period += delta; -	if (load) { -		cfs_rq->load_last = now; -		cfs_rq->load_avg += delta * load; -	} - -	/* consider updating load contribution on each fold or truncate */ -	if (global_update || cfs_rq->load_period > period -	    || !cfs_rq->load_period) -		update_cfs_rq_load_contribution(cfs_rq, global_update); - -	while (cfs_rq->load_period > period) { -		/* -		 * Inline assembly required to prevent the compiler -		 * optimising this loop into a divmod call. -		 * See __iter_div_u64_rem() for another example of this. -		 */ -		asm("" : "+rm" (cfs_rq->load_period)); -		cfs_rq->load_period /= 2; -		cfs_rq->load_avg /= 2; -	} - -	if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) -		list_del_leaf_cfs_rq(cfs_rq); -} -  static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)  {  	long tg_weight; @@ -876,8 +809,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)  	 * to gain a more accurate current total weight. See  	 * update_cfs_rq_load_contribution().  	 
*/ -	tg_weight = atomic_read(&tg->load_weight); -	tg_weight -= cfs_rq->load_contribution; +	tg_weight = atomic64_read(&tg->load_avg); +	tg_weight -= cfs_rq->tg_load_contrib;  	tg_weight += cfs_rq->load.weight;  	return tg_weight; @@ -901,27 +834,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)  	return shares;  } - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { -		update_cfs_load(cfs_rq, 0); -		update_cfs_shares(cfs_rq); -	} -}  # else /* CONFIG_SMP */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) -{ -} -  static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)  {  	return tg->shares;  } - -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ -}  # endif /* CONFIG_SMP */  static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,  			    unsigned long weight) @@ -939,6 +856,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,  		account_entity_enqueue(cfs_rq, se);  } +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); +  static void update_cfs_shares(struct cfs_rq *cfs_rq)  {  	struct task_group *tg; @@ -958,18 +877,478 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)  	reweight_entity(cfs_rq_of(se), se, shares);  }  #else /* CONFIG_FAIR_GROUP_SCHED */ -static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq)  {  } +#endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) +/* + * We choose a half-life close to 1 scheduling period. + * Note: The tables below are dependent on this value. + */ +#define LOAD_AVG_PERIOD 32 +#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ +#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ + +/* Precomputed fixed inverse multiplies for multiplication by y^n */ +static const u32 runnable_avg_yN_inv[] = { +	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, +	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, +	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, +	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, +	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, +	0x85aac367, 0x82cd8698, +}; + +/* + * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent + * over-estimates when re-combining. + */ +static const u32 runnable_avg_yN_sum[] = { +	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, +	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, +	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, +}; + +/* + * Approximate: + *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period) + */ +static __always_inline u64 decay_load(u64 val, u64 n)  { +	unsigned int local_n; + +	if (!n) +		return val; +	else if (unlikely(n > LOAD_AVG_PERIOD * 63)) +		return 0; + +	/* after bounds checking we can collapse to 32-bit */ +	local_n = n; + +	/* +	 * As y^PERIOD = 1/2, we can combine +	 *    y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) +	 * With a look-up table which covers k^n (n<PERIOD) +	 * +	 * To achieve constant time decay_load. 
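+	 *
+	 * Worked example (illustrative numbers only): for n = 40 with
+	 * LOAD_AVG_PERIOD = 32, the shift applies the one full half-life
+	 * (val >>= 1) and the remaining 8 periods use
+	 * runnable_avg_yN_inv[8] ~= 0.841 * 2^32, giving roughly
+	 * val * 0.5 * 0.841 ~= 0.42 * val, i.e. y^40.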
+	 */ +	if (unlikely(local_n >= LOAD_AVG_PERIOD)) { +		val >>= local_n / LOAD_AVG_PERIOD; +		local_n %= LOAD_AVG_PERIOD; +	} + +	val *= runnable_avg_yN_inv[local_n]; +	/* We don't use SRR here since we always want to round down. */ +	return val >> 32;  } -static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +/* + * For updates fully spanning n periods, the contribution to runnable + * average will be: \Sum 1024*y^n + * + * We can compute this reasonably efficiently by combining: + *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD} + */ +static u32 __compute_runnable_contrib(u64 n)  { +	u32 contrib = 0; + +	if (likely(n <= LOAD_AVG_PERIOD)) +		return runnable_avg_yN_sum[n]; +	else if (unlikely(n >= LOAD_AVG_MAX_N)) +		return LOAD_AVG_MAX; + +	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */ +	do { +		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */ +		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD]; + +		n -= LOAD_AVG_PERIOD; +	} while (n > LOAD_AVG_PERIOD); + +	contrib = decay_load(contrib, n); +	return contrib + runnable_avg_yN_sum[n];  } -#endif /* CONFIG_FAIR_GROUP_SCHED */ + +/* + * We can represent the historical contribution to runnable average as the + * coefficients of a geometric series.  To do this we sub-divide our runnable + * history into segments of approximately 1ms (1024us); label the segment that + * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g. + * + * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ... + *      p0            p1           p2 + *     (now)       (~1ms ago)  (~2ms ago) + * + * Let u_i denote the fraction of p_i that the entity was runnable. + * + * We then designate the fractions u_i as our co-efficients, yielding the + * following representation of historical load: + *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ... + * + * We choose y based on the with of a reasonably scheduling period, fixing: + *   y^32 = 0.5 + * + * This means that the contribution to load ~32ms ago (u_32) will be weighted + * approximately half as much as the contribution to load within the last ms + * (u_0). + * + * When a period "rolls over" and we have new u_0`, multiplying the previous + * sum again by y is sufficient to update: + *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) + *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] + */ +static __always_inline int __update_entity_runnable_avg(u64 now, +							struct sched_avg *sa, +							int runnable) +{ +	u64 delta, periods; +	u32 runnable_contrib; +	int delta_w, decayed = 0; + +	delta = now - sa->last_runnable_update; +	/* +	 * This should only happen when time goes backwards, which it +	 * unfortunately does during sched clock init when we swap over to TSC. +	 */ +	if ((s64)delta < 0) { +		sa->last_runnable_update = now; +		return 0; +	} + +	/* +	 * Use 1024ns as the unit of measurement since it's a reasonable +	 * approximation of 1us and fast to compute. +	 */ +	delta >>= 10; +	if (!delta) +		return 0; +	sa->last_runnable_update = now; + +	/* delta_w is the amount already accumulated against our next period */ +	delta_w = sa->runnable_avg_period % 1024; +	if (delta + delta_w >= 1024) { +		/* period roll-over */ +		decayed = 1; + +		/* +		 * Now that we know we're crossing a period boundary, figure +		 * out how much from delta we need to complete the current +		 * period and accrue it. 
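+		 *
+		 * Worked example (illustrative values only): with
+		 * delta_w = 300 already accrued and delta = 2500, we close
+		 * the current period with 1024 - 300 = 724, decay the
+		 * accumulated sums across periods + 1 = 2 boundaries, add
+		 * the precomputed contribution for the single intervening
+		 * full period (1002), and accrue the remaining 752 against
+		 * the new u_0.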
+		 */ +		delta_w = 1024 - delta_w; +		if (runnable) +			sa->runnable_avg_sum += delta_w; +		sa->runnable_avg_period += delta_w; + +		delta -= delta_w; + +		/* Figure out how many additional periods this update spans */ +		periods = delta / 1024; +		delta %= 1024; + +		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, +						  periods + 1); +		sa->runnable_avg_period = decay_load(sa->runnable_avg_period, +						     periods + 1); + +		/* Efficiently calculate \sum (1..n_period) 1024*y^i */ +		runnable_contrib = __compute_runnable_contrib(periods); +		if (runnable) +			sa->runnable_avg_sum += runnable_contrib; +		sa->runnable_avg_period += runnable_contrib; +	} + +	/* Remainder of delta accrued against u_0` */ +	if (runnable) +		sa->runnable_avg_sum += delta; +	sa->runnable_avg_period += delta; + +	return decayed; +} + +/* Synchronize an entity's decay with its parenting cfs_rq.*/ +static inline u64 __synchronize_entity_decay(struct sched_entity *se) +{ +	struct cfs_rq *cfs_rq = cfs_rq_of(se); +	u64 decays = atomic64_read(&cfs_rq->decay_counter); + +	decays -= se->avg.decay_count; +	if (!decays) +		return 0; + +	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); +	se->avg.decay_count = 0; + +	return decays; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, +						 int force_update) +{ +	struct task_group *tg = cfs_rq->tg; +	s64 tg_contrib; + +	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; +	tg_contrib -= cfs_rq->tg_load_contrib; + +	if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) { +		atomic64_add(tg_contrib, &tg->load_avg); +		cfs_rq->tg_load_contrib += tg_contrib; +	} +} + +/* + * Aggregate cfs_rq runnable averages into an equivalent task_group + * representation for computing load contributions. + */ +static inline void __update_tg_runnable_avg(struct sched_avg *sa, +						  struct cfs_rq *cfs_rq) +{ +	struct task_group *tg = cfs_rq->tg; +	long contrib; + +	/* The fraction of a cpu used by this cfs_rq */ +	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, +			  sa->runnable_avg_period + 1); +	contrib -= cfs_rq->tg_runnable_contrib; + +	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { +		atomic_add(contrib, &tg->runnable_avg); +		cfs_rq->tg_runnable_contrib += contrib; +	} +} + +static inline void __update_group_entity_contrib(struct sched_entity *se) +{ +	struct cfs_rq *cfs_rq = group_cfs_rq(se); +	struct task_group *tg = cfs_rq->tg; +	int runnable_avg; + +	u64 contrib; + +	contrib = cfs_rq->tg_load_contrib * tg->shares; +	se->avg.load_avg_contrib = div64_u64(contrib, +					     atomic64_read(&tg->load_avg) + 1); + +	/* +	 * For group entities we need to compute a correction term in the case +	 * that they are consuming <1 cpu so that we would contribute the same +	 * load as a task of equal weight. +	 * +	 * Explicitly co-ordinating this measurement would be expensive, but +	 * fortunately the sum of each cpus contribution forms a usable +	 * lower-bound on the true value. +	 * +	 * Consider the aggregate of 2 contributions.  Either they are disjoint +	 * (and the sum represents true value) or they are disjoint and we are +	 * understating by the aggregate of their overlap. +	 * +	 * Extending this to N cpus, for a given overlap, the maximum amount we +	 * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of +	 * cpus that overlap for this interval and w_i is the interval width. 
+	 * +	 * On a small machine; the first term is well-bounded which bounds the +	 * total error since w_i is a subset of the period.  Whereas on a +	 * larger machine, while this first term can be larger, if w_i is the +	 * of consequential size guaranteed to see n_i*w_i quickly converge to +	 * our upper bound of 1-cpu. +	 */ +	runnable_avg = atomic_read(&tg->runnable_avg); +	if (runnable_avg < NICE_0_LOAD) { +		se->avg.load_avg_contrib *= runnable_avg; +		se->avg.load_avg_contrib >>= NICE_0_SHIFT; +	} +} +#else +static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, +						 int force_update) {} +static inline void __update_tg_runnable_avg(struct sched_avg *sa, +						  struct cfs_rq *cfs_rq) {} +static inline void __update_group_entity_contrib(struct sched_entity *se) {} +#endif + +static inline void __update_task_entity_contrib(struct sched_entity *se) +{ +	u32 contrib; + +	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ +	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); +	contrib /= (se->avg.runnable_avg_period + 1); +	se->avg.load_avg_contrib = scale_load(contrib); +} + +/* Compute the current contribution to load_avg by se, return any delta */ +static long __update_entity_load_avg_contrib(struct sched_entity *se) +{ +	long old_contrib = se->avg.load_avg_contrib; + +	if (entity_is_task(se)) { +		__update_task_entity_contrib(se); +	} else { +		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); +		__update_group_entity_contrib(se); +	} + +	return se->avg.load_avg_contrib - old_contrib; +} + +static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, +						 long load_contrib) +{ +	if (likely(load_contrib < cfs_rq->blocked_load_avg)) +		cfs_rq->blocked_load_avg -= load_contrib; +	else +		cfs_rq->blocked_load_avg = 0; +} + +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); + +/* Update a sched_entity's runnable average */ +static inline void update_entity_load_avg(struct sched_entity *se, +					  int update_cfs_rq) +{ +	struct cfs_rq *cfs_rq = cfs_rq_of(se); +	long contrib_delta; +	u64 now; + +	/* +	 * For a group entity we need to use their owned cfs_rq_clock_task() in +	 * case they are the parent of a throttled hierarchy. +	 */ +	if (entity_is_task(se)) +		now = cfs_rq_clock_task(cfs_rq); +	else +		now = cfs_rq_clock_task(group_cfs_rq(se)); + +	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) +		return; + +	contrib_delta = __update_entity_load_avg_contrib(se); + +	if (!update_cfs_rq) +		return; + +	if (se->on_rq) +		cfs_rq->runnable_load_avg += contrib_delta; +	else +		subtract_blocked_load_contrib(cfs_rq, -contrib_delta); +} + +/* + * Decay the load contributed by all blocked children and account this so that + * their contribution may appropriately discounted when they wake up. 
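
For the task case in __update_task_entity_contrib() above, the contribution is simply the task's weight scaled by the fraction of time it was runnable: weight * runnable_avg_sum / (runnable_avg_period + 1). A minimal userspace illustration, assuming the usual nice-0 weight of 1024 (the scale_load() steps, typically a no-op at this resolution, are omitted):

#include <stdint.h>
#include <stdio.h>

/* userspace stand-in for the task-contribution arithmetic */
static uint32_t task_contrib(uint32_t weight, uint32_t avg_sum,
			     uint32_t avg_period)
{
	return (uint64_t)avg_sum * weight / (avg_period + 1);
}

int main(void)
{
	/* a nice-0 task (weight 1024) runnable about half the time */
	printf("half runnable : %u\n", task_contrib(1024, 23871, 47742));
	/* fully runnable: contributes roughly its full weight */
	printf("fully runnable: %u\n", task_contrib(1024, 47742, 47742));
	return 0;
}
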
+ */ +static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) +{ +	u64 now = cfs_rq_clock_task(cfs_rq) >> 20; +	u64 decays; + +	decays = now - cfs_rq->last_decay; +	if (!decays && !force_update) +		return; + +	if (atomic64_read(&cfs_rq->removed_load)) { +		u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0); +		subtract_blocked_load_contrib(cfs_rq, removed_load); +	} + +	if (decays) { +		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, +						      decays); +		atomic64_add(decays, &cfs_rq->decay_counter); +		cfs_rq->last_decay = now; +	} + +	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update); +	update_cfs_shares(cfs_rq); +} + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ +	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable); +	__update_tg_runnable_avg(&rq->avg, &rq->cfs); +} + +/* Add the load generated by se into cfs_rq's child load-average */ +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, +						  struct sched_entity *se, +						  int wakeup) +{ +	/* +	 * We track migrations using entity decay_count <= 0, on a wake-up +	 * migration we use a negative decay count to track the remote decays +	 * accumulated while sleeping. +	 */ +	if (unlikely(se->avg.decay_count <= 0)) { +		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; +		if (se->avg.decay_count) { +			/* +			 * In a wake-up migration we have to approximate the +			 * time sleeping.  This is because we can't synchronize +			 * clock_task between the two cpus, and it is not +			 * guaranteed to be read-safe.  Instead, we can +			 * approximate this using our carried decays, which are +			 * explicitly atomically readable. +			 */ +			se->avg.last_runnable_update -= (-se->avg.decay_count) +							<< 20; +			update_entity_load_avg(se, 0); +			/* Indicate that we're now synchronized and on-rq */ +			se->avg.decay_count = 0; +		} +		wakeup = 0; +	} else { +		__synchronize_entity_decay(se); +	} + +	/* migrated tasks did not contribute to our blocked load */ +	if (wakeup) { +		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); +		update_entity_load_avg(se, 0); +	} + +	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; +	/* we force update consideration on load-balancer moves */ +	update_cfs_rq_blocked_load(cfs_rq, !wakeup); +} + +/* + * Remove se's load from this cfs_rq child load-average, if the entity is + * transitioning to a blocked state we track its projected decay using + * blocked_load_avg. + */ +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, +						  struct sched_entity *se, +						  int sleep) +{ +	update_entity_load_avg(se, 1); +	/* we force update consideration on load-balancer moves */ +	update_cfs_rq_blocked_load(cfs_rq, !sleep); + +	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; +	if (sleep) { +		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; +		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); +	} /* migrations, e.g. 
sleep=0 leave decay_count == 0 */ +} +#else +static inline void update_entity_load_avg(struct sched_entity *se, +					  int update_cfs_rq) {} +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} +static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, +					   struct sched_entity *se, +					   int wakeup) {} +static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, +					   struct sched_entity *se, +					   int sleep) {} +static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, +					      int force_update) {} +#endif  static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)  { @@ -1096,9 +1475,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	 * Update run-time statistics of the 'current'.  	 */  	update_curr(cfs_rq); -	update_cfs_load(cfs_rq, 0);  	account_entity_enqueue(cfs_rq, se); -	update_cfs_shares(cfs_rq); +	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);  	if (flags & ENQUEUE_WAKEUP) {  		place_entity(cfs_rq, se, 0); @@ -1190,9 +1568,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	if (se != cfs_rq->curr)  		__dequeue_entity(cfs_rq, se); -	se->on_rq = 0; -	update_cfs_load(cfs_rq, 0);  	account_entity_dequeue(cfs_rq, se); +	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);  	/*  	 * Normalize the entity after updating the min_vruntime because the @@ -1206,7 +1583,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  	return_cfs_rq_runtime(cfs_rq);  	update_min_vruntime(cfs_rq); -	update_cfs_shares(cfs_rq); +	se->on_rq = 0;  }  /* @@ -1340,6 +1717,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)  		update_stats_wait_start(cfs_rq, prev);  		/* Put 'current' back into the tree. */  		__enqueue_entity(cfs_rq, prev); +		/* in !on_rq case, update occurred at dequeue */ +		update_entity_load_avg(prev, 1);  	}  	cfs_rq->curr = NULL;  } @@ -1353,9 +1732,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)  	update_curr(cfs_rq);  	/* -	 * Update share accounting for long-running entities. +	 * Ensure that runnable average is periodically updated.  	 
*/ -	update_entity_shares_tick(cfs_rq); +	update_entity_load_avg(curr, 1); +	update_cfs_rq_blocked_load(cfs_rq, 1);  #ifdef CONFIG_SCHED_HRTICK  	/* @@ -1448,6 +1828,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)  	return &tg->cfs_bandwidth;  } +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ +	if (unlikely(cfs_rq->throttle_count)) +		return cfs_rq->throttled_clock_task; + +	return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time; +} +  /* returns 0 on failure to allocate runtime */  static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)  { @@ -1592,14 +1981,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)  	cfs_rq->throttle_count--;  #ifdef CONFIG_SMP  	if (!cfs_rq->throttle_count) { -		u64 delta = rq->clock_task - cfs_rq->load_stamp; - -		/* leaving throttled state, advance shares averaging windows */ -		cfs_rq->load_stamp += delta; -		cfs_rq->load_last += delta; - -		/* update entity weight now that we are on_rq again */ -		update_cfs_shares(cfs_rq); +		/* adjust cfs_rq_clock_task() */ +		cfs_rq->throttled_clock_task_time += rq->clock_task - +					     cfs_rq->throttled_clock_task;  	}  #endif @@ -1611,9 +1995,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)  	struct rq *rq = data;  	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; -	/* group is entering throttled state, record last load */ +	/* group is entering throttled state, stop time */  	if (!cfs_rq->throttle_count) -		update_cfs_load(cfs_rq, 0); +		cfs_rq->throttled_clock_task = rq->clock_task;  	cfs_rq->throttle_count++;  	return 0; @@ -1628,7 +2012,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; -	/* account load preceding throttle */ +	/* freeze hierarchy runnable averages while throttled */  	rcu_read_lock();  	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);  	rcu_read_unlock(); @@ -1652,7 +2036,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  		rq->nr_running -= task_delta;  	cfs_rq->throttled = 1; -	cfs_rq->throttled_timestamp = rq->clock; +	cfs_rq->throttled_clock = rq->clock;  	raw_spin_lock(&cfs_b->lock);  	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);  	raw_spin_unlock(&cfs_b->lock); @@ -1670,10 +2054,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  	cfs_rq->throttled = 0;  	raw_spin_lock(&cfs_b->lock); -	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; +	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;  	list_del_rcu(&cfs_rq->throttled_list);  	raw_spin_unlock(&cfs_b->lock); -	cfs_rq->throttled_timestamp = 0;  	update_rq_clock(rq);  	/* update hierarchical throttle state */ @@ -2073,8 +2456,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq)  }  #else /* CONFIG_CFS_BANDWIDTH */ -static __always_inline -void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) +{ +	return rq_of(cfs_rq)->clock_task; +} + +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, +				     unsigned long delta_exec) {}  static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}  static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}  static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -2207,12 +2595,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  		if 
(cfs_rq_throttled(cfs_rq))  			break; -		update_cfs_load(cfs_rq, 0); -		update_cfs_shares(cfs_rq); +		update_entity_load_avg(se, 1); +		update_cfs_rq_blocked_load(cfs_rq, 0);  	} -	if (!se) +	if (!se) { +		update_rq_runnable_avg(rq, rq->nr_running);  		inc_nr_running(rq); +	}  	hrtick_update(rq);  } @@ -2266,12 +2656,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)  		if (cfs_rq_throttled(cfs_rq))  			break; -		update_cfs_load(cfs_rq, 0); -		update_cfs_shares(cfs_rq); +		update_entity_load_avg(se, 1); +		update_cfs_rq_blocked_load(cfs_rq, 0);  	} -	if (!se) +	if (!se) {  		dec_nr_running(rq); +		update_rq_runnable_avg(rq, 1); +	}  	hrtick_update(rq);  } @@ -2781,6 +3173,37 @@ unlock:  	return new_cpu;  } + +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Called immediately before a task is migrated to a new cpu; task_cpu(p) and + * cfs_rq_of(p) references at time of call are still valid and identify the + * previous cpu.  However, the caller only guarantees p->pi_lock is held; no + * other assumptions, including the state of rq->lock, should be made. + */ +static void +migrate_task_rq_fair(struct task_struct *p, int next_cpu) +{ +	struct sched_entity *se = &p->se; +	struct cfs_rq *cfs_rq = cfs_rq_of(se); + +	/* +	 * Load tracking: accumulate removed load so that it can be processed +	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute +	 * to blocked load iff they have a positive decay-count.  It can never +	 * be negative here since on-rq tasks have decay-count == 0. +	 */ +	if (se->avg.decay_count) { +		se->avg.decay_count = -__synchronize_entity_decay(se); +		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); +	} +} +#endif  #endif /* CONFIG_SMP */  static unsigned long @@ -3033,8 +3456,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp  #ifdef CONFIG_SMP  /************************************************** - * Fair scheduling class load-balancing methods: - */ + * Fair scheduling class load-balancing methods. + * + * BASICS + * + * The purpose of load-balancing is to achieve the same basic fairness the + * per-cpu scheduler provides, namely provide a proportional amount of compute + * time to each task. This is expressed in the following equation: + * + *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1) + * + * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight + * W_i,0 is defined as: + * + *   W_i,0 = \Sum_j w_i,j                                             (2) + * + * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight + * is derived from the nice value as per prio_to_weight[]. + * + * The weight average is an exponential decay average of the instantaneous + * weight: + * + *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3) + * + * P_i is the cpu power (or compute capacity) of cpu i, typically it is the + * fraction of 'recent' time available for SCHED_OTHER task execution. But it + * can also include other factors [XXX]. + * + * To achieve this balance we define a measure of imbalance which follows + * directly from (1): + * + *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4) + * + * We them move tasks around to minimize the imbalance. 
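
A concrete reading of (1)-(4), as a throwaway userspace sketch rather than anything from the patch: two CPUs of equal power P = 1024, two nice-0 tasks on cpu0 (W_0 = 2048) and one on cpu1 (W_1 = 1024) give avg(W/P) = 1.5 and imb_0,1 = max{1.5, 2} - min{1.5, 1} = 1, i.e. one nice-0 task's worth of weight per unit power wants to move:

#include <stdio.h>

int main(void)
{
	/* assumed example values, not taken from the patch */
	double W[2] = { 2048.0, 1024.0 }, P = 1024.0;
	double avg = (W[0] / P + W[1] / P) / 2.0;
	double imb = (W[0] / P > avg ? W[0] / P : avg) -
		     (W[1] / P < avg ? W[1] / P : avg);

	printf("avg(W/P) = %.2f, imb_0,1 = %.2f\n", avg, imb);
	return 0;
}
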
In the continuous + * function space it is obvious this converges, in the discrete case we get + * a few fun cases generally called infeasible weight scenarios. + * + * [XXX expand on: + *     - infeasible weights; + *     - local vs global optima in the discrete case. ] + * + * + * SCHED DOMAINS + * + * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) + * for all i,j solution, we create a tree of cpus that follows the hardware + * topology where each level pairs two lower groups (or better). This results + * in O(log n) layers. Furthermore we reduce the number of cpus going up the + * tree to only the first of the previous level and we decrease the frequency + * of load-balance at each level inv. proportional to the number of cpus in + * the groups. + * + * This yields: + * + *     log_2 n     1     n + *   \Sum       { --- * --- * 2^i } = O(n)                            (5) + *     i = 0      2^i   2^i + *                               `- size of each group + *         |         |     `- number of cpus doing load-balance + *         |         `- freq + *         `- sum over all levels + * + * Coupled with a limit on how many tasks we can migrate every balance pass, + * this makes (5) the runtime complexity of the balancer. + * + * An important property here is that each CPU is still (indirectly) connected + * to every other cpu in at most O(log n) steps: + * + * The adjacency matrix of the resulting graph is given by: + * + *             log_2 n      + *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6) + *             k = 0 + * + * And you'll find that: + * + *   A^(log_2 n)_i,j != 0  for all i,j                                (7) + * + * Showing there's indeed a path between every cpu in at most O(log n) steps. + * The task movement gives a factor of O(m), giving a convergence complexity + * of: + * + *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8) + * + * + * WORK CONSERVING + * + * In order to avoid CPUs going idle while there's still work to do, new idle + * balancing is more aggressive and has the newly idle cpu iterate up the domain + * tree itself instead of relying on other CPUs to bring it work. + * + * This adds some complexity to both (5) and (8) but it reduces the total idle + * time. + * + * [XXX more?] + * + * + * CGROUPS + * + * Cgroups make a horror show out of (2), instead of a simple sum we get: + * + *                                s_k,i + *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9) + *                                 S_k + * + * Where + * + *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10) + * + * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. + * + * The big problem is S_k, its a global sum needed to compute a local (W_i) + * property. + * + * [XXX write more on how we solve this.. _after_ merging pjt's patches that + *      rewrite all of this once again.] 
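
To make (9)/(10) concrete for a single cgroup: a group with shares w_k whose runnable weight splits s_k,0 : s_k,1 across two CPUs contributes w_k * s_k,i / S_k to each W_i,0, and S_k is exactly the global sum the comment calls out as the hard part. A minimal sketch under assumed numbers (one group, two CPUs, not from the patch):

#include <stdio.h>

int main(void)
{
	/* assumed: one cgroup with shares 1024, one nice-0 task on cpu0
	 * and three on cpu1, so s_0 = 1024, s_1 = 3072, S = 4096 */
	double shares = 1024.0, s[2] = { 1024.0, 3072.0 };
	double S = s[0] + s[1];
	int i;

	for (i = 0; i < 2; i++)
		printf("group weight seen on cpu%d: %.0f\n",
		       i, shares * s[i] / S);
	return 0;
}
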
+ */   static unsigned long __read_mostly max_load_balance_interval = HZ/10; @@ -3300,52 +3837,58 @@ next:  /*   * update tg->load_weight by folding this cpu's load_avg   */ -static int update_shares_cpu(struct task_group *tg, int cpu) +static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)  { -	struct cfs_rq *cfs_rq; -	unsigned long flags; -	struct rq *rq; - -	if (!tg->se[cpu]) -		return 0; - -	rq = cpu_rq(cpu); -	cfs_rq = tg->cfs_rq[cpu]; - -	raw_spin_lock_irqsave(&rq->lock, flags); - -	update_rq_clock(rq); -	update_cfs_load(cfs_rq, 1); +	struct sched_entity *se = tg->se[cpu]; +	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; -	/* -	 * We need to update shares after updating tg->load_weight in -	 * order to adjust the weight of groups with long running tasks. -	 */ -	update_cfs_shares(cfs_rq); +	/* throttled entities do not contribute to load */ +	if (throttled_hierarchy(cfs_rq)) +		return; -	raw_spin_unlock_irqrestore(&rq->lock, flags); +	update_cfs_rq_blocked_load(cfs_rq, 1); -	return 0; +	if (se) { +		update_entity_load_avg(se, 1); +		/* +		 * We pivot on our runnable average having decayed to zero for +		 * list removal.  This generally implies that all our children +		 * have also been removed (modulo rounding error or bandwidth +		 * control); however, such cases are rare and we can fix these +		 * at enqueue. +		 * +		 * TODO: fix up out-of-order children on enqueue. +		 */ +		if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running) +			list_del_leaf_cfs_rq(cfs_rq); +	} else { +		struct rq *rq = rq_of(cfs_rq); +		update_rq_runnable_avg(rq, rq->nr_running); +	}  } -static void update_shares(int cpu) +static void update_blocked_averages(int cpu)  { -	struct cfs_rq *cfs_rq;  	struct rq *rq = cpu_rq(cpu); +	struct cfs_rq *cfs_rq; +	unsigned long flags; -	rcu_read_lock(); +	raw_spin_lock_irqsave(&rq->lock, flags); +	update_rq_clock(rq);  	/*  	 * Iterates the task_group tree in a bottom up fashion, see  	 * list_add_leaf_cfs_rq() for details.  	 */  	for_each_leaf_cfs_rq(rq, cfs_rq) { -		/* throttled entities do not contribute to load */ -		if (throttled_hierarchy(cfs_rq)) -			continue; - -		update_shares_cpu(cfs_rq->tg, cpu); +		/* +		 * Note: We may want to consider periodically releasing +		 * rq->lock about these updates so that creating many task +		 * groups does not result in continually extending hold time. +		 */ +		__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);  	} -	rcu_read_unlock(); + +	raw_spin_unlock_irqrestore(&rq->lock, flags);  }  /* @@ -3397,7 +3940,7 @@ static unsigned long task_h_load(struct task_struct *p)  	return load;  }  #else -static inline void update_shares(int cpu) +static inline void update_blocked_averages(int cpu)  {  } @@ -4457,12 +5000,14 @@ void idle_balance(int this_cpu, struct rq *this_rq)  	if (this_rq->avg_idle < sysctl_sched_migration_cost)  		return; +	update_rq_runnable_avg(this_rq, 1); +  	/*  	 * Drop the rq->lock, but keep IRQ/preempt disabled.  	 
*/  	raw_spin_unlock(&this_rq->lock); -	update_shares(this_cpu); +	update_blocked_averages(this_cpu);  	rcu_read_lock();  	for_each_domain(this_cpu, sd) {  		unsigned long interval; @@ -4717,7 +5262,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)  	int update_next_balance = 0;  	int need_serialize; -	update_shares(cpu); +	update_blocked_averages(cpu);  	rcu_read_lock();  	for_each_domain(cpu, sd) { @@ -4954,6 +5499,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)  		cfs_rq = cfs_rq_of(se);  		entity_tick(cfs_rq, se, queued);  	} + +	update_rq_runnable_avg(rq, 1);  }  /* @@ -5046,6 +5593,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)  		place_entity(cfs_rq, se, 0);  		se->vruntime -= cfs_rq->min_vruntime;  	} + +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +	/* +	* Remove our load from contribution when we leave sched_fair +	* and ensure we don't carry in an old decay_count if we +	* switch back. +	*/ +	if (p->se.avg.decay_count) { +		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); +		__synchronize_entity_decay(&p->se); +		subtract_blocked_load_contrib(cfs_rq, +				p->se.avg.load_avg_contrib); +	} +#endif  }  /* @@ -5092,11 +5653,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)  #ifndef CONFIG_64BIT  	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;  #endif +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) +	atomic64_set(&cfs_rq->decay_counter, 1); +	atomic64_set(&cfs_rq->removed_load, 0); +#endif  }  #ifdef CONFIG_FAIR_GROUP_SCHED  static void task_move_group_fair(struct task_struct *p, int on_rq)  { +	struct cfs_rq *cfs_rq;  	/*  	 * If the task was not on the rq at the time of this cgroup movement  	 * it must have been asleep, sleeping tasks keep their ->vruntime @@ -5128,8 +5694,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)  	if (!on_rq)  		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;  	set_task_rq(p, task_cpu(p)); -	if (!on_rq) -		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; +	if (!on_rq) { +		cfs_rq = cfs_rq_of(&p->se); +		p->se.vruntime += cfs_rq->min_vruntime; +#ifdef CONFIG_SMP +		/* +		 * migrate_task_rq_fair() will have removed our previous +		 * contribution, but we must synchronize for ongoing future +		 * decay. 
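
The decay_count bookkeeping used just below (and in migrate_task_rq_fair() earlier) stamps the entity with the cfs_rq's decay_counter when it leaves, then on re-attach decays the stale load_avg_contrib by however many ~1ms periods the counter has advanced since. A userspace sketch of that catch-up step; the names and numbers are illustrative only, and pow() stands in for the integer decay_load():

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t decay_counter = 1000;	/* cfs_rq counter now */
	uint64_t stamped = 936;		/* counter value stored at departure */
	uint64_t contrib = 1023;	/* stale load_avg_contrib */
	uint64_t periods = decay_counter - stamped;

	/* decay by y^periods with y^32 == 1/2 */
	double decayed = contrib * pow(0.5, (double)periods / 32.0);

	printf("~%llu ms elapsed: contrib %llu -> %.0f\n",
	       (unsigned long long)periods,
	       (unsigned long long)contrib, decayed);
	return 0;
}
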
+		 */ +		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); +		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; +#endif +	}  }  void free_fair_sched_group(struct task_group *tg) @@ -5214,10 +5791,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,  	cfs_rq->tg = tg;  	cfs_rq->rq = rq; -#ifdef CONFIG_SMP -	/* allow initial update_cfs_load() to truncate */ -	cfs_rq->load_stamp = 1; -#endif  	init_cfs_rq_runtime(cfs_rq);  	tg->cfs_rq[cpu] = cfs_rq; @@ -5264,8 +5837,11 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)  		se = tg->se[i];  		/* Propagate contribution to hierarchy */  		raw_spin_lock_irqsave(&rq->lock, flags); -		for_each_sched_entity(se) +		for_each_sched_entity(se) {  			update_cfs_shares(group_cfs_rq(se)); +			/* update contribution to parent */ +			update_entity_load_avg(se, 1); +		}  		raw_spin_unlock_irqrestore(&rq->lock, flags);  	} @@ -5319,7 +5895,9 @@ const struct sched_class fair_sched_class = {  #ifdef CONFIG_SMP  	.select_task_rq		= select_task_rq_fair, - +#ifdef CONFIG_FAIR_GROUP_SCHED +	.migrate_task_rq	= migrate_task_rq_fair, +#endif  	.rq_online		= rq_online_fair,  	.rq_offline		= rq_offline_fair, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7a7db09cfab..5eca173b563 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -112,6 +112,8 @@ struct task_group {  	unsigned long shares;  	atomic_t load_weight; +	atomic64_t load_avg; +	atomic_t runnable_avg;  #endif  #ifdef CONFIG_RT_GROUP_SCHED @@ -222,22 +224,29 @@ struct cfs_rq {  	unsigned int nr_spread_over;  #endif +#ifdef CONFIG_SMP +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */  #ifdef CONFIG_FAIR_GROUP_SCHED -	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */ -  	/* -	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in -	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities -	 * (like users, containers etc.) -	 * -	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This -	 * list is used during load balance. +	 * CFS Load tracking +	 * Under CFS, load is tracked on a per-entity basis and aggregated up. +	 * This allows for the description of both thread and group usage (in +	 * the FAIR_GROUP_SCHED case).  	 */ -	int on_list; -	struct list_head leaf_cfs_rq_list; -	struct task_group *tg;	/* group that "owns" this runqueue */ +	u64 runnable_load_avg, blocked_load_avg; +	atomic64_t decay_counter, removed_load; +	u64 last_decay; +#endif /* CONFIG_FAIR_GROUP_SCHED */ +/* These always depend on CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED +	u32 tg_runnable_contrib; +	u64 tg_load_contrib; +#endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_SMP  	/*  	 *   h_load = weight * f(tg)  	 * @@ -245,26 +254,30 @@ struct cfs_rq {  	 * this group.  	 */  	unsigned long h_load; +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED +	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */  	/* -	 * Maintaining per-cpu shares distribution for group scheduling +	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in +	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities +	 * (like users, containers etc.)  	 
* -	 * load_stamp is the last time we updated the load average -	 * load_last is the last time we updated the load average and saw load -	 * load_unacc_exec_time is currently unaccounted execution time +	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This +	 * list is used during load balance.  	 */ -	u64 load_avg; -	u64 load_period; -	u64 load_stamp, load_last, load_unacc_exec_time; +	int on_list; +	struct list_head leaf_cfs_rq_list; +	struct task_group *tg;	/* group that "owns" this runqueue */ -	unsigned long load_contribution; -#endif /* CONFIG_SMP */  #ifdef CONFIG_CFS_BANDWIDTH  	int runtime_enabled;  	u64 runtime_expires;  	s64 runtime_remaining; -	u64 throttled_timestamp; +	u64 throttled_clock, throttled_clock_task; +	u64 throttled_clock_task_time;  	int throttled, throttle_count;  	struct list_head throttled_list;  #endif /* CONFIG_CFS_BANDWIDTH */ @@ -467,6 +480,8 @@ struct rq {  #ifdef CONFIG_SMP  	struct llist_head wake_list;  #endif + +	struct sched_avg avg;  };  static inline int cpu_of(struct rq *rq) @@ -1212,4 +1227,3 @@ static inline u64 irq_time_read(int cpu)  }  #endif /* CONFIG_64BIT */  #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - diff --git a/kernel/signal.c b/kernel/signal.c index 2c681f11b7d..0af8868525d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -17,6 +17,7 @@  #include <linux/fs.h>  #include <linux/tty.h>  #include <linux/binfmts.h> +#include <linux/coredump.h>  #include <linux/security.h>  #include <linux/syscalls.h>  #include <linux/ptrace.h> @@ -2359,7 +2360,7 @@ relock:  			 * first and our do_group_exit call below will use  			 * that value and ignore the one we pass it.  			 */ -			do_coredump(info->si_signo, info->si_signo, regs); +			do_coredump(info, regs);  		}  		/* diff --git a/kernel/softirq.c b/kernel/softirq.c index cc96bdc0c2c..ed567babe78 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)  	current->flags &= ~PF_MEMALLOC;  	pending = local_softirq_pending(); -	vtime_account(current); +	vtime_account_irq_enter(current);  	__local_bh_disable((unsigned long)__builtin_return_address(0),  				SOFTIRQ_OFFSET); @@ -272,7 +272,7 @@ restart:  	lockdep_softirq_exit(); -	vtime_account(current); +	vtime_account_irq_exit(current);  	__local_bh_enable(SOFTIRQ_OFFSET);  	tsk_restore_flags(current, old_flags, PF_MEMALLOC);  } @@ -341,7 +341,7 @@ static inline void invoke_softirq(void)   */  void irq_exit(void)  { -	vtime_account(current); +	vtime_account_irq_exit(current);  	trace_hardirq_exit();  	sub_preempt_count(IRQ_EXIT_OFFSET);  	if (!in_interrupt() && local_softirq_pending()) diff --git a/kernel/srcu.c b/kernel/srcu.c index 2095be3318d..97c465ebd84 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -379,7 +379,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,  	rcu_batch_queue(&sp->batch_queue, head);  	if (!sp->running) {  		sp->running = true; -		queue_delayed_work(system_nrt_wq, &sp->work, 0); +		schedule_delayed_work(&sp->work, 0);  	}  	spin_unlock_irqrestore(&sp->queue_lock, flags);  } @@ -631,7 +631,7 @@ static void srcu_reschedule(struct srcu_struct *sp)  	}  	if (pending) -		queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); +		schedule_delayed_work(&sp->work, SRCU_INTERVAL);  }  /* diff --git a/kernel/sys.c b/kernel/sys.c index 241507f23ec..e6e0ece5f6a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -368,6 +368,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);  void kernel_restart(char *cmd)  {  	kernel_restart_prepare(cmd); +	
disable_nonboot_cpus();  	if (!cmd)  		printk(KERN_EMERG "Restarting system.\n");  	else @@ -1264,15 +1265,16 @@ DECLARE_RWSEM(uts_sem);   * Work around broken programs that cannot handle "Linux 3.0".   * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40   */ -static int override_release(char __user *release, int len) +static int override_release(char __user *release, size_t len)  {  	int ret = 0; -	char buf[65];  	if (current->personality & UNAME26) { -		char *rest = UTS_RELEASE; +		const char *rest = UTS_RELEASE; +		char buf[65] = { 0 };  		int ndots = 0;  		unsigned v; +		size_t copy;  		while (*rest) {  			if (*rest == '.' && ++ndots >= 3) @@ -1282,8 +1284,9 @@ static int override_release(char __user *release, int len)  			rest++;  		}  		v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; -		snprintf(buf, len, "2.6.%u%s", v, rest); -		ret = copy_to_user(release, buf, len); +		copy = clamp_t(size_t, len, 1, sizeof(buf)); +		copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); +		ret = copy_to_user(release, buf, copy + 1);  	}  	return ret;  } @@ -1788,15 +1791,15 @@ SYSCALL_DEFINE1(umask, int, mask)  #ifdef CONFIG_CHECKPOINT_RESTORE  static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)  { -	struct file *exe_file; +	struct fd exe;  	struct dentry *dentry;  	int err; -	exe_file = fget(fd); -	if (!exe_file) +	exe = fdget(fd); +	if (!exe.file)  		return -EBADF; -	dentry = exe_file->f_path.dentry; +	dentry = exe.file->f_path.dentry;  	/*  	 * Because the original mm->exe_file points to executable file, make @@ -1805,7 +1808,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)  	 */  	err = -EACCES;  	if (!S_ISREG(dentry->d_inode->i_mode)	|| -	    exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) +	    exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)  		goto exit;  	err = inode_permission(dentry->d_inode, MAY_EXEC); @@ -1839,12 +1842,12 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)  		goto exit_unlock;  	err = 0; -	set_mm_exe_file(mm, exe_file); +	set_mm_exe_file(mm, exe.file);	/* this grabs a reference to exe.file */  exit_unlock:  	up_write(&mm->mmap_sem);  exit: -	fput(exe_file); +	fdput(exe);  	return err;  } @@ -2204,7 +2207,7 @@ static int __orderly_poweroff(void)  		return -ENOMEM;  	} -	ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, +	ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,  				      NULL, argv_cleanup, NULL);  	if (ret == -ENOMEM)  		argv_free(argv); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2914d0f752c..b0fa5ad0987 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -97,10 +97,12 @@  extern int sysctl_overcommit_memory;  extern int sysctl_overcommit_ratio;  extern int max_threads; -extern int core_uses_pid;  extern int suid_dumpable; +#ifdef CONFIG_COREDUMP +extern int core_uses_pid;  extern char core_pattern[];  extern unsigned int core_pipe_limit; +#endif  extern int pid_max;  extern int min_free_kbytes;  extern int pid_max_min, pid_max_max; @@ -177,8 +179,10 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,  static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,  		void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_COREDUMP  static int proc_dostring_coredump(struct ctl_table *table, int write,  		void __user *buffer, size_t *lenp, loff_t *ppos); +#endif  #ifdef CONFIG_MAGIC_SYSRQ  /* Note: sysrq code uses it's own private copy */ @@ -402,6 +406,7 @@ static struct ctl_table kern_table[] = {  		
.mode		= 0644,  		.proc_handler	= proc_dointvec,  	}, +#ifdef CONFIG_COREDUMP  	{  		.procname	= "core_uses_pid",  		.data		= &core_uses_pid, @@ -423,6 +428,7 @@ static struct ctl_table kern_table[] = {  		.mode		= 0644,  		.proc_handler	= proc_dointvec,  	}, +#endif  #ifdef CONFIG_PROC_SYSCTL  	{  		.procname	= "tainted", @@ -1541,8 +1547,7 @@ static struct ctl_table fs_table[] = {  };  static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ -    defined(CONFIG_S390) || defined(CONFIG_TILE) +#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE  	{  		.procname	= "exception-trace",  		.data		= &show_unhandled_signals, @@ -2034,12 +2039,14 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,  static void validate_coredump_safety(void)  { +#ifdef CONFIG_COREDUMP  	if (suid_dumpable == SUID_DUMPABLE_SAFE &&  	    core_pattern[0] != '/' && core_pattern[0] != '|') {  		printk(KERN_WARNING "Unsafe core_pattern used with "\  			"suid_dumpable=2. Pipe handler or fully qualified "\  			"core dump path required.\n");  	} +#endif  }  static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, @@ -2051,6 +2058,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write,  	return error;  } +#ifdef CONFIG_COREDUMP  static int proc_dostring_coredump(struct ctl_table *table, int write,  		  void __user *buffer, size_t *lenp, loff_t *ppos)  { @@ -2059,6 +2067,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,  		validate_coredump_safety();  	return error;  } +#endif  static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,  				     void __user *buffer, diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d0a32796550..145bb4d3bd4 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -27,6 +27,7 @@  #include <linux/cgroup.h>  #include <linux/fs.h>  #include <linux/file.h> +#include <linux/pid_namespace.h>  #include <net/genetlink.h>  #include <linux/atomic.h> @@ -174,7 +175,9 @@ static void send_cpu_listeners(struct sk_buff *skb,  	up_write(&listeners->sem);  } -static void fill_stats(struct task_struct *tsk, struct taskstats *stats) +static void fill_stats(struct user_namespace *user_ns, +		       struct pid_namespace *pid_ns, +		       struct task_struct *tsk, struct taskstats *stats)  {  	memset(stats, 0, sizeof(*stats));  	/* @@ -190,7 +193,7 @@ static void fill_stats(struct task_struct *tsk, struct taskstats *stats)  	stats->version = TASKSTATS_VERSION;  	stats->nvcsw = tsk->nvcsw;  	stats->nivcsw = tsk->nivcsw; -	bacct_add_tsk(stats, tsk); +	bacct_add_tsk(user_ns, pid_ns, stats, tsk);  	/* fill in extended acct fields */  	xacct_add_tsk(stats, tsk); @@ -207,7 +210,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)  	rcu_read_unlock();  	if (!tsk)  		return -ESRCH; -	fill_stats(tsk, stats); +	fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats);  	put_task_struct(tsk);  	return 0;  } @@ -291,6 +294,12 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)  	if (!cpumask_subset(mask, cpu_possible_mask))  		return -EINVAL; +	if (current_user_ns() != &init_user_ns) +		return -EINVAL; + +	if (task_active_pid_ns(current) != &init_pid_ns) +		return -EINVAL; +  	if (isadd == REGISTER) {  		for_each_cpu(cpu, mask) {  			s = kmalloc_node(sizeof(struct listener), @@ -415,16 +424,15 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)  	struct nlattr *na;  	size_t 
size;  	u32 fd; -	struct file *file; -	int fput_needed; +	struct fd f;  	na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];  	if (!na)  		return -EINVAL;  	fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); -	file = fget_light(fd, &fput_needed); -	if (!file) +	f = fdget(fd); +	if (!f.file)  		return 0;  	size = nla_total_size(sizeof(struct cgroupstats)); @@ -437,6 +445,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)  	na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,  				sizeof(struct cgroupstats));  	if (na == NULL) { +		nlmsg_free(rep_skb);  		rc = -EMSGSIZE;  		goto err;  	} @@ -444,7 +453,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)  	stats = nla_data(na);  	memset(stats, 0, sizeof(*stats)); -	rc = cgroupstats_build(stats, file->f_dentry); +	rc = cgroupstats_build(stats, f.file->f_dentry);  	if (rc < 0) {  		nlmsg_free(rep_skb);  		goto err; @@ -453,7 +462,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)  	rc = send_reply(rep_skb, info);  err: -	fput_light(file, fput_needed); +	fdput(f);  	return rc;  } @@ -467,7 +476,7 @@ static int cmd_attr_register_cpumask(struct genl_info *info)  	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);  	if (rc < 0)  		goto out; -	rc = add_del_listener(info->snd_pid, mask, REGISTER); +	rc = add_del_listener(info->snd_portid, mask, REGISTER);  out:  	free_cpumask_var(mask);  	return rc; @@ -483,7 +492,7 @@ static int cmd_attr_deregister_cpumask(struct genl_info *info)  	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);  	if (rc < 0)  		goto out; -	rc = add_del_listener(info->snd_pid, mask, DEREGISTER); +	rc = add_del_listener(info->snd_portid, mask, DEREGISTER);  out:  	free_cpumask_var(mask);  	return rc; @@ -631,11 +640,12 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)  	if (rc < 0)  		return; -	stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); +	stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, +			 task_pid_nr_ns(tsk, &init_pid_ns));  	if (!stats)  		goto err; -	fill_stats(tsk, stats); +	fill_stats(&init_user_ns, &init_pid_ns, tsk, stats);  	/*  	 * Doesn't matter if tsk is the leader or the last group member leaving @@ -643,7 +653,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)  	if (!is_thread_group || !group_dead)  		goto send; -	stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); +	stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, +			 task_tgid_nr_ns(tsk, &init_pid_ns));  	if (!stats)  		goto err; diff --git a/kernel/time.c b/kernel/time.c index ba744cf8069..d226c6a3fd2 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -30,7 +30,7 @@  #include <linux/export.h>  #include <linux/timex.h>  #include <linux/capability.h> -#include <linux/clocksource.h> +#include <linux/timekeeper_internal.h>  #include <linux/errno.h>  #include <linux/syscalls.h>  #include <linux/security.h> diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index fd42bd452b7..8601f0db126 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -16,6 +16,10 @@ config ARCH_CLOCKSOURCE_DATA  config GENERIC_TIME_VSYSCALL  	bool +# Timekeeping vsyscall support +config GENERIC_TIME_VSYSCALL_OLD +	bool +  # ktime_t scalar 64bit nsec representation  config KTIME_SCALAR  	bool diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index aa27d391bfc..f11d83b1294 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -37,7 +37,6 @@  static struct alarm_base {  	spinlock_t		
lock;  	struct timerqueue_head	timerqueue; -	struct hrtimer		timer;  	ktime_t			(*gettime)(void);  	clockid_t		base_clockid;  } alarm_bases[ALARM_NUMTYPE]; @@ -46,6 +45,8 @@ static struct alarm_base {  static ktime_t freezer_delta;  static DEFINE_SPINLOCK(freezer_delta_lock); +static struct wakeup_source *ws; +  #ifdef CONFIG_RTC_CLASS  /* rtc timer and device for setting alarm wakeups at suspend */  static struct rtc_timer		rtctimer; @@ -130,50 +131,35 @@ static inline void alarmtimer_rtc_timer_init(void) { }   * @base: pointer to the base where the timer is being run   * @alarm: pointer to alarm being enqueued.   * - * Adds alarm to a alarm_base timerqueue and if necessary sets - * an hrtimer to run. + * Adds alarm to a alarm_base timerqueue   *   * Must hold base->lock when calling.   */  static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)  { +	if (alarm->state & ALARMTIMER_STATE_ENQUEUED) +		timerqueue_del(&base->timerqueue, &alarm->node); +  	timerqueue_add(&base->timerqueue, &alarm->node);  	alarm->state |= ALARMTIMER_STATE_ENQUEUED; - -	if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { -		hrtimer_try_to_cancel(&base->timer); -		hrtimer_start(&base->timer, alarm->node.expires, -				HRTIMER_MODE_ABS); -	}  }  /** - * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue + * alarmtimer_dequeue - Removes an alarm timer from an alarm_base timerqueue   * @base: pointer to the base where the timer is running   * @alarm: pointer to alarm being removed   * - * Removes alarm to a alarm_base timerqueue and if necessary sets - * a new timer to run. + * Removes alarm to a alarm_base timerqueue   *   * Must hold base->lock when calling.   */ -static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) +static void alarmtimer_dequeue(struct alarm_base *base, struct alarm *alarm)  { -	struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); -  	if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED))  		return;  	timerqueue_del(&base->timerqueue, &alarm->node);  	alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; - -	if (next == &alarm->node) { -		hrtimer_try_to_cancel(&base->timer); -		next = timerqueue_getnext(&base->timerqueue); -		if (!next) -			return; -		hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS); -	}  } @@ -188,42 +174,23 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)   */  static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)  { -	struct alarm_base *base = container_of(timer, struct alarm_base, timer); -	struct timerqueue_node *next; +	struct alarm *alarm = container_of(timer, struct alarm, timer); +	struct alarm_base *base = &alarm_bases[alarm->type];  	unsigned long flags; -	ktime_t now;  	int ret = HRTIMER_NORESTART;  	int restart = ALARMTIMER_NORESTART;  	spin_lock_irqsave(&base->lock, flags); -	now = base->gettime(); -	while ((next = timerqueue_getnext(&base->timerqueue))) { -		struct alarm *alarm; -		ktime_t expired = next->expires; - -		if (expired.tv64 > now.tv64) -			break; - -		alarm = container_of(next, struct alarm, node); - -		timerqueue_del(&base->timerqueue, &alarm->node); -		alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; - -		alarm->state |= ALARMTIMER_STATE_CALLBACK; -		spin_unlock_irqrestore(&base->lock, flags); -		if (alarm->function) -			restart = alarm->function(alarm, now); -		spin_lock_irqsave(&base->lock, flags); -		alarm->state &= ~ALARMTIMER_STATE_CALLBACK; +	alarmtimer_dequeue(base, alarm); +	spin_unlock_irqrestore(&base->lock, flags); 
-		if (restart != ALARMTIMER_NORESTART) { -			timerqueue_add(&base->timerqueue, &alarm->node); -			alarm->state |= ALARMTIMER_STATE_ENQUEUED; -		} -	} +	if (alarm->function) +		restart = alarm->function(alarm, base->gettime()); -	if (next) { -		hrtimer_set_expires(&base->timer, next->expires); +	spin_lock_irqsave(&base->lock, flags); +	if (restart != ALARMTIMER_NORESTART) { +		hrtimer_set_expires(&alarm->timer, alarm->node.expires); +		alarmtimer_enqueue(base, alarm);  		ret = HRTIMER_RESTART;  	}  	spin_unlock_irqrestore(&base->lock, flags); @@ -250,6 +217,7 @@ static int alarmtimer_suspend(struct device *dev)  	unsigned long flags;  	struct rtc_device *rtc;  	int i; +	int ret;  	spin_lock_irqsave(&freezer_delta_lock, flags);  	min = freezer_delta; @@ -279,8 +247,10 @@ static int alarmtimer_suspend(struct device *dev)  	if (min.tv64 == 0)  		return 0; -	/* XXX - Should we enforce a minimum sleep time? */ -	WARN_ON(min.tv64 < NSEC_PER_SEC); +	if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { +		__pm_wakeup_event(ws, 2 * MSEC_PER_SEC); +		return -EBUSY; +	}  	/* Setup an rtc timer to fire that far in the future */  	rtc_timer_cancel(rtc, &rtctimer); @@ -288,9 +258,11 @@ static int alarmtimer_suspend(struct device *dev)  	now = rtc_tm_to_ktime(tm);  	now = ktime_add(now, min); -	rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); - -	return 0; +	/* Set alarm, if in the past reject suspend briefly to handle */ +	ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); +	if (ret < 0) +		__pm_wakeup_event(ws, MSEC_PER_SEC); +	return ret;  }  #else  static int alarmtimer_suspend(struct device *dev) @@ -324,6 +296,9 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,  		enum alarmtimer_restart (*function)(struct alarm *, ktime_t))  {  	timerqueue_init(&alarm->node); +	hrtimer_init(&alarm->timer, alarm_bases[type].base_clockid, +			HRTIMER_MODE_ABS); +	alarm->timer.function = alarmtimer_fired;  	alarm->function = function;  	alarm->type = type;  	alarm->state = ALARMTIMER_STATE_INACTIVE; @@ -334,17 +309,19 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,   * @alarm: ptr to alarm to set   * @start: time to run the alarm   */ -void alarm_start(struct alarm *alarm, ktime_t start) +int alarm_start(struct alarm *alarm, ktime_t start)  {  	struct alarm_base *base = &alarm_bases[alarm->type];  	unsigned long flags; +	int ret;  	spin_lock_irqsave(&base->lock, flags); -	if (alarmtimer_active(alarm)) -		alarmtimer_remove(base, alarm);  	alarm->node.expires = start;  	alarmtimer_enqueue(base, alarm); +	ret = hrtimer_start(&alarm->timer, alarm->node.expires, +				HRTIMER_MODE_ABS);  	spin_unlock_irqrestore(&base->lock, flags); +	return ret;  }  /** @@ -358,18 +335,12 @@ int alarm_try_to_cancel(struct alarm *alarm)  {  	struct alarm_base *base = &alarm_bases[alarm->type];  	unsigned long flags; -	int ret = -1; -	spin_lock_irqsave(&base->lock, flags); - -	if (alarmtimer_callback_running(alarm)) -		goto out; +	int ret; -	if (alarmtimer_is_queued(alarm)) { -		alarmtimer_remove(base, alarm); -		ret = 1; -	} else -		ret = 0; -out: +	spin_lock_irqsave(&base->lock, flags); +	ret = hrtimer_try_to_cancel(&alarm->timer); +	if (ret >= 0) +		alarmtimer_dequeue(base, alarm);  	spin_unlock_irqrestore(&base->lock, flags);  	return ret;  } @@ -802,10 +773,6 @@ static int __init alarmtimer_init(void)  	for (i = 0; i < ALARM_NUMTYPE; i++) {  		timerqueue_init_head(&alarm_bases[i].timerqueue);  		spin_lock_init(&alarm_bases[i].lock); -		hrtimer_init(&alarm_bases[i].timer, -				
alarm_bases[i].base_clockid, -				HRTIMER_MODE_ABS); -		alarm_bases[i].timer.function = alarmtimer_fired;  	}  	error = alarmtimer_rtc_interface_setup(); @@ -821,6 +788,7 @@ static int __init alarmtimer_init(void)  		error = PTR_ERR(pdev);  		goto out_drv;  	} +	ws = wakeup_source_register("alarmtimer");  	return 0;  out_drv: diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 7e1ce012a85..30b6de0d977 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -397,6 +397,30 @@ void clockevents_exchange_device(struct clock_event_device *old,  	local_irq_restore(flags);  } +/** + * clockevents_suspend - suspend clock devices + */ +void clockevents_suspend(void) +{ +	struct clock_event_device *dev; + +	list_for_each_entry_reverse(dev, &clockevent_devices, list) +		if (dev->suspend) +			dev->suspend(dev); +} + +/** + * clockevents_resume - resume clock devices + */ +void clockevents_resume(void) +{ +	struct clock_event_device *dev; + +	list_for_each_entry(dev, &clockevent_devices, list) +		if (dev->resume) +			dev->resume(dev); +} +  #ifdef CONFIG_GENERIC_CLOCKEVENTS  /**   * clockevents_notify - notification about relevant events diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 46da0537c10..6629bf7b528 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -37,7 +37,7 @@   * requested HZ value. It is also not recommended   * for "tick-less" systems.   */ -#define NSEC_PER_JIFFY	((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) +#define NSEC_PER_JIFFY	((NSEC_PER_SEC+HZ/2)/HZ)  /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier   * conversion, the .shift value could be zero. However @@ -95,3 +95,33 @@ struct clocksource * __init __weak clocksource_default_clock(void)  {  	return &clocksource_jiffies;  } + +struct clocksource refined_jiffies; + +int register_refined_jiffies(long cycles_per_second) +{ +	u64 nsec_per_tick, shift_hz; +	long cycles_per_tick; + + + +	refined_jiffies = clocksource_jiffies; +	refined_jiffies.name = "refined-jiffies"; +	refined_jiffies.rating++; + +	/* Calc cycles per tick */ +	cycles_per_tick = (cycles_per_second + HZ/2)/HZ; +	/* shift_hz stores hz<<8 for extra accuracy */ +	shift_hz = (u64)cycles_per_second << 8; +	shift_hz += cycles_per_tick/2; +	do_div(shift_hz, cycles_per_tick); +	/* Calculate nsec_per_tick using shift_hz */ +	nsec_per_tick = (u64)NSEC_PER_SEC << 8; +	nsec_per_tick += (u32)shift_hz/2; +	do_div(nsec_per_tick, (u32)shift_hz); + +	refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; + +	clocksource_register(&refined_jiffies); +	return 0; +} diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d3b91e75cec..e424970bb56 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -8,6 +8,7 @@   *   */ +#include <linux/timekeeper_internal.h>  #include <linux/module.h>  #include <linux/interrupt.h>  #include <linux/percpu.h> @@ -21,61 +22,6 @@  #include <linux/tick.h>  #include <linux/stop_machine.h> -/* Structure holding internal timekeeping values. */ -struct timekeeper { -	/* Current clocksource used for timekeeping. */ -	struct clocksource	*clock; -	/* NTP adjusted clock multiplier */ -	u32			mult; -	/* The shift value of the current clocksource. */ -	u32			shift; -	/* Number of clock cycles in one NTP interval. */ -	cycle_t			cycle_interval; -	/* Number of clock shifted nano seconds in one NTP interval. 
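
register_refined_jiffies() in the jiffies.c hunk above folds a measured tick rate into the clocksource: it derives cycles_per_tick, an hz<<8 value for extra precision, the true nanoseconds per tick, and finally mult = nsec_per_tick << JIFFIES_SHIFT. The userspace re-run below uses assumed inputs (HZ = 1000, a 1193182 Hz reference, JIFFIES_SHIFT = 8); they are for illustration only:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const long HZ = 1000, JIFFIES_SHIFT = 8;	/* assumed */
	long cycles_per_second = 1193182;		/* assumed reference */
	long cycles_per_tick = (cycles_per_second + HZ / 2) / HZ;
	uint64_t shift_hz, nsec_per_tick;

	shift_hz = (uint64_t)cycles_per_second << 8;	/* hz << 8 */
	shift_hz += cycles_per_tick / 2;
	shift_hz /= cycles_per_tick;

	nsec_per_tick = (uint64_t)1000000000ULL << 8;
	nsec_per_tick += (uint32_t)shift_hz / 2;
	nsec_per_tick /= (uint32_t)shift_hz;

	printf("cycles/tick=%ld refined hz~%.2f nsec/tick=%llu mult=%llu\n",
	       cycles_per_tick, (double)shift_hz / 256.0,
	       (unsigned long long)nsec_per_tick,
	       (unsigned long long)((uint32_t)nsec_per_tick << JIFFIES_SHIFT));
	return 0;
}
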
*/ -	u64			xtime_interval; -	/* shifted nano seconds left over when rounding cycle_interval */ -	s64			xtime_remainder; -	/* Raw nano seconds accumulated per NTP interval. */ -	u32			raw_interval; - -	/* Current CLOCK_REALTIME time in seconds */ -	u64			xtime_sec; -	/* Clock shifted nano seconds */ -	u64			xtime_nsec; - -	/* Difference between accumulated time and NTP time in ntp -	 * shifted nano seconds. */ -	s64			ntp_error; -	/* Shift conversion between clock shifted nano seconds and -	 * ntp shifted nano seconds. */ -	u32			ntp_error_shift; - -	/* -	 * wall_to_monotonic is what we need to add to xtime (or xtime corrected -	 * for sub jiffie times) to get to monotonic time.  Monotonic is pegged -	 * at zero at system boot time, so wall_to_monotonic will be negative, -	 * however, we will ALWAYS keep the tv_nsec part positive so we can use -	 * the usual normalization. -	 * -	 * wall_to_monotonic is moved after resume from suspend for the -	 * monotonic time not to jump. We need to add total_sleep_time to -	 * wall_to_monotonic to get the real boot based time offset. -	 * -	 * - wall_to_monotonic is no longer the boot time, getboottime must be -	 * used instead. -	 */ -	struct timespec		wall_to_monotonic; -	/* Offset clock monotonic -> clock realtime */ -	ktime_t			offs_real; -	/* time spent in suspend */ -	struct timespec		total_sleep_time; -	/* Offset clock monotonic -> clock boottime */ -	ktime_t			offs_boot; -	/* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ -	struct timespec		raw_time; -	/* Seqlock for all timekeeper values */ -	seqlock_t		lock; -};  static struct timekeeper timekeeper; @@ -96,15 +42,6 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)  	}  } -static struct timespec tk_xtime(struct timekeeper *tk) -{ -	struct timespec ts; - -	ts.tv_sec = tk->xtime_sec; -	ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift); -	return ts; -} -  static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)  {  	tk->xtime_sec = ts->tv_sec; @@ -246,14 +183,11 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)  /* must hold write on timekeeper.lock */  static void timekeeping_update(struct timekeeper *tk, bool clearntp)  { -	struct timespec xt; -  	if (clearntp) {  		tk->ntp_error = 0;  		ntp_clear();  	} -	xt = tk_xtime(tk); -	update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult); +	update_vsyscall(tk);  }  /** @@ -776,6 +710,7 @@ static void timekeeping_resume(void)  	read_persistent_clock(&ts); +	clockevents_resume();  	clocksource_resume();  	write_seqlock_irqsave(&tk->lock, flags); @@ -835,6 +770,7 @@ static int timekeeping_suspend(void)  	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);  	clocksource_suspend(); +	clockevents_suspend();  	return 0;  } @@ -1111,7 +1047,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,  	accumulate_nsecs_to_secs(tk);  	/* Accumulate raw time */ -	raw_nsecs = tk->raw_interval << shift; +	raw_nsecs = (u64)tk->raw_interval << shift;  	raw_nsecs += tk->raw_time.tv_nsec;  	if (raw_nsecs >= NSEC_PER_SEC) {  		u64 raw_secs = raw_nsecs; @@ -1128,6 +1064,33 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,  	return offset;  } +#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD +static inline void old_vsyscall_fixup(struct timekeeper *tk) +{ +	s64 remainder; + +	/* +	* Store only full nanoseconds into xtime_nsec after rounding +	* it up and add the remainder to the error difference. 
+	* XXX - This is necessary to avoid small 1ns inconsistnecies caused +	* by truncating the remainder in vsyscalls. However, it causes +	* additional work to be done in timekeeping_adjust(). Once +	* the vsyscall implementations are converted to use xtime_nsec +	* (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD +	* users are removed, this can be killed. +	*/ +	remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); +	tk->xtime_nsec -= remainder; +	tk->xtime_nsec += 1ULL << tk->shift; +	tk->ntp_error += remainder << tk->ntp_error_shift; + +} +#else +#define old_vsyscall_fixup(tk) +#endif + + +  /**   * update_wall_time - Uses the current clocksource to increment the wall time   * @@ -1139,7 +1102,6 @@ static void update_wall_time(void)  	cycle_t offset;  	int shift = 0, maxshift;  	unsigned long flags; -	s64 remainder;  	write_seqlock_irqsave(&tk->lock, flags); @@ -1181,20 +1143,11 @@ static void update_wall_time(void)  	/* correct the clock when NTP error is too big */  	timekeeping_adjust(tk, offset); -  	/* -	* Store only full nanoseconds into xtime_nsec after rounding -	* it up and add the remainder to the error difference. -	* XXX - This is necessary to avoid small 1ns inconsistnecies caused -	* by truncating the remainder in vsyscalls. However, it causes -	* additional work to be done in timekeeping_adjust(). Once -	* the vsyscall implementations are converted to use xtime_nsec -	* (shifted nanoseconds), this can be killed. -	*/ -	remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); -	tk->xtime_nsec -= remainder; -	tk->xtime_nsec += 1ULL << tk->shift; -	tk->ntp_error += remainder << tk->ntp_error_shift; +	 * XXX This can be killed once everyone converts +	 * to the new update_vsyscall. +	 */ +	old_vsyscall_fixup(tk);  	/*  	 * Finally, make sure that after the rounding diff --git a/kernel/timer.c b/kernel/timer.c index d5de1b2292a..367d0085848 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64);  #define TVR_SIZE (1 << TVR_BITS)  #define TVN_MASK (TVN_SIZE - 1)  #define TVR_MASK (TVR_SIZE - 1) +#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))  struct tvec {  	struct list_head vec[TVN_SIZE]; @@ -359,11 +360,12 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)  		vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);  	} else {  		int i; -		/* If the timeout is larger than 0xffffffff on 64-bit -		 * architectures then we use the maximum timeout: +		/* If the timeout is larger than MAX_TVAL (on 64-bit +		 * architectures or with CONFIG_BASE_SMALL=1) then we +		 * use the maximum timeout.  		 
*/ -		if (idx > 0xffffffffUL) { -			idx = 0xffffffffUL; +		if (idx > MAX_TVAL) { +			idx = MAX_TVAL;  			expires = idx + base->timer_jiffies;  		}  		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b32ed0e385a..b979426d16c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1567,6 +1567,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,  		put_online_cpus();  	} else { +		/* Make sure this CPU has been intitialized */ +		if (!cpumask_test_cpu(cpu_id, buffer->cpumask)) +			goto out; +  		cpu_buffer = buffer->buffers[cpu_id];  		if (nr_pages == cpu_buffer->nr_pages) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1ec5c1dab62..31e4f55773f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2061,7 +2061,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)  	seq_puts(m, "#    -----------------\n");  	seq_printf(m, "#    | task: %.16s-%d "  		   "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", -		   data->comm, data->pid, data->uid, data->nice, +		   data->comm, data->pid, +		   from_kuid_munged(seq_user_ns(m), data->uid), data->nice,  		   data->policy, data->rt_priority);  	seq_puts(m, "#    -----------------\n"); @@ -4199,12 +4200,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,  	buf->private = 0;  } -static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, -				 struct pipe_buffer *buf) -{ -	return 1; -} -  static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,  				struct pipe_buffer *buf)  { @@ -4220,7 +4215,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = {  	.unmap			= generic_pipe_buf_unmap,  	.confirm		= generic_pipe_buf_confirm,  	.release		= buffer_pipe_buf_release, -	.steal			= buffer_pipe_buf_steal, +	.steal			= generic_pipe_buf_steal,  	.get			= buffer_pipe_buf_get,  }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 63a2da0b9a6..c15f528c1af 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -147,7 +147,7 @@ struct trace_array_cpu {  	unsigned long		skipped_entries;  	cycle_t			preempt_timestamp;  	pid_t			pid; -	uid_t			uid; +	kuid_t			uid;  	char			comm[TASK_COMM_LEN];  }; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 483162a9f90..507a7a9630b 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -13,7 +13,6 @@  #include <linux/debugfs.h>  #include <linux/uaccess.h>  #include <linux/ftrace.h> -#include <linux/pstore.h>  #include <linux/fs.h>  #include "trace.h" @@ -76,10 +75,9 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,  	preempt_enable_notrace();  } -/* Our two options */ +/* Our option */  enum {  	TRACE_FUNC_OPT_STACK	= 0x1, -	TRACE_FUNC_OPT_PSTORE	= 0x2,  };  static struct tracer_flags func_flags; @@ -109,12 +107,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,  	disabled = atomic_inc_return(&data->disabled);  	if (likely(disabled == 1)) { -		/* -		 * So far tracing doesn't support multiple buffers, so -		 * we make an explicit call for now. 
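
The MAX_TVAL clamp above replaces the hard-coded 0xffffffffUL so that CONFIG_BASE_SMALL builds no longer index past the timer wheel. With the TVR_BITS/TVN_BITS values normally used in kernel/timer.c (8/6, or 6/4 for CONFIG_BASE_SMALL), the limit works out as in this small userspace check:

#include <stdio.h>

static unsigned long long max_tval(int tvr_bits, int tvn_bits)
{
	return (1ULL << (tvr_bits + 4 * tvn_bits)) - 1;
}

int main(void)
{
	printf("default:           %#llx\n", max_tval(8, 6)); /* 0xffffffff */
	printf("CONFIG_BASE_SMALL: %#llx\n", max_tval(6, 4)); /* 0x3fffff   */
	return 0;
}
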
-		 */ -		if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE)) -			pstore_ftrace_call(ip, parent_ip);  		pc = preempt_count();  		trace_function(tr, ip, parent_ip, flags, pc);  	} @@ -181,9 +173,6 @@ static struct tracer_opt func_opts[] = {  #ifdef CONFIG_STACKTRACE  	{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },  #endif -#ifdef CONFIG_PSTORE_FTRACE -	{ TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) }, -#endif  	{ } /* Always set a last empty entry */  }; @@ -236,8 +225,6 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)  		}  		break; -	case TRACE_FUNC_OPT_PSTORE: -		break;  	default:  		return -EINVAL;  	} diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 23b4d784ebd..625df0b4469 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -26,7 +26,9 @@  /*   * fill in basic accounting fields   */ -void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) +void bacct_add_tsk(struct user_namespace *user_ns, +		   struct pid_namespace *pid_ns, +		   struct taskstats *stats, struct task_struct *tsk)  {  	const struct cred *tcred;  	struct timespec uptime, ts; @@ -55,13 +57,13 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)  		stats->ac_flag |= AXSIG;  	stats->ac_nice	 = task_nice(tsk);  	stats->ac_sched	 = tsk->policy; -	stats->ac_pid	 = tsk->pid; +	stats->ac_pid	 = task_pid_nr_ns(tsk, pid_ns);  	rcu_read_lock();  	tcred = __task_cred(tsk); -	stats->ac_uid	 = tcred->uid; -	stats->ac_gid	 = tcred->gid; +	stats->ac_uid	 = from_kuid_munged(user_ns, tcred->uid); +	stats->ac_gid	 = from_kgid_munged(user_ns, tcred->gid);  	stats->ac_ppid	 = pid_alive(tsk) ? -				rcu_dereference(tsk->real_parent)->tgid : 0; +		task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0;  	rcu_read_unlock();  	stats->ac_utime = cputime_to_usecs(tsk->utime);  	stats->ac_stime = cputime_to_usecs(tsk->stime); diff --git a/kernel/user.c b/kernel/user.c index b815fefbe76..750acffbe9e 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -38,6 +38,14 @@ struct user_namespace init_user_ns = {  			.count = 4294967295U,  		},  	}, +	.projid_map = { +		.nr_extents = 1, +		.extent[0] = { +			.first = 0, +			.lower_first = 0, +			.count = 4294967295U, +		}, +	},  	.kref = {  		.refcount	= ATOMIC_INIT(3),  	}, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 86602316422..456a6b9fba3 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -19,6 +19,7 @@  #include <linux/fs.h>  #include <linux/uaccess.h>  #include <linux/ctype.h> +#include <linux/projid.h>  static struct kmem_cache *user_ns_cachep __read_mostly; @@ -295,6 +296,75 @@ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)  }  EXPORT_SYMBOL(from_kgid_munged); +/** + *	make_kprojid - Map a user-namespace projid pair into a kprojid. + *	@ns:  User namespace that the projid is in + *	@projid: Project identifier + * + *	Maps a user-namespace uid pair into a kernel internal kuid, + *	and returns that kuid. + * + *	When there is no mapping defined for the user-namespace projid + *	pair INVALID_PROJID is returned.  Callers are expected to test + *	for and handle handle INVALID_PROJID being returned.  INVALID_PROJID + *	may be tested for using projid_valid(). + */ +kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) +{ +	/* Map the uid to a global kernel uid */ +	return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); +} +EXPORT_SYMBOL(make_kprojid); + +/** + *	from_kprojid - Create a projid from a kprojid user-namespace pair. 
+ *	@targ: The user namespace we want a projid in. + *	@kprojid: The kernel internal project identifier to start with. + * + *	Map @kprojid into the user-namespace specified by @targ and + *	return the resulting projid. + * + *	There is always a mapping into the initial user_namespace. + * + *	If @kprojid has no mapping in @targ (projid_t)-1 is returned. + */ +projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) +{ +	/* Map the uid from a global kernel uid */ +	return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); +} +EXPORT_SYMBOL(from_kprojid); + +/** + *	from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. + *	@targ: The user namespace we want a projid in. + *	@kprojid: The kernel internal projid to start with. + * + *	Map @kprojid into the user-namespace specified by @targ and + *	return the resulting projid. + * + *	There is always a mapping into the initial user_namespace. + * + *	Unlike from_kprojid from_kprojid_munged never fails and always + *	returns a valid projid.  This makes from_kprojid_munged + *	appropriate for use in syscalls like stat and where + *	failing the system call and failing to provide a valid projid are + *	not an options. + * + *	If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. + */ +projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) +{ +	projid_t projid; +	projid = from_kprojid(targ, kprojid); + +	if (projid == (projid_t) -1) +		projid = OVERFLOW_PROJID; +	return projid; +} +EXPORT_SYMBOL(from_kprojid_munged); + +  static int uid_m_show(struct seq_file *seq, void *v)  {  	struct user_namespace *ns = seq->private; @@ -337,6 +407,27 @@ static int gid_m_show(struct seq_file *seq, void *v)  	return 0;  } +static int projid_m_show(struct seq_file *seq, void *v) +{ +	struct user_namespace *ns = seq->private; +	struct uid_gid_extent *extent = v; +	struct user_namespace *lower_ns; +	projid_t lower; + +	lower_ns = seq_user_ns(seq); +	if ((lower_ns == ns) && lower_ns->parent) +		lower_ns = lower_ns->parent; + +	lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); + +	seq_printf(seq, "%10u %10u %10u\n", +		extent->first, +		lower, +		extent->count); + +	return 0; +} +  static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map)  {  	struct uid_gid_extent *extent = NULL; @@ -362,6 +453,13 @@ static void *gid_m_start(struct seq_file *seq, loff_t *ppos)  	return m_start(seq, ppos, &ns->gid_map);  } +static void *projid_m_start(struct seq_file *seq, loff_t *ppos) +{ +	struct user_namespace *ns = seq->private; + +	return m_start(seq, ppos, &ns->projid_map); +} +  static void *m_next(struct seq_file *seq, void *v, loff_t *pos)  {  	(*pos)++; @@ -387,6 +485,13 @@ struct seq_operations proc_gid_seq_operations = {  	.show = gid_m_show,  }; +struct seq_operations proc_projid_seq_operations = { +	.start = projid_m_start, +	.stop = m_stop, +	.next = m_next, +	.show = projid_m_show, +}; +  static DEFINE_MUTEX(id_map_mutex);  static ssize_t map_write(struct file *file, const char __user *buf, @@ -434,7 +539,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,  	/* Require the appropriate privilege CAP_SETUID or CAP_SETGID  	 * over the user namespace in order to set the id mapping.  	 
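
make_kprojid(), from_kprojid() and from_kprojid_munged() above mirror the existing kuid/kgid helpers. A rough sketch of how a filesystem quota path might use them; the foo_inode structure and its field are hypothetical, the conversion calls are the ones added above.

#include <linux/projid.h>
#include <linux/user_namespace.h>
#include <linux/cred.h>

struct foo_inode {
	kprojid_t i_projid;	/* hypothetical per-inode project id */
};

/* Accept a project id from userspace and store it in kernel form. */
static int foo_set_project(struct foo_inode *fi, projid_t projid)
{
	kprojid_t kprojid = make_kprojid(current_user_ns(), projid);

	if (!projid_valid(kprojid))	/* no mapping in this namespace */
		return -EINVAL;

	fi->i_projid = kprojid;
	return 0;
}

/* Report the project id back without failing (cf. from_kprojid_munged). */
static projid_t foo_get_project(const struct foo_inode *fi)
{
	return from_kprojid_munged(current_user_ns(), fi->i_projid);
}
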
*/ -	if (!ns_capable(ns, cap_setid)) +	if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid))  		goto out;  	/* Get a buffer */ @@ -584,9 +689,30 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz  			 &ns->gid_map, &ns->parent->gid_map);  } +ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) +{ +	struct seq_file *seq = file->private_data; +	struct user_namespace *ns = seq->private; +	struct user_namespace *seq_ns = seq_user_ns(seq); + +	if (!ns->parent) +		return -EPERM; + +	if ((seq_ns != ns) && (seq_ns != ns->parent)) +		return -EPERM; + +	/* Anyone can set any valid project id no capability needed */ +	return map_write(file, buf, size, ppos, -1, +			 &ns->projid_map, &ns->parent->projid_map); +} +  static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,  				struct uid_gid_map *new_map)  { +	/* Allow anyone to set a mapping that doesn't require privilege */ +	if (!cap_valid(cap_setid)) +		return true; +  	/* Allow the specified ids if we have the appropriate capability  	 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.  	 */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3c5a79e2134..042d221d33c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -58,7 +58,7 @@ enum {  	 * be executing on any CPU.  The gcwq behaves as an unbound one.  	 *  	 * Note that DISASSOCIATED can be flipped only while holding -	 * managership of all pools on the gcwq to avoid changing binding +	 * assoc_mutex of all pools on the gcwq to avoid changing binding  	 * state while create_worker() is in progress.  	 */  	GCWQ_DISASSOCIATED	= 1 << 0,	/* cpu can't serve workers */ @@ -73,11 +73,10 @@ enum {  	WORKER_DIE		= 1 << 1,	/* die die die */  	WORKER_IDLE		= 1 << 2,	/* is idle */  	WORKER_PREP		= 1 << 3,	/* preparing to run works */ -	WORKER_REBIND		= 1 << 5,	/* mom is home, come back */  	WORKER_CPU_INTENSIVE	= 1 << 6,	/* cpu intensive */  	WORKER_UNBOUND		= 1 << 7,	/* worker is unbound */ -	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | +	WORKER_NOT_RUNNING	= WORKER_PREP | WORKER_UNBOUND |  				  WORKER_CPU_INTENSIVE,  	NR_WORKER_POOLS		= 2,		/* # worker pools per gcwq */ @@ -126,7 +125,6 @@ enum {  struct global_cwq;  struct worker_pool; -struct idle_rebind;  /*   * The poor guys doing the actual heavy lifting.  
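
proc_projid_map_write() above reuses map_write() with cap_setid == -1, so with the new_idmap_permitted() change no capability is needed to install a valid mapping. The map takes the same three-field lines that projid_m_show() prints: "ID-inside-ns ID-outside-ns length". A rough sketch mapping project ids 0-65535 in a child namespace onto 100000-165535 outside; the /proc path is assumed by analogy with uid_map/gid_map and the id range is made up.

#include <stdio.h>

int main(void)
{
	/* path assumed by analogy with /proc/<pid>/uid_map */
	FILE *f = fopen("/proc/self/projid_map", "w");

	if (!f) {
		perror("projid_map");
		return 1;
	}
	/* first  lower_first  count */
	fprintf(f, "0 100000 65536\n");
	fclose(f);
	return 0;
}
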
All on-duty workers @@ -150,7 +148,6 @@ struct worker {  	int			id;		/* I: worker id */  	/* for rebinding worker to CPU */ -	struct idle_rebind	*idle_rebind;	/* L: for idle worker */  	struct work_struct	rebind_work;	/* L: for busy worker */  }; @@ -160,13 +157,15 @@ struct worker_pool {  	struct list_head	worklist;	/* L: list of pending works */  	int			nr_workers;	/* L: total number of workers */ + +	/* nr_idle includes the ones off idle_list for rebinding */  	int			nr_idle;	/* L: currently idle ones */  	struct list_head	idle_list;	/* X: list of idle workers */  	struct timer_list	idle_timer;	/* L: worker idle timeout */  	struct timer_list	mayday_timer;	/* L: SOS timer for workers */ -	struct mutex		manager_mutex;	/* mutex manager should hold */ +	struct mutex		assoc_mutex;	/* protect GCWQ_DISASSOCIATED */  	struct ida		worker_ida;	/* L: for worker IDs */  }; @@ -184,9 +183,8 @@ struct global_cwq {  	struct hlist_head	busy_hash[BUSY_WORKER_HASH_SIZE];  						/* L: hash of busy workers */ -	struct worker_pool	pools[2];	/* normal and highpri pools */ - -	wait_queue_head_t	rebind_hold;	/* rebind hold wait */ +	struct worker_pool	pools[NR_WORKER_POOLS]; +						/* normal and highpri pools */  } ____cacheline_aligned_in_smp;  /* @@ -269,17 +267,15 @@ struct workqueue_struct {  };  struct workqueue_struct *system_wq __read_mostly; -struct workqueue_struct *system_long_wq __read_mostly; -struct workqueue_struct *system_nrt_wq __read_mostly; -struct workqueue_struct *system_unbound_wq __read_mostly; -struct workqueue_struct *system_freezable_wq __read_mostly; -struct workqueue_struct *system_nrt_freezable_wq __read_mostly;  EXPORT_SYMBOL_GPL(system_wq); +struct workqueue_struct *system_highpri_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_highpri_wq); +struct workqueue_struct *system_long_wq __read_mostly;  EXPORT_SYMBOL_GPL(system_long_wq); -EXPORT_SYMBOL_GPL(system_nrt_wq); +struct workqueue_struct *system_unbound_wq __read_mostly;  EXPORT_SYMBOL_GPL(system_unbound_wq); +struct workqueue_struct *system_freezable_wq __read_mostly;  EXPORT_SYMBOL_GPL(system_freezable_wq); -EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);  #define CREATE_TRACE_POINTS  #include <trace/events/workqueue.h> @@ -534,18 +530,24 @@ static int work_next_color(int color)  }  /* - * A work's data points to the cwq with WORK_STRUCT_CWQ set while the - * work is on queue.  Once execution starts, WORK_STRUCT_CWQ is - * cleared and the work data contains the cpu number it was last on. + * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data + * contain the pointer to the queued cwq.  Once execution starts, the flag + * is cleared and the high bits contain OFFQ flags and CPU number.   * - * set_work_{cwq|cpu}() and clear_work_data() can be used to set the - * cwq, cpu or clear work->data.  These functions should only be - * called while the work is owned - ie. while the PENDING bit is set. + * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() + * and clear_work_data() can be used to set the cwq, cpu or clear + * work->data.  These functions should only be called while the work is + * owned - ie. while the PENDING bit is set.   * - * get_work_[g]cwq() can be used to obtain the gcwq or cwq - * corresponding to a work.  gcwq is available once the work has been - * queued anywhere after initialization.  cwq is available only from - * queueing until execution starts. + * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to + * a work.  
gcwq is available once the work has been queued anywhere after + * initialization until it is sync canceled.  cwq is available only while + * the work item is queued. + * + * %WORK_OFFQ_CANCELING is used to mark a work item which is being + * canceled.  While being canceled, a work item may have its PENDING set + * but stay off timer and worklist for arbitrarily long and nobody should + * try to steal the PENDING bit.   */  static inline void set_work_data(struct work_struct *work, unsigned long data,  				 unsigned long flags) @@ -562,13 +564,22 @@ static void set_work_cwq(struct work_struct *work,  		      WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags);  } -static void set_work_cpu(struct work_struct *work, unsigned int cpu) +static void set_work_cpu_and_clear_pending(struct work_struct *work, +					   unsigned int cpu)  { -	set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); +	/* +	 * The following wmb is paired with the implied mb in +	 * test_and_set_bit(PENDING) and ensures all updates to @work made +	 * here are visible to and precede any updates by the next PENDING +	 * owner. +	 */ +	smp_wmb(); +	set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0);  }  static void clear_work_data(struct work_struct *work)  { +	smp_wmb();	/* see set_work_cpu_and_clear_pending() */  	set_work_data(work, WORK_STRUCT_NO_CPU, 0);  } @@ -591,7 +602,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)  		return ((struct cpu_workqueue_struct *)  			(data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; -	cpu = data >> WORK_STRUCT_FLAG_BITS; +	cpu = data >> WORK_OFFQ_CPU_SHIFT;  	if (cpu == WORK_CPU_NONE)  		return NULL; @@ -599,6 +610,22 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work)  	return get_gcwq(cpu);  } +static void mark_work_canceling(struct work_struct *work) +{ +	struct global_cwq *gcwq = get_work_gcwq(work); +	unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE; + +	set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, +		      WORK_STRUCT_PENDING); +} + +static bool work_is_canceling(struct work_struct *work) +{ +	unsigned long data = atomic_long_read(&work->data); + +	return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); +} +  /*   * Policy functions.  These define the policies on how the global worker   * pools are managed.  Unless noted otherwise, these functions assume that @@ -657,6 +684,13 @@ static bool too_many_workers(struct worker_pool *pool)  	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */  	int nr_busy = pool->nr_workers - nr_idle; +	/* +	 * nr_idle and idle_list may disagree if idle rebinding is in +	 * progress.  Never return %true if idle_list is empty. +	 */ +	if (list_empty(&pool->idle_list)) +		return false; +  	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;  } @@ -903,6 +937,206 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq,  }  /** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out paramter for nested worklist walking + * + * Schedule linked works starting from @work to @head.  Work series to + * be scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. + * + * If @nextp is not NULL, it's updated to point to the next work of + * the last scheduled work.  
This allows move_linked_works() to be + * nested inside outer list_for_each_entry_safe(). + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, +			      struct work_struct **nextp) +{ +	struct work_struct *n; + +	/* +	 * Linked worklist will always end before the end of the list, +	 * use NULL for list head. +	 */ +	list_for_each_entry_safe_from(work, n, NULL, entry) { +		list_move_tail(&work->entry, head); +		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) +			break; +	} + +	/* +	 * If we're already inside safe list traversal and have moved +	 * multiple works to the scheduled queue, the next position +	 * needs to be updated. +	 */ +	if (nextp) +		*nextp = n; +} + +static void cwq_activate_delayed_work(struct work_struct *work) +{ +	struct cpu_workqueue_struct *cwq = get_work_cwq(work); + +	trace_workqueue_activate_work(work); +	move_linked_works(work, &cwq->pool->worklist, NULL); +	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); +	cwq->nr_active++; +} + +static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +{ +	struct work_struct *work = list_first_entry(&cwq->delayed_works, +						    struct work_struct, entry); + +	cwq_activate_delayed_work(work); +} + +/** + * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight + * @cwq: cwq of interest + * @color: color of work which left the queue + * + * A work either has completed or is removed from pending queue, + * decrement nr_in_flight of its cwq and handle workqueue flushing. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) +{ +	/* ignore uncolored works */ +	if (color == WORK_NO_COLOR) +		return; + +	cwq->nr_in_flight[color]--; + +	cwq->nr_active--; +	if (!list_empty(&cwq->delayed_works)) { +		/* one down, submit a delayed one */ +		if (cwq->nr_active < cwq->max_active) +			cwq_activate_first_delayed(cwq); +	} + +	/* is flush in progress and are we at the flushing tip? */ +	if (likely(cwq->flush_color != color)) +		return; + +	/* are there still in-flight works? */ +	if (cwq->nr_in_flight[color]) +		return; + +	/* this cwq is done, clear flush_color */ +	cwq->flush_color = -1; + +	/* +	 * If this was the last cwq, wake up the first flusher.  It +	 * will handle the rest. +	 */ +	if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) +		complete(&cwq->wq->first_flusher->done); +} + +/** + * try_to_grab_pending - steal work item from worklist and disable irq + * @work: work item to steal + * @is_dwork: @work is a delayed_work + * @flags: place to store irq state + * + * Try to grab PENDING bit of @work.  This function can handle @work in any + * stable state - idle, on timer or on worklist.  Return values are + * + *  1		if @work was pending and we successfully stole PENDING + *  0		if @work was idle and we claimed PENDING + *  -EAGAIN	if PENDING couldn't be grabbed at the moment, safe to busy-retry + *  -ENOENT	if someone else is canceling @work, this state may persist + *		for arbitrarily long + * + * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting + * interrupted while holding PENDING and @work off queue, irq must be + * disabled on entry.  This, combined with delayed_work->timer being + * irqsafe, ensures that we return -EAGAIN for finite short period of time. + * + * On successful return, >= 0, irq is disabled and the caller is + * responsible for releasing it using local_irq_restore(*@flags). 
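
try_to_grab_pending() is internal to workqueue.c; its return-value contract is easiest to read as the retry loop that its new callers (mod_delayed_work_on() and cancel_delayed_work(), later in this patch) are built around. The fragment below only restates that calling shape for illustration.

	unsigned long flags;
	int ret;

	do {
		/* -EAGAIN: queueing/dequeueing is in flight, busy-retry */
		ret = try_to_grab_pending(work, is_dwork, &flags);
	} while (unlikely(ret == -EAGAIN));

	if (ret < 0)
		return false;	/* -ENOENT: someone else is canceling @work */

	/*
	 * ret is 1 (stolen while pending) or 0 (was idle): we now own
	 * PENDING with IRQs off; requeue or mark canceled, then
	 * local_irq_restore(flags).
	 */
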
+ * + * This function is safe to call from any context including IRQ handler. + */ +static int try_to_grab_pending(struct work_struct *work, bool is_dwork, +			       unsigned long *flags) +{ +	struct global_cwq *gcwq; + +	local_irq_save(*flags); + +	/* try to steal the timer if it exists */ +	if (is_dwork) { +		struct delayed_work *dwork = to_delayed_work(work); + +		/* +		 * dwork->timer is irqsafe.  If del_timer() fails, it's +		 * guaranteed that the timer is not queued anywhere and not +		 * running on the local CPU. +		 */ +		if (likely(del_timer(&dwork->timer))) +			return 1; +	} + +	/* try to claim PENDING the normal way */ +	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) +		return 0; + +	/* +	 * The queueing is in progress, or it is already queued. Try to +	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. +	 */ +	gcwq = get_work_gcwq(work); +	if (!gcwq) +		goto fail; + +	spin_lock(&gcwq->lock); +	if (!list_empty(&work->entry)) { +		/* +		 * This work is queued, but perhaps we locked the wrong gcwq. +		 * In that case we must see the new value after rmb(), see +		 * insert_work()->wmb(). +		 */ +		smp_rmb(); +		if (gcwq == get_work_gcwq(work)) { +			debug_work_deactivate(work); + +			/* +			 * A delayed work item cannot be grabbed directly +			 * because it might have linked NO_COLOR work items +			 * which, if left on the delayed_list, will confuse +			 * cwq->nr_active management later on and cause +			 * stall.  Make sure the work item is activated +			 * before grabbing. +			 */ +			if (*work_data_bits(work) & WORK_STRUCT_DELAYED) +				cwq_activate_delayed_work(work); + +			list_del_init(&work->entry); +			cwq_dec_nr_in_flight(get_work_cwq(work), +				get_work_color(work)); + +			spin_unlock(&gcwq->lock); +			return 1; +		} +	} +	spin_unlock(&gcwq->lock); +fail: +	local_irq_restore(*flags); +	if (work_is_canceling(work)) +		return -ENOENT; +	cpu_relax(); +	return -EAGAIN; +} + +/**   * insert_work - insert a work into gcwq   * @cwq: cwq @work belongs to   * @work: work to insert @@ -982,7 +1216,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,  	struct cpu_workqueue_struct *cwq;  	struct list_head *worklist;  	unsigned int work_flags; -	unsigned long flags; +	unsigned int req_cpu = cpu; + +	/* +	 * While a work item is PENDING && off queue, a task trying to +	 * steal the PENDING will busy-loop waiting for it to either get +	 * queued or lose PENDING.  Grabbing PENDING and queueing should +	 * happen with IRQ disabled. +	 */ +	WARN_ON_ONCE(!irqs_disabled());  	debug_work_activate(work); @@ -995,21 +1237,22 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,  	if (!(wq->flags & WQ_UNBOUND)) {  		struct global_cwq *last_gcwq; -		if (unlikely(cpu == WORK_CPU_UNBOUND)) +		if (cpu == WORK_CPU_UNBOUND)  			cpu = raw_smp_processor_id();  		/* -		 * It's multi cpu.  If @wq is non-reentrant and @work -		 * was previously on a different cpu, it might still -		 * be running there, in which case the work needs to -		 * be queued on that cpu to guarantee non-reentrance. +		 * It's multi cpu.  If @work was previously on a different +		 * cpu, it might still be running there, in which case the +		 * work needs to be queued on that cpu to guarantee +		 * non-reentrancy.  		 
*/  		gcwq = get_gcwq(cpu); -		if (wq->flags & WQ_NON_REENTRANT && -		    (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { +		last_gcwq = get_work_gcwq(work); + +		if (last_gcwq && last_gcwq != gcwq) {  			struct worker *worker; -			spin_lock_irqsave(&last_gcwq->lock, flags); +			spin_lock(&last_gcwq->lock);  			worker = find_worker_executing_work(last_gcwq, work); @@ -1017,22 +1260,23 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,  				gcwq = last_gcwq;  			else {  				/* meh... not running there, queue here */ -				spin_unlock_irqrestore(&last_gcwq->lock, flags); -				spin_lock_irqsave(&gcwq->lock, flags); +				spin_unlock(&last_gcwq->lock); +				spin_lock(&gcwq->lock);  			} -		} else -			spin_lock_irqsave(&gcwq->lock, flags); +		} else { +			spin_lock(&gcwq->lock); +		}  	} else {  		gcwq = get_gcwq(WORK_CPU_UNBOUND); -		spin_lock_irqsave(&gcwq->lock, flags); +		spin_lock(&gcwq->lock);  	}  	/* gcwq determined, get cwq and queue */  	cwq = get_cwq(gcwq->cpu, wq); -	trace_workqueue_queue_work(cpu, cwq, work); +	trace_workqueue_queue_work(req_cpu, cwq, work);  	if (WARN_ON(!list_empty(&work->entry))) { -		spin_unlock_irqrestore(&gcwq->lock, flags); +		spin_unlock(&gcwq->lock);  		return;  	} @@ -1050,79 +1294,110 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,  	insert_work(cwq, work, worklist, work_flags); -	spin_unlock_irqrestore(&gcwq->lock, flags); +	spin_unlock(&gcwq->lock);  }  /** - * queue_work - queue work on a workqueue + * queue_work_on - queue work on specific cpu + * @cpu: CPU number to execute work on   * @wq: workqueue to use   * @work: work to queue   * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * Returns %false if @work was already on a queue, %true otherwise.   * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. + * We queue the work to a specific CPU, the caller must ensure it + * can't go away.   */ -int queue_work(struct workqueue_struct *wq, struct work_struct *work) +bool queue_work_on(int cpu, struct workqueue_struct *wq, +		   struct work_struct *work)  { -	int ret; +	bool ret = false; +	unsigned long flags; -	ret = queue_work_on(get_cpu(), wq, work); -	put_cpu(); +	local_irq_save(flags); + +	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { +		__queue_work(cpu, wq, work); +		ret = true; +	} +	local_irq_restore(flags);  	return ret;  } -EXPORT_SYMBOL_GPL(queue_work); +EXPORT_SYMBOL_GPL(queue_work_on);  /** - * queue_work_on - queue work on specific cpu - * @cpu: CPU number to execute work on + * queue_work - queue work on a workqueue   * @wq: workqueue to use   * @work: work to queue   * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * Returns %false if @work was already on a queue, %true otherwise.   * - * We queue the work to a specific CPU, the caller must ensure it - * can't go away. + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU.   
*/ -int -queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) +bool queue_work(struct workqueue_struct *wq, struct work_struct *work)  { -	int ret = 0; - -	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { -		__queue_work(cpu, wq, work); -		ret = 1; -	} -	return ret; +	return queue_work_on(WORK_CPU_UNBOUND, wq, work);  } -EXPORT_SYMBOL_GPL(queue_work_on); +EXPORT_SYMBOL_GPL(queue_work); -static void delayed_work_timer_fn(unsigned long __data) +void delayed_work_timer_fn(unsigned long __data)  {  	struct delayed_work *dwork = (struct delayed_work *)__data;  	struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); -	__queue_work(smp_processor_id(), cwq->wq, &dwork->work); +	/* should have been called from irqsafe timer with irq already off */ +	__queue_work(dwork->cpu, cwq->wq, &dwork->work);  } +EXPORT_SYMBOL_GPL(delayed_work_timer_fn); -/** - * queue_delayed_work - queue work on a workqueue after delay - * @wq: workqueue to use - * @dwork: delayable work to queue - * @delay: number of jiffies to wait before queueing - * - * Returns 0 if @work was already on a queue, non-zero otherwise. - */ -int queue_delayed_work(struct workqueue_struct *wq, -			struct delayed_work *dwork, unsigned long delay) +static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, +				struct delayed_work *dwork, unsigned long delay)  { -	if (delay == 0) -		return queue_work(wq, &dwork->work); +	struct timer_list *timer = &dwork->timer; +	struct work_struct *work = &dwork->work; +	unsigned int lcpu; + +	WARN_ON_ONCE(timer->function != delayed_work_timer_fn || +		     timer->data != (unsigned long)dwork); +	BUG_ON(timer_pending(timer)); +	BUG_ON(!list_empty(&work->entry)); + +	timer_stats_timer_set_start_info(&dwork->timer); + +	/* +	 * This stores cwq for the moment, for the timer_fn.  Note that the +	 * work's gcwq is preserved to allow reentrance detection for +	 * delayed works. +	 */ +	if (!(wq->flags & WQ_UNBOUND)) { +		struct global_cwq *gcwq = get_work_gcwq(work); -	return queue_delayed_work_on(-1, wq, dwork, delay); +		/* +		 * If we cannot get the last gcwq from @work directly, +		 * select the last CPU such that it avoids unnecessarily +		 * triggering non-reentrancy check in __queue_work(). +		 */ +		lcpu = cpu; +		if (gcwq) +			lcpu = gcwq->cpu; +		if (lcpu == WORK_CPU_UNBOUND) +			lcpu = raw_smp_processor_id(); +	} else { +		lcpu = WORK_CPU_UNBOUND; +	} + +	set_work_cwq(work, get_cwq(lcpu, wq), 0); + +	dwork->cpu = cpu; +	timer->expires = jiffies + delay; + +	if (unlikely(cpu != WORK_CPU_UNBOUND)) +		add_timer_on(timer, cpu); +	else +		add_timer(timer);  } -EXPORT_SYMBOL_GPL(queue_delayed_work);  /**   * queue_delayed_work_on - queue work on specific CPU after delay @@ -1131,53 +1406,100 @@ EXPORT_SYMBOL_GPL(queue_delayed_work);   * @dwork: work to queue   * @delay: number of jiffies to wait before queueing   * - * Returns 0 if @work was already on a queue, non-zero otherwise. + * Returns %false if @work was already on a queue, %true otherwise.  If + * @delay is zero and @dwork is idle, it will be scheduled for immediate + * execution.   
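
queue_work() and queue_work_on() above now return bool and can be called with IRQs disabled. A rough driver-style sketch of the common pattern; all names here are hypothetical.

#include <linux/workqueue.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>

static void foo_event_fn(struct work_struct *work)
{
	/* process whatever foo_irq() noted; runs in process context */
}

static DECLARE_WORK(foo_event_work, foo_event_fn);

static irqreturn_t foo_irq(int irq, void *dev_id)
{
	/*
	 * %false means the work was already pending; it will still run
	 * once and pick up the latest device state.
	 */
	if (!queue_work(system_wq, &foo_event_work))
		pr_debug("foo: event work already pending\n");

	return IRQ_HANDLED;
}
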
*/ -int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, -			struct delayed_work *dwork, unsigned long delay) +bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, +			   struct delayed_work *dwork, unsigned long delay)  { -	int ret = 0; -	struct timer_list *timer = &dwork->timer;  	struct work_struct *work = &dwork->work; +	bool ret = false; +	unsigned long flags; -	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { -		unsigned int lcpu; +	if (!delay) +		return queue_work_on(cpu, wq, &dwork->work); -		BUG_ON(timer_pending(timer)); -		BUG_ON(!list_empty(&work->entry)); +	/* read the comment in __queue_work() */ +	local_irq_save(flags); -		timer_stats_timer_set_start_info(&dwork->timer); +	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { +		__queue_delayed_work(cpu, wq, dwork, delay); +		ret = true; +	} -		/* -		 * This stores cwq for the moment, for the timer_fn. -		 * Note that the work's gcwq is preserved to allow -		 * reentrance detection for delayed works. -		 */ -		if (!(wq->flags & WQ_UNBOUND)) { -			struct global_cwq *gcwq = get_work_gcwq(work); +	local_irq_restore(flags); +	return ret; +} +EXPORT_SYMBOL_GPL(queue_delayed_work_on); -			if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) -				lcpu = gcwq->cpu; -			else -				lcpu = raw_smp_processor_id(); -		} else -			lcpu = WORK_CPU_UNBOUND; +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @dwork: delayable work to queue + * @delay: number of jiffies to wait before queueing + * + * Equivalent to queue_delayed_work_on() but tries to use the local CPU. + */ +bool queue_delayed_work(struct workqueue_struct *wq, +			struct delayed_work *dwork, unsigned long delay) +{ +	return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} +EXPORT_SYMBOL_GPL(queue_delayed_work); -		set_work_cwq(work, get_cwq(lcpu, wq), 0); +/** + * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, + * modify @dwork's timer so that it expires after @delay.  If @delay is + * zero, @work is guaranteed to be scheduled immediately regardless of its + * current state. + * + * Returns %false if @dwork was idle and queued, %true if @dwork was + * pending and its timer was modified. + * + * This function is safe to call from any context including IRQ handler. + * See try_to_grab_pending() for details. 
+ */ +bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, +			 struct delayed_work *dwork, unsigned long delay) +{ +	unsigned long flags; +	int ret; -		timer->expires = jiffies + delay; -		timer->data = (unsigned long)dwork; -		timer->function = delayed_work_timer_fn; +	do { +		ret = try_to_grab_pending(&dwork->work, true, &flags); +	} while (unlikely(ret == -EAGAIN)); -		if (unlikely(cpu >= 0)) -			add_timer_on(timer, cpu); -		else -			add_timer(timer); -		ret = 1; +	if (likely(ret >= 0)) { +		__queue_delayed_work(cpu, wq, dwork, delay); +		local_irq_restore(flags);  	} + +	/* -ENOENT from try_to_grab_pending() becomes %true */  	return ret;  } -EXPORT_SYMBOL_GPL(queue_delayed_work_on); +EXPORT_SYMBOL_GPL(mod_delayed_work_on); + +/** + * mod_delayed_work - modify delay of or queue a delayed work + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * mod_delayed_work_on() on local CPU. + */ +bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, +		      unsigned long delay) +{ +	return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} +EXPORT_SYMBOL_GPL(mod_delayed_work);  /**   * worker_enter_idle - enter idle state @@ -1305,37 +1627,21 @@ __acquires(&gcwq->lock)  	}  } -struct idle_rebind { -	int			cnt;		/* # workers to be rebound */ -	struct completion	done;		/* all workers rebound */ -}; -  /* - * Rebind an idle @worker to its CPU.  During CPU onlining, this has to - * happen synchronously for idle workers.  worker_thread() will test - * %WORKER_REBIND before leaving idle and call this function. + * Rebind an idle @worker to its CPU.  worker_thread() will test + * list_empty(@worker->entry) before leaving idle and call this function.   */  static void idle_worker_rebind(struct worker *worker)  {  	struct global_cwq *gcwq = worker->pool->gcwq; -	/* CPU must be online at this point */ -	WARN_ON(!worker_maybe_bind_and_lock(worker)); -	if (!--worker->idle_rebind->cnt) -		complete(&worker->idle_rebind->done); -	spin_unlock_irq(&worker->pool->gcwq->lock); +	/* CPU may go down again inbetween, clear UNBOUND only on success */ +	if (worker_maybe_bind_and_lock(worker)) +		worker_clr_flags(worker, WORKER_UNBOUND); -	/* we did our part, wait for rebind_workers() to finish up */ -	wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); - -	/* -	 * rebind_workers() shouldn't finish until all workers passed the -	 * above WORKER_REBIND wait.  Tell it when done. -	 */ -	spin_lock_irq(&worker->pool->gcwq->lock); -	if (!--worker->idle_rebind->cnt) -		complete(&worker->idle_rebind->done); -	spin_unlock_irq(&worker->pool->gcwq->lock); +	/* rebind complete, become available again */ +	list_add(&worker->entry, &worker->pool->idle_list); +	spin_unlock_irq(&gcwq->lock);  }  /* @@ -1349,16 +1655,8 @@ static void busy_worker_rebind_fn(struct work_struct *work)  	struct worker *worker = container_of(work, struct worker, rebind_work);  	struct global_cwq *gcwq = worker->pool->gcwq; -	worker_maybe_bind_and_lock(worker); - -	/* -	 * %WORKER_REBIND must be cleared even if the above binding failed; -	 * otherwise, we may confuse the next CPU_UP cycle or oops / get -	 * stuck by calling idle_worker_rebind() prematurely.  If CPU went -	 * down again inbetween, %WORKER_UNBOUND would be set, so clearing -	 * %WORKER_REBIND is always safe. 
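
mod_delayed_work() and mod_delayed_work_on() above replace the open-coded cancel-then-requeue pattern: an idle work item is queued, a pending one simply has its timer pushed out. A rough sketch of using it to debounce an event; the names and the 100 ms interval are made up.

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void foo_flush_fn(struct work_struct *work)
{
	/* runs once things have been quiet for ~100 ms */
}

static DECLARE_DELAYED_WORK(foo_flush_work, foo_flush_fn);

static void foo_note_activity(void)
{
	/*
	 * Arms the work if idle, otherwise just moves its timer out;
	 * per the docs above this is safe from any context, including
	 * IRQ handlers.
	 */
	mod_delayed_work(system_wq, &foo_flush_work, msecs_to_jiffies(100));
}
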
-	 */ -	worker_clr_flags(worker, WORKER_REBIND); +	if (worker_maybe_bind_and_lock(worker)) +		worker_clr_flags(worker, WORKER_UNBOUND);  	spin_unlock_irq(&gcwq->lock);  } @@ -1370,123 +1668,74 @@ static void busy_worker_rebind_fn(struct work_struct *work)   * @gcwq->cpu is coming online.  Rebind all workers to the CPU.  Rebinding   * is different for idle and busy ones.   * - * The idle ones should be rebound synchronously and idle rebinding should - * be complete before any worker starts executing work items with - * concurrency management enabled; otherwise, scheduler may oops trying to - * wake up non-local idle worker from wq_worker_sleeping(). + * Idle ones will be removed from the idle_list and woken up.  They will + * add themselves back after completing rebind.  This ensures that the + * idle_list doesn't contain any unbound workers when re-bound busy workers + * try to perform local wake-ups for concurrency management.   * - * This is achieved by repeatedly requesting rebinding until all idle - * workers are known to have been rebound under @gcwq->lock and holding all - * idle workers from becoming busy until idle rebinding is complete. + * Busy workers can rebind after they finish their current work items. + * Queueing the rebind work item at the head of the scheduled list is + * enough.  Note that nr_running will be properly bumped as busy workers + * rebind.   * - * Once idle workers are rebound, busy workers can be rebound as they - * finish executing their current work items.  Queueing the rebind work at - * the head of their scheduled lists is enough.  Note that nr_running will - * be properbly bumped as busy workers rebind. - * - * On return, all workers are guaranteed to either be bound or have rebind - * work item scheduled. + * On return, all non-manager workers are scheduled for rebind - see + * manage_workers() for the manager special case.  Any idle worker + * including the manager will not appear on @idle_list until rebind is + * complete, making local wake-ups safe.   */  static void rebind_workers(struct global_cwq *gcwq) -	__releases(&gcwq->lock) __acquires(&gcwq->lock)  { -	struct idle_rebind idle_rebind;  	struct worker_pool *pool; -	struct worker *worker; +	struct worker *worker, *n;  	struct hlist_node *pos;  	int i;  	lockdep_assert_held(&gcwq->lock);  	for_each_worker_pool(pool, gcwq) -		lockdep_assert_held(&pool->manager_mutex); +		lockdep_assert_held(&pool->assoc_mutex); -	/* -	 * Rebind idle workers.  Interlocked both ways.  We wait for -	 * workers to rebind via @idle_rebind.done.  Workers will wait for -	 * us to finish up by watching %WORKER_REBIND. -	 */ -	init_completion(&idle_rebind.done); -retry: -	idle_rebind.cnt = 1; -	INIT_COMPLETION(idle_rebind.done); - -	/* set REBIND and kick idle ones, we'll wait for these later */ +	/* dequeue and kick idle ones */  	for_each_worker_pool(pool, gcwq) { -		list_for_each_entry(worker, &pool->idle_list, entry) { -			unsigned long worker_flags = worker->flags; - -			if (worker->flags & WORKER_REBIND) -				continue; - -			/* morph UNBOUND to REBIND atomically */ -			worker_flags &= ~WORKER_UNBOUND; -			worker_flags |= WORKER_REBIND; -			ACCESS_ONCE(worker->flags) = worker_flags; - -			idle_rebind.cnt++; -			worker->idle_rebind = &idle_rebind; +		list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { +			/* +			 * idle workers should be off @pool->idle_list +			 * until rebind is complete to avoid receiving +			 * premature local wake-ups. 
+			 */ +			list_del_init(&worker->entry); -			/* worker_thread() will call idle_worker_rebind() */ +			/* +			 * worker_thread() will see the above dequeuing +			 * and call idle_worker_rebind(). +			 */  			wake_up_process(worker->task);  		}  	} -	if (--idle_rebind.cnt) { -		spin_unlock_irq(&gcwq->lock); -		wait_for_completion(&idle_rebind.done); -		spin_lock_irq(&gcwq->lock); -		/* busy ones might have become idle while waiting, retry */ -		goto retry; -	} - -	/* all idle workers are rebound, rebind busy workers */ +	/* rebind busy workers */  	for_each_busy_worker(worker, i, pos, gcwq) {  		struct work_struct *rebind_work = &worker->rebind_work; -		unsigned long worker_flags = worker->flags; - -		/* morph UNBOUND to REBIND atomically */ -		worker_flags &= ~WORKER_UNBOUND; -		worker_flags |= WORKER_REBIND; -		ACCESS_ONCE(worker->flags) = worker_flags; +		struct workqueue_struct *wq;  		if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,  				     work_data_bits(rebind_work)))  			continue; -		/* wq doesn't matter, use the default one */  		debug_work_activate(rebind_work); -		insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, -			    worker->scheduled.next, -			    work_color_to_flags(WORK_NO_COLOR)); -	} - -	/* -	 * All idle workers are rebound and waiting for %WORKER_REBIND to -	 * be cleared inside idle_worker_rebind().  Clear and release. -	 * Clearing %WORKER_REBIND from this foreign context is safe -	 * because these workers are still guaranteed to be idle. -	 * -	 * We need to make sure all idle workers passed WORKER_REBIND wait -	 * in idle_worker_rebind() before returning; otherwise, workers can -	 * get stuck at the wait if hotplug cycle repeats. -	 */ -	idle_rebind.cnt = 1; -	INIT_COMPLETION(idle_rebind.done); - -	for_each_worker_pool(pool, gcwq) { -		list_for_each_entry(worker, &pool->idle_list, entry) { -			worker->flags &= ~WORKER_REBIND; -			idle_rebind.cnt++; -		} -	} -	wake_up_all(&gcwq->rebind_hold); +		/* +		 * wq doesn't really matter but let's keep @worker->pool +		 * and @cwq->pool consistent for sanity. +		 */ +		if (worker_pool_pri(worker->pool)) +			wq = system_highpri_wq; +		else +			wq = system_wq; -	if (--idle_rebind.cnt) { -		spin_unlock_irq(&gcwq->lock); -		wait_for_completion(&idle_rebind.done); -		spin_lock_irq(&gcwq->lock); +		insert_work(get_cwq(gcwq->cpu, wq), rebind_work, +			worker->scheduled.next, +			work_color_to_flags(WORK_NO_COLOR));  	}  } @@ -1844,22 +2093,22 @@ static bool manage_workers(struct worker *worker)  	 * grab %POOL_MANAGING_WORKERS to achieve this because that can  	 * lead to idle worker depletion (all become busy thinking someone  	 * else is managing) which in turn can result in deadlock under -	 * extreme circumstances.  Use @pool->manager_mutex to synchronize +	 * extreme circumstances.  Use @pool->assoc_mutex to synchronize  	 * manager against CPU hotplug.  	 * -	 * manager_mutex would always be free unless CPU hotplug is in +	 * assoc_mutex would always be free unless CPU hotplug is in  	 * progress.  trylock first without dropping @gcwq->lock.  	 */ -	if (unlikely(!mutex_trylock(&pool->manager_mutex))) { +	if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {  		spin_unlock_irq(&pool->gcwq->lock); -		mutex_lock(&pool->manager_mutex); +		mutex_lock(&pool->assoc_mutex);  		/*  		 * CPU hotplug could have happened while we were waiting -		 * for manager_mutex.  Hotplug itself can't handle us +		 * for assoc_mutex.  
Hotplug itself can't handle us  		 * because manager isn't either on idle or busy list, and  		 * @gcwq's state and ours could have deviated.  		 * -		 * As hotplug is now excluded via manager_mutex, we can +		 * As hotplug is now excluded via assoc_mutex, we can  		 * simply try to bind.  It will succeed or fail depending  		 * on @gcwq's current state.  Try it and adjust  		 * %WORKER_UNBOUND accordingly. @@ -1882,112 +2131,11 @@ static bool manage_workers(struct worker *worker)  	ret |= maybe_create_worker(pool);  	pool->flags &= ~POOL_MANAGING_WORKERS; -	mutex_unlock(&pool->manager_mutex); +	mutex_unlock(&pool->assoc_mutex);  	return ret;  }  /** - * move_linked_works - move linked works to a list - * @work: start of series of works to be scheduled - * @head: target list to append @work to - * @nextp: out paramter for nested worklist walking - * - * Schedule linked works starting from @work to @head.  Work series to - * be scheduled starts at @work and includes any consecutive work with - * WORK_STRUCT_LINKED set in its predecessor. - * - * If @nextp is not NULL, it's updated to point to the next work of - * the last scheduled work.  This allows move_linked_works() to be - * nested inside outer list_for_each_entry_safe(). - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void move_linked_works(struct work_struct *work, struct list_head *head, -			      struct work_struct **nextp) -{ -	struct work_struct *n; - -	/* -	 * Linked worklist will always end before the end of the list, -	 * use NULL for list head. -	 */ -	list_for_each_entry_safe_from(work, n, NULL, entry) { -		list_move_tail(&work->entry, head); -		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) -			break; -	} - -	/* -	 * If we're already inside safe list traversal and have moved -	 * multiple works to the scheduled queue, the next position -	 * needs to be updated. -	 */ -	if (nextp) -		*nextp = n; -} - -static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) -{ -	struct work_struct *work = list_first_entry(&cwq->delayed_works, -						    struct work_struct, entry); - -	trace_workqueue_activate_work(work); -	move_linked_works(work, &cwq->pool->worklist, NULL); -	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); -	cwq->nr_active++; -} - -/** - * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight - * @cwq: cwq of interest - * @color: color of work which left the queue - * @delayed: for a delayed work - * - * A work either has completed or is removed from pending queue, - * decrement nr_in_flight of its cwq and handle workqueue flushing. - * - * CONTEXT: - * spin_lock_irq(gcwq->lock). - */ -static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, -				 bool delayed) -{ -	/* ignore uncolored works */ -	if (color == WORK_NO_COLOR) -		return; - -	cwq->nr_in_flight[color]--; - -	if (!delayed) { -		cwq->nr_active--; -		if (!list_empty(&cwq->delayed_works)) { -			/* one down, submit a delayed one */ -			if (cwq->nr_active < cwq->max_active) -				cwq_activate_first_delayed(cwq); -		} -	} - -	/* is flush in progress and are we at the flushing tip? */ -	if (likely(cwq->flush_color != color)) -		return; - -	/* are there still in-flight works? */ -	if (cwq->nr_in_flight[color]) -		return; - -	/* this cwq is done, clear flush_color */ -	cwq->flush_color = -1; - -	/* -	 * If this was the last cwq, wake up the first flusher.  It -	 * will handle the rest. 
-	 */ -	if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) -		complete(&cwq->wq->first_flusher->done); -} - -/**   * process_one_work - process single work   * @worker: self   * @work: work to process @@ -2030,7 +2178,7 @@ __acquires(&gcwq->lock)  	 * necessary to avoid spurious warnings from rescuers servicing the  	 * unbound or a disassociated gcwq.  	 */ -	WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && +	WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&  		     !(gcwq->flags & GCWQ_DISASSOCIATED) &&  		     raw_smp_processor_id() != gcwq->cpu); @@ -2046,15 +2194,13 @@ __acquires(&gcwq->lock)  		return;  	} -	/* claim and process */ +	/* claim and dequeue */  	debug_work_deactivate(work);  	hlist_add_head(&worker->hentry, bwh);  	worker->current_work = work;  	worker->current_cwq = cwq;  	work_color = get_work_color(work); -	/* record the current cpu number in the work data and dequeue */ -	set_work_cpu(work, gcwq->cpu);  	list_del_init(&work->entry);  	/* @@ -2071,9 +2217,16 @@ __acquires(&gcwq->lock)  	if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))  		wake_up_worker(pool); +	/* +	 * Record the last CPU and clear PENDING which should be the last +	 * update to @work.  Also, do this inside @gcwq->lock so that +	 * PENDING and queued state changes happen together while IRQ is +	 * disabled. +	 */ +	set_work_cpu_and_clear_pending(work, gcwq->cpu); +  	spin_unlock_irq(&gcwq->lock); -	work_clear_pending(work);  	lock_map_acquire_read(&cwq->wq->lockdep_map);  	lock_map_acquire(&lockdep_map);  	trace_workqueue_execute_start(work); @@ -2087,11 +2240,9 @@ __acquires(&gcwq->lock)  	lock_map_release(&cwq->wq->lockdep_map);  	if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { -		printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " -		       "%s/0x%08x/%d\n", -		       current->comm, preempt_count(), task_pid_nr(current)); -		printk(KERN_ERR "    last function: "); -		print_symbol("%s\n", (unsigned long)f); +		pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" +		       "     last function: %pf\n", +		       current->comm, preempt_count(), task_pid_nr(current), f);  		debug_show_held_locks(current);  		dump_stack();  	} @@ -2106,7 +2257,7 @@ __acquires(&gcwq->lock)  	hlist_del_init(&worker->hentry);  	worker->current_work = NULL;  	worker->current_cwq = NULL; -	cwq_dec_nr_in_flight(cwq, work_color, false); +	cwq_dec_nr_in_flight(cwq, work_color);  }  /** @@ -2151,18 +2302,17 @@ static int worker_thread(void *__worker)  woke_up:  	spin_lock_irq(&gcwq->lock); -	/* -	 * DIE can be set only while idle and REBIND set while busy has -	 * @worker->rebind_work scheduled.  Checking here is enough. 
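
The pr_err() above fires when a work function returns in atomic context or with locks held. For illustration only, the deliberately broken handler below is the kind of bug it catches; the lock and function are hypothetical.

#include <linux/workqueue.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(foo_lock);

static void foo_broken_fn(struct work_struct *work)
{
	spin_lock(&foo_lock);
	/*
	 * Missing spin_unlock(): process_one_work() will print
	 * "BUG: workqueue leaked lock or atomic", name this function
	 * and dump the held locks.
	 */
}
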
-	 */ -	if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) { +	/* we are off idle list if destruction or rebind is requested */ +	if (unlikely(list_empty(&worker->entry))) {  		spin_unlock_irq(&gcwq->lock); +		/* if DIE is set, destruction is requested */  		if (worker->flags & WORKER_DIE) {  			worker->task->flags &= ~PF_WQ_WORKER;  			return 0;  		} +		/* otherwise, rebind */  		idle_worker_rebind(worker);  		goto woke_up;  	} @@ -2645,8 +2795,8 @@ reflush:  		if (++flush_cnt == 10 ||  		    (flush_cnt % 100 == 0 && flush_cnt <= 1000)) -			pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", -				   wq->name, flush_cnt); +			pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", +				wq->name, flush_cnt);  		goto reflush;  	} @@ -2657,8 +2807,7 @@ reflush:  }  EXPORT_SYMBOL_GPL(drain_workqueue); -static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, -			     bool wait_executing) +static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)  {  	struct worker *worker = NULL;  	struct global_cwq *gcwq; @@ -2680,13 +2829,12 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,  		cwq = get_work_cwq(work);  		if (unlikely(!cwq || gcwq != cwq->pool->gcwq))  			goto already_gone; -	} else if (wait_executing) { +	} else {  		worker = find_worker_executing_work(gcwq, work);  		if (!worker)  			goto already_gone;  		cwq = worker->current_cwq; -	} else -		goto already_gone; +	}  	insert_wq_barrier(cwq, barr, work, worker);  	spin_unlock_irq(&gcwq->lock); @@ -2713,15 +2861,8 @@ already_gone:   * flush_work - wait for a work to finish executing the last queueing instance   * @work: the work to flush   * - * Wait until @work has finished execution.  This function considers - * only the last queueing instance of @work.  If @work has been - * enqueued across different CPUs on a non-reentrant workqueue or on - * multiple workqueues, @work might still be executing on return on - * some of the CPUs from earlier queueing. - * - * If @work was queued only on a non-reentrant, ordered or unbound - * workqueue, @work is guaranteed to be idle on return if it hasn't - * been requeued since flush started. + * Wait until @work has finished execution.  @work is guaranteed to be idle + * on return if it hasn't been requeued since flush started.   
*   * RETURNS:   * %true if flush_work() waited for the work to finish execution, @@ -2734,140 +2875,36 @@ bool flush_work(struct work_struct *work)  	lock_map_acquire(&work->lockdep_map);  	lock_map_release(&work->lockdep_map); -	if (start_flush_work(work, &barr, true)) { +	if (start_flush_work(work, &barr)) {  		wait_for_completion(&barr.done);  		destroy_work_on_stack(&barr.work);  		return true; -	} else -		return false; -} -EXPORT_SYMBOL_GPL(flush_work); - -static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) -{ -	struct wq_barrier barr; -	struct worker *worker; - -	spin_lock_irq(&gcwq->lock); - -	worker = find_worker_executing_work(gcwq, work); -	if (unlikely(worker)) -		insert_wq_barrier(worker->current_cwq, &barr, work, worker); - -	spin_unlock_irq(&gcwq->lock); - -	if (unlikely(worker)) { -		wait_for_completion(&barr.done); -		destroy_work_on_stack(&barr.work); -		return true; -	} else +	} else {  		return false; -} - -static bool wait_on_work(struct work_struct *work) -{ -	bool ret = false; -	int cpu; - -	might_sleep(); - -	lock_map_acquire(&work->lockdep_map); -	lock_map_release(&work->lockdep_map); - -	for_each_gcwq_cpu(cpu) -		ret |= wait_on_cpu_work(get_gcwq(cpu), work); -	return ret; -} - -/** - * flush_work_sync - wait until a work has finished execution - * @work: the work to flush - * - * Wait until @work has finished execution.  On return, it's - * guaranteed that all queueing instances of @work which happened - * before this function is called are finished.  In other words, if - * @work hasn't been requeued since this function was called, @work is - * guaranteed to be idle on return. - * - * RETURNS: - * %true if flush_work_sync() waited for the work to finish execution, - * %false if it was already idle. - */ -bool flush_work_sync(struct work_struct *work) -{ -	struct wq_barrier barr; -	bool pending, waited; - -	/* we'll wait for executions separately, queue barr only if pending */ -	pending = start_flush_work(work, &barr, false); - -	/* wait for executions to finish */ -	waited = wait_on_work(work); - -	/* wait for the pending one */ -	if (pending) { -		wait_for_completion(&barr.done); -		destroy_work_on_stack(&barr.work);  	} - -	return pending || waited; -} -EXPORT_SYMBOL_GPL(flush_work_sync); - -/* - * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, - * so this work can't be re-armed in any way. - */ -static int try_to_grab_pending(struct work_struct *work) -{ -	struct global_cwq *gcwq; -	int ret = -1; - -	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) -		return 0; - -	/* -	 * The queueing is in progress, or it is already queued. Try to -	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING. -	 */ -	gcwq = get_work_gcwq(work); -	if (!gcwq) -		return ret; - -	spin_lock_irq(&gcwq->lock); -	if (!list_empty(&work->entry)) { -		/* -		 * This work is queued, but perhaps we locked the wrong gcwq. -		 * In that case we must see the new value after rmb(), see -		 * insert_work()->wmb(). 
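
With flush_work() now guaranteeing the work is idle on return (unless requeued) and flush_work_sync()/wait_on_work() removed, a teardown path only needs the plain helpers. A rough sketch of a driver shutdown under that assumption; the structure and fields are hypothetical.

#include <linux/workqueue.h>

struct foo_dev {
	bool stopping;
	struct delayed_work poll_work;
	struct work_struct irq_work;
};

static void foo_shutdown(struct foo_dev *foo)
{
	/* stop new submissions first (hypothetical flag) */
	foo->stopping = true;

	/* timer killed and handler finished by the time this returns */
	cancel_delayed_work_sync(&foo->poll_work);

	/* idle on return as long as nothing requeued it above */
	flush_work(&foo->irq_work);
}
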
-		 */ -		smp_rmb(); -		if (gcwq == get_work_gcwq(work)) { -			debug_work_deactivate(work); -			list_del_init(&work->entry); -			cwq_dec_nr_in_flight(get_work_cwq(work), -				get_work_color(work), -				*work_data_bits(work) & WORK_STRUCT_DELAYED); -			ret = 1; -		} -	} -	spin_unlock_irq(&gcwq->lock); - -	return ret;  } +EXPORT_SYMBOL_GPL(flush_work); -static bool __cancel_work_timer(struct work_struct *work, -				struct timer_list* timer) +static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)  { +	unsigned long flags;  	int ret;  	do { -		ret = (timer && likely(del_timer(timer))); -		if (!ret) -			ret = try_to_grab_pending(work); -		wait_on_work(work); +		ret = try_to_grab_pending(work, is_dwork, &flags); +		/* +		 * If someone else is canceling, wait for the same event it +		 * would be waiting for before retrying. +		 */ +		if (unlikely(ret == -ENOENT)) +			flush_work(work);  	} while (unlikely(ret < 0)); +	/* tell other tasks trying to grab @work to back off */ +	mark_work_canceling(work); +	local_irq_restore(flags); + +	flush_work(work);  	clear_work_data(work);  	return ret;  } @@ -2892,7 +2929,7 @@ static bool __cancel_work_timer(struct work_struct *work,   */  bool cancel_work_sync(struct work_struct *work)  { -	return __cancel_work_timer(work, NULL); +	return __cancel_work_timer(work, false);  }  EXPORT_SYMBOL_GPL(cancel_work_sync); @@ -2910,33 +2947,44 @@ EXPORT_SYMBOL_GPL(cancel_work_sync);   */  bool flush_delayed_work(struct delayed_work *dwork)  { +	local_irq_disable();  	if (del_timer_sync(&dwork->timer)) -		__queue_work(raw_smp_processor_id(), +		__queue_work(dwork->cpu,  			     get_work_cwq(&dwork->work)->wq, &dwork->work); +	local_irq_enable();  	return flush_work(&dwork->work);  }  EXPORT_SYMBOL(flush_delayed_work);  /** - * flush_delayed_work_sync - wait for a dwork to finish - * @dwork: the delayed work to flush + * cancel_delayed_work - cancel a delayed work + * @dwork: delayed_work to cancel   * - * Delayed timer is cancelled and the pending work is queued for - * execution immediately.  Other than timer handling, its behavior - * is identical to flush_work_sync(). + * Kill off a pending delayed_work.  Returns %true if @dwork was pending + * and canceled; %false if wasn't pending.  Note that the work callback + * function may still be running on return, unless it returns %true and the + * work doesn't re-arm itself.  Explicitly flush or use + * cancel_delayed_work_sync() to wait on it.   * - * RETURNS: - * %true if flush_work_sync() waited for the work to finish execution, - * %false if it was already idle. + * This function is safe to call from any context including IRQ handler.   
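Since cancel_delayed_work() is documented above as safe from any context including IRQ handlers, a hardirq handler can retire a pending timeout directly. A hypothetical sketch under that assumption (struct bar_dev, bar_irq and bar_timeout_fn are made-up names, not from this patch):

#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/workqueue.h>

struct bar_dev {
	struct delayed_work timeout_work;
};

static void bar_timeout_fn(struct work_struct *work)
{
	struct bar_dev *bd = container_of(to_delayed_work(work),
					  struct bar_dev, timeout_work);
	/* the device did not answer in time; recover it here */
	(void)bd;
}

static void bar_dev_init(struct bar_dev *bd)
{
	INIT_DELAYED_WORK(&bd->timeout_work, bar_timeout_fn);
}

static irqreturn_t bar_irq(int irq, void *dev_id)
{
	struct bar_dev *bd = dev_id;

	/*
	 * The reply arrived: drop the pending timeout.  Safe in hardirq
	 * context; returns %false if the work was no longer pending,
	 * e.g. the timeout handler already started running, in which
	 * case the handler must tolerate the race itself.
	 */
	cancel_delayed_work(&bd->timeout_work);
	return IRQ_HANDLED;
}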
*/ -bool flush_delayed_work_sync(struct delayed_work *dwork) +bool cancel_delayed_work(struct delayed_work *dwork)  { -	if (del_timer_sync(&dwork->timer)) -		__queue_work(raw_smp_processor_id(), -			     get_work_cwq(&dwork->work)->wq, &dwork->work); -	return flush_work_sync(&dwork->work); +	unsigned long flags; +	int ret; + +	do { +		ret = try_to_grab_pending(&dwork->work, true, &flags); +	} while (unlikely(ret == -EAGAIN)); + +	if (unlikely(ret < 0)) +		return false; + +	set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); +	local_irq_restore(flags); +	return ret;  } -EXPORT_SYMBOL(flush_delayed_work_sync); +EXPORT_SYMBOL(cancel_delayed_work);  /**   * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish @@ -2949,54 +2997,39 @@ EXPORT_SYMBOL(flush_delayed_work_sync);   */  bool cancel_delayed_work_sync(struct delayed_work *dwork)  { -	return __cancel_work_timer(&dwork->work, &dwork->timer); +	return __cancel_work_timer(&dwork->work, true);  }  EXPORT_SYMBOL(cancel_delayed_work_sync);  /** - * schedule_work - put work task in global workqueue - * @work: job to be done - * - * Returns zero if @work was already on the kernel-global workqueue and - * non-zero otherwise. - * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global - * workqueue otherwise. - */ -int schedule_work(struct work_struct *work) -{ -	return queue_work(system_wq, work); -} -EXPORT_SYMBOL(schedule_work); - -/*   * schedule_work_on - put work task on a specific cpu   * @cpu: cpu to put the work task on   * @work: job to be done   *   * This puts a job on a specific cpu   */ -int schedule_work_on(int cpu, struct work_struct *work) +bool schedule_work_on(int cpu, struct work_struct *work)  {  	return queue_work_on(cpu, system_wq, work);  }  EXPORT_SYMBOL(schedule_work_on);  /** - * schedule_delayed_work - put work task in global workqueue after delay - * @dwork: job to be done - * @delay: number of jiffies to wait or 0 for immediate execution + * schedule_work - put work task in global workqueue + * @work: job to be done   * - * After waiting for a given time this puts a job in the kernel-global - * workqueue. + * Returns %false if @work was already on the kernel-global workqueue and + * %true otherwise. + * + * This puts a job in the kernel-global workqueue if it was not already + * queued and leaves it in the same position on the kernel-global + * workqueue otherwise.   */ -int schedule_delayed_work(struct delayed_work *dwork, -					unsigned long delay) +bool schedule_work(struct work_struct *work)  { -	return queue_delayed_work(system_wq, dwork, delay); +	return queue_work(system_wq, work);  } -EXPORT_SYMBOL(schedule_delayed_work); +EXPORT_SYMBOL(schedule_work);  /**   * schedule_delayed_work_on - queue work in global workqueue on CPU after delay @@ -3007,14 +3040,28 @@ EXPORT_SYMBOL(schedule_delayed_work);   * After waiting for a given time this puts a job in the kernel-global   * workqueue on the specified CPU.   
*/ -int schedule_delayed_work_on(int cpu, -			struct delayed_work *dwork, unsigned long delay) +bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, +			      unsigned long delay)  {  	return queue_delayed_work_on(cpu, system_wq, dwork, delay);  }  EXPORT_SYMBOL(schedule_delayed_work_on);  /** + * schedule_delayed_work - put work task in global workqueue after delay + * @dwork: job to be done + * @delay: number of jiffies to wait or 0 for immediate execution + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ +bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) +{ +	return queue_delayed_work(system_wq, dwork, delay); +} +EXPORT_SYMBOL(schedule_delayed_work); + +/**   * schedule_on_each_cpu - execute a function synchronously on each online CPU   * @func: the function to call   * @@ -3161,9 +3208,8 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,  	int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE;  	if (max_active < 1 || max_active > lim) -		printk(KERN_WARNING "workqueue: max_active %d requested for %s " -		       "is out of range, clamping between %d and %d\n", -		       max_active, name, 1, lim); +		pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", +			max_active, name, 1, lim);  	return clamp_val(max_active, 1, lim);  } @@ -3319,6 +3365,26 @@ void destroy_workqueue(struct workqueue_struct *wq)  EXPORT_SYMBOL_GPL(destroy_workqueue);  /** + * cwq_set_max_active - adjust max_active of a cwq + * @cwq: target cpu_workqueue_struct + * @max_active: new max_active value. + * + * Set @cwq->max_active to @max_active and activate delayed works if + * increased. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) +{ +	cwq->max_active = max_active; + +	while (!list_empty(&cwq->delayed_works) && +	       cwq->nr_active < cwq->max_active) +		cwq_activate_first_delayed(cwq); +} + +/**   * workqueue_set_max_active - adjust max_active of a workqueue   * @wq: target workqueue   * @max_active: new max_active value. 
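cwq_set_max_active() above is the per-CPU helper behind the public workqueue_set_max_active() updated in the next hunk; raising the limit immediately releases works parked on the delayed list. A hypothetical sketch of the caller-visible effect (the workqueue name, flags and limits are invented for illustration):

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *io_wq;

static int __init io_wq_setup(void)
{
	/* start conservatively: at most one in-flight work item per CPU */
	io_wq = alloc_workqueue("io_wq", WQ_MEM_RECLAIM, 1);
	if (!io_wq)
		return -ENOMEM;
	return 0;
}

static void io_wq_go_fast(void)
{
	/*
	 * Raise the limit; work items that were held on the delayed list
	 * because the old max_active had been reached are activated
	 * right away by the code added above.
	 */
	workqueue_set_max_active(io_wq, 16);
}

The split also lets the freezer thaw path reuse the same helper instead of open-coding the activation loop, as the thaw_workqueues() hunk further down shows.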
@@ -3345,7 +3411,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)  		if (!(wq->flags & WQ_FREEZABLE) ||  		    !(gcwq->flags & GCWQ_FREEZING)) -			get_cwq(gcwq->cpu, wq)->max_active = max_active; +			cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active);  		spin_unlock_irq(&gcwq->lock);  	} @@ -3440,23 +3506,23 @@ EXPORT_SYMBOL_GPL(work_busy);   */  /* claim manager positions of all pools */ -static void gcwq_claim_management_and_lock(struct global_cwq *gcwq) +static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq)  {  	struct worker_pool *pool;  	for_each_worker_pool(pool, gcwq) -		mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); +		mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools);  	spin_lock_irq(&gcwq->lock);  }  /* release manager positions */ -static void gcwq_release_management_and_unlock(struct global_cwq *gcwq) +static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq)  {  	struct worker_pool *pool;  	spin_unlock_irq(&gcwq->lock);  	for_each_worker_pool(pool, gcwq) -		mutex_unlock(&pool->manager_mutex); +		mutex_unlock(&pool->assoc_mutex);  }  static void gcwq_unbind_fn(struct work_struct *work) @@ -3469,7 +3535,7 @@ static void gcwq_unbind_fn(struct work_struct *work)  	BUG_ON(gcwq->cpu != smp_processor_id()); -	gcwq_claim_management_and_lock(gcwq); +	gcwq_claim_assoc_and_lock(gcwq);  	/*  	 * We've claimed all manager positions.  Make all workers unbound @@ -3486,7 +3552,7 @@ static void gcwq_unbind_fn(struct work_struct *work)  	gcwq->flags |= GCWQ_DISASSOCIATED; -	gcwq_release_management_and_unlock(gcwq); +	gcwq_release_assoc_and_unlock(gcwq);  	/*  	 * Call schedule() so that we cross rq->lock and thus can guarantee @@ -3514,7 +3580,7 @@ static void gcwq_unbind_fn(struct work_struct *work)   * Workqueues should be brought up before normal priority CPU notifiers.   * This will be registered high priority CPU notifier.   */ -static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, +static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,  					       unsigned long action,  					       void *hcpu)  { @@ -3542,10 +3608,10 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,  	case CPU_DOWN_FAILED:  	case CPU_ONLINE: -		gcwq_claim_management_and_lock(gcwq); +		gcwq_claim_assoc_and_lock(gcwq);  		gcwq->flags &= ~GCWQ_DISASSOCIATED;  		rebind_workers(gcwq); -		gcwq_release_management_and_unlock(gcwq); +		gcwq_release_assoc_and_unlock(gcwq);  		break;  	}  	return NOTIFY_OK; @@ -3555,7 +3621,7 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,   * Workqueues should be brought down after normal priority CPU notifiers.   * This will be registered as low priority CPU notifier.   
*/ -static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, +static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,  						 unsigned long action,  						 void *hcpu)  { @@ -3566,7 +3632,7 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,  	case CPU_DOWN_PREPARE:  		/* unbinding should happen on the local CPU */  		INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); -		schedule_work_on(cpu, &unbind_work); +		queue_work_on(cpu, system_highpri_wq, &unbind_work);  		flush_work(&unbind_work);  		break;  	} @@ -3735,11 +3801,7 @@ void thaw_workqueues(void)  				continue;  			/* restore max_active and repopulate worklist */ -			cwq->max_active = wq->saved_max_active; - -			while (!list_empty(&cwq->delayed_works) && -			       cwq->nr_active < cwq->max_active) -				cwq_activate_first_delayed(cwq); +			cwq_set_max_active(cwq, wq->saved_max_active);  		}  		for_each_worker_pool(pool, gcwq) @@ -3759,8 +3821,12 @@ static int __init init_workqueues(void)  	unsigned int cpu;  	int i; +	/* make sure we have enough bits for OFFQ CPU number */ +	BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < +		     WORK_CPU_LAST); +  	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); -	cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); +	hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);  	/* initialize gcwqs */  	for_each_gcwq_cpu(cpu) { @@ -3786,11 +3852,9 @@ static int __init init_workqueues(void)  			setup_timer(&pool->mayday_timer, gcwq_mayday_timeout,  				    (unsigned long)pool); -			mutex_init(&pool->manager_mutex); +			mutex_init(&pool->assoc_mutex);  			ida_init(&pool->worker_ida);  		} - -		init_waitqueue_head(&gcwq->rebind_hold);  	}  	/* create the initial worker */ @@ -3813,17 +3877,14 @@ static int __init init_workqueues(void)  	}  	system_wq = alloc_workqueue("events", 0, 0); +	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);  	system_long_wq = alloc_workqueue("events_long", 0, 0); -	system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);  	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,  					    WQ_UNBOUND_MAX_ACTIVE);  	system_freezable_wq = alloc_workqueue("events_freezable",  					      WQ_FREEZABLE, 0); -	system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", -			WQ_NON_REENTRANT | WQ_FREEZABLE, 0); -	BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || -	       !system_unbound_wq || !system_freezable_wq || -		!system_nrt_freezable_wq); +	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || +	       !system_unbound_wq || !system_freezable_wq);  	return 0;  }  early_initcall(init_workqueues);  |
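With system_nrt_wq and system_nrt_freezable_wq removed and system_highpri_wq added, former WQ_NON_REENTRANT users simply queue to system_wq and rely on flush_work()/cancel_work_sync(), which now give the stronger guarantee by default. A before/after sketch, not taken from any in-tree driver:

#include <linux/workqueue.h>

static struct work_struct event_work;

static void event_fn(struct work_struct *work)
{
	/*
	 * With this series a given work item is no longer run
	 * concurrently on several CPUs, which is what queueing on
	 * system_nrt_wq used to provide.
	 */
}

static void event_init(void)
{
	INIT_WORK(&event_work, event_fn);
}

static void event_kick(void)
{
	/* formerly: queue_work(system_nrt_wq, &event_work); */
	schedule_work(&event_work);
}

static void event_teardown(void)
{
	/* formerly: flush_work_sync(&event_work); */
	cancel_work_sync(&event_work);
}

Work that must preempt regular items, such as the CPU-unbind work queued in workqueue_cpu_down_callback() above, goes to the new system_highpri_wq instead.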