diff options
Diffstat (limited to 'ipc/mqueue.c')
| -rw-r--r-- | ipc/mqueue.c | 292 | 
1 files changed, 232 insertions, 60 deletions
diff --git a/ipc/mqueue.c b/ipc/mqueue.c index a2757d4ab77..8ce57691e7b 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -24,6 +24,7 @@  #include <linux/mqueue.h>  #include <linux/msg.h>  #include <linux/skbuff.h> +#include <linux/vmalloc.h>  #include <linux/netlink.h>  #include <linux/syscalls.h>  #include <linux/audit.h> @@ -49,6 +50,12 @@  #define STATE_PENDING	1  #define STATE_READY	2 +struct posix_msg_tree_node { +	struct rb_node		rb_node; +	struct list_head	msg_list; +	int			priority; +}; +  struct ext_wait_queue {		/* queue of sleeping tasks */  	struct task_struct *task;  	struct list_head list; @@ -61,7 +68,8 @@ struct mqueue_inode_info {  	struct inode vfs_inode;  	wait_queue_head_t wait_q; -	struct msg_msg **messages; +	struct rb_root msg_tree; +	struct posix_msg_tree_node *node_cache;  	struct mq_attr attr;  	struct sigevent notify; @@ -109,6 +117,103 @@ static struct ipc_namespace *get_ns_from_inode(struct inode *inode)  	return ns;  } +/* Auxiliary functions to manipulate messages' list */ +static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) +{ +	struct rb_node **p, *parent = NULL; +	struct posix_msg_tree_node *leaf; + +	p = &info->msg_tree.rb_node; +	while (*p) { +		parent = *p; +		leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node); + +		if (likely(leaf->priority == msg->m_type)) +			goto insert_msg; +		else if (msg->m_type < leaf->priority) +			p = &(*p)->rb_left; +		else +			p = &(*p)->rb_right; +	} +	if (info->node_cache) { +		leaf = info->node_cache; +		info->node_cache = NULL; +	} else { +		leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC); +		if (!leaf) +			return -ENOMEM; +		rb_init_node(&leaf->rb_node); +		INIT_LIST_HEAD(&leaf->msg_list); +		info->qsize += sizeof(*leaf); +	} +	leaf->priority = msg->m_type; +	rb_link_node(&leaf->rb_node, parent, p); +	rb_insert_color(&leaf->rb_node, &info->msg_tree); +insert_msg: +	info->attr.mq_curmsgs++; +	info->qsize += msg->m_ts; +	list_add_tail(&msg->m_list, &leaf->msg_list); +	return 0; +} + +static inline struct msg_msg *msg_get(struct mqueue_inode_info *info) +{ +	struct rb_node **p, *parent = NULL; +	struct posix_msg_tree_node *leaf; +	struct msg_msg *msg; + +try_again: +	p = &info->msg_tree.rb_node; +	while (*p) { +		parent = *p; +		/* +		 * During insert, low priorities go to the left and high to the +		 * right.  On receive, we want the highest priorities first, so +		 * walk all the way to the right. +		 */ +		p = &(*p)->rb_right; +	} +	if (!parent) { +		if (info->attr.mq_curmsgs) { +			pr_warn_once("Inconsistency in POSIX message queue, " +				     "no tree element, but supposedly messages " +				     "should exist!\n"); +			info->attr.mq_curmsgs = 0; +		} +		return NULL; +	} +	leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node); +	if (unlikely(list_empty(&leaf->msg_list))) { +		pr_warn_once("Inconsistency in POSIX message queue, " +			     "empty leaf node but we haven't implemented " +			     "lazy leaf delete!\n"); +		rb_erase(&leaf->rb_node, &info->msg_tree); +		if (info->node_cache) { +			info->qsize -= sizeof(*leaf); +			kfree(leaf); +		} else { +			info->node_cache = leaf; +		} +		goto try_again; +	} else { +		msg = list_first_entry(&leaf->msg_list, +				       struct msg_msg, m_list); +		list_del(&msg->m_list); +		if (list_empty(&leaf->msg_list)) { +			rb_erase(&leaf->rb_node, &info->msg_tree); +			if (info->node_cache) { +				info->qsize -= sizeof(*leaf); +				kfree(leaf); +			} else { +				info->node_cache = leaf; +			} +		} +	} +	info->attr.mq_curmsgs--; +	info->qsize -= msg->m_ts; +	return msg; +} +  static struct inode *mqueue_get_inode(struct super_block *sb,  		struct ipc_namespace *ipc_ns, umode_t mode,  		struct mq_attr *attr) @@ -129,7 +234,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,  	if (S_ISREG(mode)) {  		struct mqueue_inode_info *info; -		unsigned long mq_bytes, mq_msg_tblsz; +		unsigned long mq_bytes, mq_treesize;  		inode->i_fop = &mqueue_file_operations;  		inode->i_size = FILENT_SIZE; @@ -143,20 +248,36 @@ static struct inode *mqueue_get_inode(struct super_block *sb,  		info->notify_user_ns = NULL;  		info->qsize = 0;  		info->user = NULL;	/* set when all is ok */ +		info->msg_tree = RB_ROOT; +		info->node_cache = NULL;  		memset(&info->attr, 0, sizeof(info->attr)); -		info->attr.mq_maxmsg = ipc_ns->mq_msg_max; -		info->attr.mq_msgsize = ipc_ns->mq_msgsize_max; +		info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max, +					   ipc_ns->mq_msg_default); +		info->attr.mq_msgsize = min(ipc_ns->mq_msgsize_max, +					    ipc_ns->mq_msgsize_default);  		if (attr) {  			info->attr.mq_maxmsg = attr->mq_maxmsg;  			info->attr.mq_msgsize = attr->mq_msgsize;  		} -		mq_msg_tblsz = info->attr.mq_maxmsg * sizeof(struct msg_msg *); -		info->messages = kmalloc(mq_msg_tblsz, GFP_KERNEL); -		if (!info->messages) -			goto out_inode; +		/* +		 * We used to allocate a static array of pointers and account +		 * the size of that array as well as one msg_msg struct per +		 * possible message into the queue size. That's no longer +		 * accurate as the queue is now an rbtree and will grow and +		 * shrink depending on usage patterns.  We can, however, still +		 * account one msg_msg struct per message, but the nodes are +		 * allocated depending on priority usage, and most programs +		 * only use one, or a handful, of priorities.  However, since +		 * this is pinned memory, we need to assume worst case, so +		 * that means the min(mq_maxmsg, max_priorities) * struct +		 * posix_msg_tree_node. +		 */ +		mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) + +			min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) * +			sizeof(struct posix_msg_tree_node); -		mq_bytes = (mq_msg_tblsz + -			(info->attr.mq_maxmsg * info->attr.mq_msgsize)); +		mq_bytes = mq_treesize + (info->attr.mq_maxmsg * +					  info->attr.mq_msgsize);  		spin_lock(&mq_lock);  		if (u->mq_bytes + mq_bytes < u->mq_bytes || @@ -247,9 +368,9 @@ static void mqueue_evict_inode(struct inode *inode)  {  	struct mqueue_inode_info *info;  	struct user_struct *user; -	unsigned long mq_bytes; -	int i; +	unsigned long mq_bytes, mq_treesize;  	struct ipc_namespace *ipc_ns; +	struct msg_msg *msg;  	clear_inode(inode); @@ -259,14 +380,19 @@ static void mqueue_evict_inode(struct inode *inode)  	ipc_ns = get_ns_from_inode(inode);  	info = MQUEUE_I(inode);  	spin_lock(&info->lock); -	for (i = 0; i < info->attr.mq_curmsgs; i++) -		free_msg(info->messages[i]); -	kfree(info->messages); +	while ((msg = msg_get(info)) != NULL) +		free_msg(msg); +	kfree(info->node_cache);  	spin_unlock(&info->lock);  	/* Total amount of bytes accounted for the mqueue */ -	mq_bytes = info->attr.mq_maxmsg * (sizeof(struct msg_msg *) -	    + info->attr.mq_msgsize); +	mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) + +		min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) * +		sizeof(struct posix_msg_tree_node); + +	mq_bytes = mq_treesize + (info->attr.mq_maxmsg * +				  info->attr.mq_msgsize); +  	user = info->user;  	if (user) {  		spin_lock(&mq_lock); @@ -300,8 +426,9 @@ static int mqueue_create(struct inode *dir, struct dentry *dentry,  		error = -EACCES;  		goto out_unlock;  	} -	if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max && -			!capable(CAP_SYS_RESOURCE)) { +	if (ipc_ns->mq_queues_count >= HARD_QUEUESMAX || +	    (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max && +	     !capable(CAP_SYS_RESOURCE))) {  		error = -ENOSPC;  		goto out_unlock;  	} @@ -485,26 +612,6 @@ static struct ext_wait_queue *wq_get_first_waiter(  	return list_entry(ptr, struct ext_wait_queue, list);  } -/* Auxiliary functions to manipulate messages' list */ -static void msg_insert(struct msg_msg *ptr, struct mqueue_inode_info *info) -{ -	int k; - -	k = info->attr.mq_curmsgs - 1; -	while (k >= 0 && info->messages[k]->m_type >= ptr->m_type) { -		info->messages[k + 1] = info->messages[k]; -		k--; -	} -	info->attr.mq_curmsgs++; -	info->qsize += ptr->m_ts; -	info->messages[k + 1] = ptr; -} - -static inline struct msg_msg *msg_get(struct mqueue_inode_info *info) -{ -	info->qsize -= info->messages[--info->attr.mq_curmsgs]->m_ts; -	return info->messages[info->attr.mq_curmsgs]; -}  static inline void set_cookie(struct sk_buff *skb, char code)  { @@ -585,24 +692,30 @@ static void remove_notification(struct mqueue_inode_info *info)  static int mq_attr_ok(struct ipc_namespace *ipc_ns, struct mq_attr *attr)  { +	int mq_treesize; +	unsigned long total_size; +  	if (attr->mq_maxmsg <= 0 || attr->mq_msgsize <= 0) -		return 0; +		return -EINVAL;  	if (capable(CAP_SYS_RESOURCE)) { -		if (attr->mq_maxmsg > HARD_MSGMAX) -			return 0; +		if (attr->mq_maxmsg > HARD_MSGMAX || +		    attr->mq_msgsize > HARD_MSGSIZEMAX) +			return -EINVAL;  	} else {  		if (attr->mq_maxmsg > ipc_ns->mq_msg_max ||  				attr->mq_msgsize > ipc_ns->mq_msgsize_max) -			return 0; +			return -EINVAL;  	}  	/* check for overflow */  	if (attr->mq_msgsize > ULONG_MAX/attr->mq_maxmsg) -		return 0; -	if ((unsigned long)(attr->mq_maxmsg * (attr->mq_msgsize -	    + sizeof (struct msg_msg *))) < -	    (unsigned long)(attr->mq_maxmsg * attr->mq_msgsize)) -		return 0; -	return 1; +		return -EOVERFLOW; +	mq_treesize = attr->mq_maxmsg * sizeof(struct msg_msg) + +		min_t(unsigned int, attr->mq_maxmsg, MQ_PRIO_MAX) * +		sizeof(struct posix_msg_tree_node); +	total_size = attr->mq_maxmsg * attr->mq_msgsize; +	if (total_size + mq_treesize < total_size) +		return -EOVERFLOW; +	return 0;  }  /* @@ -617,12 +730,21 @@ static struct file *do_create(struct ipc_namespace *ipc_ns, struct dentry *dir,  	int ret;  	if (attr) { -		if (!mq_attr_ok(ipc_ns, attr)) { -			ret = -EINVAL; +		ret = mq_attr_ok(ipc_ns, attr); +		if (ret)  			goto out; -		}  		/* store for use during create */  		dentry->d_fsdata = attr; +	} else { +		struct mq_attr def_attr; + +		def_attr.mq_maxmsg = min(ipc_ns->mq_msg_max, +					 ipc_ns->mq_msg_default); +		def_attr.mq_msgsize = min(ipc_ns->mq_msgsize_max, +					  ipc_ns->mq_msgsize_default); +		ret = mq_attr_ok(ipc_ns, &def_attr); +		if (ret) +			goto out;  	}  	mode &= ~current_umask(); @@ -837,7 +959,8 @@ static inline void pipelined_receive(struct mqueue_inode_info *info)  		wake_up_interruptible(&info->wait_q);  		return;  	} -	msg_insert(sender->msg, info); +	if (msg_insert(sender->msg, info)) +		return;  	list_del(&sender->list);  	sender->state = STATE_PENDING;  	wake_up_process(sender->task); @@ -857,7 +980,8 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,  	struct mqueue_inode_info *info;  	ktime_t expires, *timeout = NULL;  	struct timespec ts; -	int ret; +	struct posix_msg_tree_node *new_leaf = NULL; +	int ret = 0;  	if (u_abs_timeout) {  		int res = prepare_timeout(u_abs_timeout, &expires, &ts); @@ -905,34 +1029,60 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,  	msg_ptr->m_ts = msg_len;  	msg_ptr->m_type = msg_prio; +	/* +	 * msg_insert really wants us to have a valid, spare node struct so +	 * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will +	 * fall back to that if necessary. +	 */ +	if (!info->node_cache) +		new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL); +  	spin_lock(&info->lock); +	if (!info->node_cache && new_leaf) { +		/* Save our speculative allocation into the cache */ +		rb_init_node(&new_leaf->rb_node); +		INIT_LIST_HEAD(&new_leaf->msg_list); +		info->node_cache = new_leaf; +		info->qsize += sizeof(*new_leaf); +		new_leaf = NULL; +	} else { +		kfree(new_leaf); +	} +  	if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {  		if (filp->f_flags & O_NONBLOCK) { -			spin_unlock(&info->lock);  			ret = -EAGAIN;  		} else {  			wait.task = current;  			wait.msg = (void *) msg_ptr;  			wait.state = STATE_NONE;  			ret = wq_sleep(info, SEND, timeout, &wait); +			/* +			 * wq_sleep must be called with info->lock held, and +			 * returns with the lock released +			 */ +			goto out_free;  		} -		if (ret < 0) -			free_msg(msg_ptr);  	} else {  		receiver = wq_get_first_waiter(info, RECV);  		if (receiver) {  			pipelined_send(info, msg_ptr, receiver);  		} else {  			/* adds message to the queue */ -			msg_insert(msg_ptr, info); +			ret = msg_insert(msg_ptr, info); +			if (ret) +				goto out_unlock;  			__do_notify(info);  		}  		inode->i_atime = inode->i_mtime = inode->i_ctime =  				CURRENT_TIME; -		spin_unlock(&info->lock); -		ret = 0;  	} +out_unlock: +	spin_unlock(&info->lock); +out_free: +	if (ret) +		free_msg(msg_ptr);  out_fput:  	fput(filp);  out: @@ -951,6 +1101,7 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,  	struct ext_wait_queue wait;  	ktime_t expires, *timeout = NULL;  	struct timespec ts; +	struct posix_msg_tree_node *new_leaf = NULL;  	if (u_abs_timeout) {  		int res = prepare_timeout(u_abs_timeout, &expires, &ts); @@ -986,7 +1137,26 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,  		goto out_fput;  	} +	/* +	 * msg_insert really wants us to have a valid, spare node struct so +	 * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will +	 * fall back to that if necessary. +	 */ +	if (!info->node_cache) +		new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL); +  	spin_lock(&info->lock); + +	if (!info->node_cache && new_leaf) { +		/* Save our speculative allocation into the cache */ +		rb_init_node(&new_leaf->rb_node); +		INIT_LIST_HEAD(&new_leaf->msg_list); +		info->node_cache = new_leaf; +		info->qsize += sizeof(*new_leaf); +	} else { +		kfree(new_leaf); +	} +  	if (info->attr.mq_curmsgs == 0) {  		if (filp->f_flags & O_NONBLOCK) {  			spin_unlock(&info->lock); @@ -1251,6 +1421,8 @@ int mq_init_ns(struct ipc_namespace *ns)  	ns->mq_queues_max    = DFLT_QUEUESMAX;  	ns->mq_msg_max       = DFLT_MSGMAX;  	ns->mq_msgsize_max   = DFLT_MSGSIZEMAX; +	ns->mq_msg_default   = DFLT_MSG; +	ns->mq_msgsize_default  = DFLT_MSGSIZE;  	ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns);  	if (IS_ERR(ns->mq_mnt)) {  |