-rw-r--r--  include/net/af_unix.h |   3
-rw-r--r--  net/unix/af_unix.c    |   6
-rw-r--r--  net/unix/garbage.c    | 323
3 files changed, 189 insertions, 143 deletions
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 65f49fd7def..6de1e9e35c7 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -79,9 +79,10 @@ struct unix_sock {
 	struct mutex		readlock;
 	struct sock		*peer;
 	struct sock		*other;
-	struct sock		*gc_tree;
+	struct list_head	link;
 	atomic_t		inflight;
 	spinlock_t		lock;
+	unsigned int		gc_candidate : 1;
 	wait_queue_head_t	peer_wait;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3654b647ac7..65ebccc0a69 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -592,7 +592,8 @@ static struct sock * unix_create1(struct socket *sock)
 	u->dentry = NULL;
 	u->mnt	  = NULL;
 	spin_lock_init(&u->lock);
-	atomic_set(&u->inflight, sock ? 0 : -1);
+	atomic_set(&u->inflight, 0);
+	INIT_LIST_HEAD(&u->link);
 	mutex_init(&u->readlock); /* single task reading lock */
 	init_waitqueue_head(&u->peer_wait);
 	unix_insert_socket(unix_sockets_unbound, sk);
@@ -1134,9 +1135,6 @@ restart:
 	/* take ten and and send info to listening sock */
 	spin_lock(&other->sk_receive_queue.lock);
 	__skb_queue_tail(&other->sk_receive_queue, skb);
-	/* Undo artificially decreased inflight after embrion
-	 * is installed to listening socket. */
-	atomic_inc(&newu->inflight);
 	spin_unlock(&other->sk_receive_queue.lock);
 	unix_state_unlock(other);
 	other->sk_data_ready(other, 0);
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index f20b7ea7c55..406b6433e46 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -62,6 +62,10 @@
  *	AV		1 Mar 1999
  *		Damn. Added missing check for ->dead in listen queues scanning.
  *
+ *	Miklos Szeredi 25 Jun 2007
+ *		Reimplement with a cycle collecting algorithm. This should
+ *		solve several problems with the previous code, like being racy
+ *		wrt receive and holding up unrelated socket operations.
  */
 
 #include <linux/kernel.h>
@@ -84,10 +88,9 @@
 
 /* Internal data structures and random procedures: */
 
-#define GC_HEAD		((struct sock *)(-1))
-#define GC_ORPHAN	((struct sock *)(-3))
-
-static struct sock *gc_current = GC_HEAD; /* stack of objects to mark */
+static LIST_HEAD(gc_inflight_list);
+static LIST_HEAD(gc_candidates);
+static DEFINE_SPINLOCK(unix_gc_lock);
 
 atomic_t unix_tot_inflight = ATOMIC_INIT(0);
 
@@ -122,8 +125,16 @@ void unix_inflight(struct file *fp)
 {
 	struct sock *s = unix_get_socket(fp);
 	if(s) {
-		atomic_inc(&unix_sk(s)->inflight);
+		struct unix_sock *u = unix_sk(s);
+		spin_lock(&unix_gc_lock);
+		if (atomic_inc_return(&u->inflight) == 1) {
+			BUG_ON(!list_empty(&u->link));
+			list_add_tail(&u->link, &gc_inflight_list);
+		} else {
+			BUG_ON(list_empty(&u->link));
+		}
 		atomic_inc(&unix_tot_inflight);
+		spin_unlock(&unix_gc_lock);
 	}
 }
 
@@ -131,182 +142,218 @@ void unix_notinflight(struct file *fp)
 {
 	struct sock *s = unix_get_socket(fp);
 	if(s) {
-		atomic_dec(&unix_sk(s)->inflight);
+		struct unix_sock *u = unix_sk(s);
+		spin_lock(&unix_gc_lock);
+		BUG_ON(list_empty(&u->link));
+		if (atomic_dec_and_test(&u->inflight))
+			list_del_init(&u->link);
 		atomic_dec(&unix_tot_inflight);
+		spin_unlock(&unix_gc_lock);
 	}
 }
 
-
-/*
- *	Garbage Collector Support Functions
- */
-
-static inline struct sock *pop_stack(void)
+static inline struct sk_buff *sock_queue_head(struct sock *sk)
 {
-	struct sock *p = gc_current;
-	gc_current = unix_sk(p)->gc_tree;
-	return p;
+	return (struct sk_buff *) &sk->sk_receive_queue;
 }
 
-static inline int empty_stack(void)
+#define receive_queue_for_each_skb(sk, next, skb) \
+	for (skb = sock_queue_head(sk)->next, next = skb->next; \
+	     skb != sock_queue_head(sk); skb = next, next = skb->next)
+
+static void scan_inflight(struct sock *x, void (*func)(struct sock *),
+			  struct sk_buff_head *hitlist)
 {
-	return gc_current == GC_HEAD;
+	struct sk_buff *skb;
+	struct sk_buff *next;
+
+	spin_lock(&x->sk_receive_queue.lock);
+	receive_queue_for_each_skb(x, next, skb) {
+		/*
+		 *	Do we have file descriptors ?
+		 */
+		if (UNIXCB(skb).fp) {
+			bool hit = false;
+			/*
+			 *	Process the descriptors of this socket
+			 */
+			int nfd = UNIXCB(skb).fp->count;
+			struct file **fp = UNIXCB(skb).fp->fp;
+			while (nfd--) {
+				/*
+				 *	Get the socket the fd matches
+				 *	if it indeed does so
+				 */
+				struct sock *sk = unix_get_socket(*fp++);
+				if(sk) {
+					hit = true;
+					func(sk);
+				}
+			}
+			if (hit && hitlist != NULL) {
+				__skb_unlink(skb, &x->sk_receive_queue);
+				__skb_queue_tail(hitlist, skb);
+			}
+		}
+	}
+	spin_unlock(&x->sk_receive_queue.lock);
 }
 
-static void maybe_unmark_and_push(struct sock *x)
+static void scan_children(struct sock *x, void (*func)(struct sock *),
+			  struct sk_buff_head *hitlist)
 {
-	struct unix_sock *u = unix_sk(x);
+	if (x->sk_state != TCP_LISTEN)
+		scan_inflight(x, func, hitlist);
+	else {
+		struct sk_buff *skb;
+		struct sk_buff *next;
+		struct unix_sock *u;
+		LIST_HEAD(embryos);
+
+		/*
+		 * For a listening socket collect the queued embryos
+		 * and perform a scan on them as well.
+		 */
+		spin_lock(&x->sk_receive_queue.lock);
+		receive_queue_for_each_skb(x, next, skb) {
+			u = unix_sk(skb->sk);
+
+			/*
+			 * An embryo cannot be in-flight, so it's safe
+			 * to use the list link.
+			 */
+			BUG_ON(!list_empty(&u->link));
+			list_add_tail(&u->link, &embryos);
+		}
+		spin_unlock(&x->sk_receive_queue.lock);
 
-	if (u->gc_tree != GC_ORPHAN)
-		return;
-	sock_hold(x);
-	u->gc_tree = gc_current;
-	gc_current = x;
+		while (!list_empty(&embryos)) {
+			u = list_entry(embryos.next, struct unix_sock, link);
+			scan_inflight(&u->sk, func, hitlist);
+			list_del_init(&u->link);
+		}
+	}
 }
 
+static void dec_inflight(struct sock *sk)
+{
+	atomic_dec(&unix_sk(sk)->inflight);
+}
 
-/* The external entry point: unix_gc() */
+static void inc_inflight(struct sock *sk)
+{
+	atomic_inc(&unix_sk(sk)->inflight);
+}
 
-void unix_gc(void)
+static void inc_inflight_move_tail(struct sock *sk)
 {
-	static DEFINE_MUTEX(unix_gc_sem);
-	int i;
-	struct sock *s;
-	struct sk_buff_head hitlist;
-	struct sk_buff *skb;
+	struct unix_sock *u = unix_sk(sk);
+	atomic_inc(&u->inflight);
 	/*
-	 *	Avoid a recursive GC.
+	 * If this is still a candidate, move it to the end of the
+	 * list, so that it's checked even if it was already passed
+	 * over
 	 */
+	if (u->gc_candidate)
+		list_move_tail(&u->link, &gc_candidates);
+}
 
-	if (!mutex_trylock(&unix_gc_sem))
-		return;
+/* The external entry point: unix_gc() */
 
-	spin_lock(&unix_table_lock);
+void unix_gc(void)
+{
+	static bool gc_in_progress = false;
 
-	forall_unix_sockets(i, s)
-	{
-		unix_sk(s)->gc_tree = GC_ORPHAN;
-	}
-	/*
-	 *	Everything is now marked
-	 */
+	struct unix_sock *u;
+	struct unix_sock *next;
+	struct sk_buff_head hitlist;
+	struct list_head cursor;
 
-	/* Invariant to be maintained:
-		- everything unmarked is either:
-		-- (a) on the stack, or
-		-- (b) has all of its children unmarked
-		- everything on the stack is always unmarked
-		- nothing is ever pushed onto the stack twice, because:
-		-- nothing previously unmarked is ever pushed on the stack
-	 */
+	spin_lock(&unix_gc_lock);
+	/* Avoid a recursive GC. */
+	if (gc_in_progress)
+		goto out;
+
+	gc_in_progress = true;
 	/*
-	 *	Push root set
+	 * First, select candidates for garbage collection.  Only
+	 * in-flight sockets are considered, and from those only ones
+	 * which don't have any external reference.
+	 *
+	 * Holding unix_gc_lock will protect these candidates from
+	 * being detached, and hence from gaining an external
+	 * reference.  This also means, that since there are no
+	 * possible receivers, the receive queues of these sockets are
+	 * static during the GC, even though the dequeue is done
+	 * before the detach without atomicity guarantees.
 	 */
+	list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
+		int total_refs;
+		int inflight_refs;
 
-	forall_unix_sockets(i, s)
-	{
-		int open_count = 0;
+		total_refs = file_count(u->sk.sk_socket->file);
+		inflight_refs = atomic_read(&u->inflight);
 
-		/*
-		 *	If all instances of the descriptor are not
-		 *	in flight we are in use.
-		 *
-		 *	Special case: when socket s is embrion, it may be
-		 *	hashed but still not in queue of listening socket.
-		 *	In this case (see unix_create1()) we set artificial
-		 *	negative inflight counter to close race window.
-		 *	It is trick of course and dirty one.
-		 */
-		if (s->sk_socket && s->sk_socket->file)
-			open_count = file_count(s->sk_socket->file);
-		if (open_count > atomic_read(&unix_sk(s)->inflight))
-			maybe_unmark_and_push(s);
+		BUG_ON(inflight_refs < 1);
+		BUG_ON(total_refs < inflight_refs);
+		if (total_refs == inflight_refs) {
+			list_move_tail(&u->link, &gc_candidates);
+			u->gc_candidate = 1;
+		}
 	}
 
 	/*
-	 *	Mark phase
+	 * Now remove all internal in-flight reference to children of
+	 * the candidates.
 	 */
+	list_for_each_entry(u, &gc_candidates, link)
+		scan_children(&u->sk, dec_inflight, NULL);
 
-	while (!empty_stack())
-	{
-		struct sock *x = pop_stack();
-		struct sock *sk;
-
-		spin_lock(&x->sk_receive_queue.lock);
-		skb = skb_peek(&x->sk_receive_queue);
+	/*
+	 * Restore the references for children of all candidates,
+	 * which have remaining references.  Do this recursively, so
+	 * only those remain, which form cyclic references.
+	 *
+	 * Use a "cursor" link, to make the list traversal safe, even
+	 * though elements might be moved about.
	 */
+	list_add(&cursor, &gc_candidates);
+	while (cursor.next != &gc_candidates) {
+		u = list_entry(cursor.next, struct unix_sock, link);
 
-		/*
-		 *	Loop through all but first born
-		 */
+		/* Move cursor to after the current position. */
+		list_move(&cursor, &u->link);
 
-		while (skb && skb != (struct sk_buff *)&x->sk_receive_queue) {
-			/*
-			 *	Do we have file descriptors ?
-			 */
-			if(UNIXCB(skb).fp)
-			{
-				/*
-				 *	Process the descriptors of this socket
-				 */
-				int nfd=UNIXCB(skb).fp->count;
-				struct file **fp = UNIXCB(skb).fp->fp;
-				while(nfd--)
-				{
-					/*
-					 *	Get the socket the fd matches if
-					 *	it indeed does so
-					 */
-					if((sk=unix_get_socket(*fp++))!=NULL)
-					{
-						maybe_unmark_and_push(sk);
-					}
-				}
-			}
-			/* We have to scan not-yet-accepted ones too */
-			if (x->sk_state == TCP_LISTEN)
-				maybe_unmark_and_push(skb->sk);
-			skb=skb->next;
+		if (atomic_read(&u->inflight) > 0) {
+			list_move_tail(&u->link, &gc_inflight_list);
+			u->gc_candidate = 0;
+			scan_children(&u->sk, inc_inflight_move_tail, NULL);
 		}
-		spin_unlock(&x->sk_receive_queue.lock);
-		sock_put(x);
 	}
+	list_del(&cursor);
 
+	/*
+	 * Now gc_candidates contains only garbage.  Restore original
+	 * inflight counters for these as well, and remove the skbuffs
+	 * which are creating the cycle(s).
+	 */
 	skb_queue_head_init(&hitlist);
+	list_for_each_entry(u, &gc_candidates, link)
+		scan_children(&u->sk, inc_inflight, &hitlist);
 
-	forall_unix_sockets(i, s)
-	{
-		struct unix_sock *u = unix_sk(s);
+	spin_unlock(&unix_gc_lock);
 
-		if (u->gc_tree == GC_ORPHAN) {
-			struct sk_buff *nextsk;
+	/* Here we are. Hitlist is filled. Die. */
+	__skb_queue_purge(&hitlist);
 
-			spin_lock(&s->sk_receive_queue.lock);
-			skb = skb_peek(&s->sk_receive_queue);
-			while (skb &&
-			       skb != (struct sk_buff *)&s->sk_receive_queue) {
-				nextsk = skb->next;
-				/*
-				 *	Do we have file descriptors ?
-				 */
-				if (UNIXCB(skb).fp) {
-					__skb_unlink(skb,
-						     &s->sk_receive_queue);
-					__skb_queue_tail(&hitlist, skb);
-				}
-				skb = nextsk;
-			}
-			spin_unlock(&s->sk_receive_queue.lock);
-		}
-		u->gc_tree = GC_ORPHAN;
-	}
-	spin_unlock(&unix_table_lock);
+	spin_lock(&unix_gc_lock);
 
-	/*
-	 *	Here we are. Hitlist is filled. Die.
-	 */
+	/* All candidates should have been detached by now. */
+	BUG_ON(!list_empty(&gc_candidates));
+	gc_in_progress = false;
 
-	__skb_queue_purge(&hitlist);
-	mutex_unlock(&unix_gc_sem);
+ out:
+	spin_unlock(&unix_gc_lock);
 }
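
The new unix_gc() above works in three passes: select candidates whose only remaining references are in-flight ones (total_refs == inflight_refs), subtract the in-flight references those candidates hold on each other, then restore the counts of anything still referenced from outside the candidate set; whatever stays marked forms unreachable cycles and its queued skbs go on the hitlist. The stand-alone C sketch below models that idea on a tiny hand-built graph. It is only an illustration under assumed names (node, add_edge, the fixed-size arrays are all made up for this example), not kernel code, and it replaces the kernel's cursor-based list walk with a simple repeat-until-stable loop.

/*
 * Minimal user-space sketch of the cycle-collecting idea used by the
 * rewritten unix_gc().  Sockets are modeled as nodes; an edge from A to B
 * means an fd referring to B sits in A's receive queue (B is "in flight").
 */
#include <stdio.h>
#include <stdbool.h>

#define MAX_NODES 4
#define MAX_EDGES 4

struct node {
	int total_refs;			/* all references: fd table + in-flight */
	int inflight;			/* references held inside receive queues */
	int nr_edges;
	int edges[MAX_EDGES];		/* sockets whose fds sit in our queue */
	bool candidate;
};

static struct node nodes[MAX_NODES];

static void add_edge(int from, int to)
{
	nodes[from].edges[nodes[from].nr_edges++] = to;
	nodes[to].inflight++;
	nodes[to].total_refs++;
}

int main(void)
{
	int i, j;

	/* Socket 0 is held open by a process (one external reference). */
	nodes[0].total_refs = 1;

	/* Sockets 1 and 2 reference each other only through their
	 * receive queues: an unreachable cycle. */
	add_edge(1, 2);
	add_edge(2, 1);

	/* Socket 0's queue carries an fd for socket 3, which is also
	 * open in a process, so 3 keeps an external reference. */
	add_edge(0, 3);
	nodes[3].total_refs++;

	/* Pass 1: candidates are sockets whose only references are in-flight. */
	for (i = 0; i < MAX_NODES; i++)
		if (nodes[i].inflight > 0 &&
		    nodes[i].total_refs == nodes[i].inflight)
			nodes[i].candidate = true;

	/* Pass 2: forget in-flight references that originate from candidates. */
	for (i = 0; i < MAX_NODES; i++)
		if (nodes[i].candidate)
			for (j = 0; j < nodes[i].nr_edges; j++)
				nodes[nodes[i].edges[j]].inflight--;

	/* Pass 3: any candidate still referenced from outside the candidate
	 * set is reachable; un-mark it and restore its outgoing references.
	 * Repeat until stable (the kernel walks the list with a cursor). */
	bool changed = true;
	while (changed) {
		changed = false;
		for (i = 0; i < MAX_NODES; i++) {
			if (nodes[i].candidate && nodes[i].inflight > 0) {
				nodes[i].candidate = false;
				for (j = 0; j < nodes[i].nr_edges; j++)
					nodes[nodes[i].edges[j]].inflight++;
				changed = true;
			}
		}
	}

	/* Whatever is still marked forms pure cycles: garbage. */
	for (i = 0; i < MAX_NODES; i++)
		printf("socket %d: %s\n", i,
		       nodes[i].candidate ? "cyclic garbage" : "alive");
	return 0;
}

Built with any C99 compiler (for example cc -std=c99 sketch.c), this reports sockets 1 and 2 as cyclic garbage and 0 and 3 as alive, mirroring which receive queues unix_gc() would purge via its hitlist.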