Diffstat (limited to 'kernel')
-rw-r--r--  kernel/exit.c            |   8
-rw-r--r--  kernel/fork.c            |   3
-rw-r--r--  kernel/futex.c           | 829
-rw-r--r--  kernel/futex_compat.c    |  11
-rw-r--r--  kernel/rtmutex_common.h  |   8
5 files changed, 818 insertions(+), 41 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c index 3e8a0282e9a..ab06b9f88f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -926,6 +926,14 @@ fastcall NORET_TYPE void do_exit(long code)  	tsk->mempolicy = NULL;  #endif  	/* +	 * This must happen late, after the PID is not +	 * hashed anymore: +	 */ +	if (unlikely(!list_empty(&tsk->pi_state_list))) +		exit_pi_state_list(tsk); +	if (unlikely(current->pi_state_cache)) +		kfree(current->pi_state_cache); +	/*  	 * If DEBUG_MUTEXES is on, make sure we are holding no locks:  	 */  	mutex_debug_check_no_locks_held(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index b664a081fff..628198a4f28 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1092,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags,  #ifdef CONFIG_COMPAT  	p->compat_robust_list = NULL;  #endif +	INIT_LIST_HEAD(&p->pi_state_list); +	p->pi_state_cache = NULL; +  	/*  	 * sigaltstack should be cleared when sharing the same VM  	 */ diff --git a/kernel/futex.c b/kernel/futex.c index 50356fb5d72..b305b7f8dad 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -12,6 +12,10 @@   *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved   *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.   * + *  PI-futex support started by Ingo Molnar and Thomas Gleixner + *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + *   *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly   *  enough at me, Linus for the original (flawed) idea, Matthew   *  Kirkwood for proof-of-concept implementation. @@ -46,6 +50,8 @@  #include <linux/signal.h>  #include <asm/futex.h> +#include "rtmutex_common.h" +  #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)  /* @@ -75,6 +81,27 @@ union futex_key {  };  /* + * Priority Inheritance state: + */ +struct futex_pi_state { +	/* +	 * list of 'owned' pi_state instances - these have to be +	 * cleaned up in do_exit() if the task exits prematurely: +	 */ +	struct list_head list; + +	/* +	 * The PI object: +	 */ +	struct rt_mutex pi_mutex; + +	struct task_struct *owner; +	atomic_t refcount; + +	union futex_key key; +}; + +/*   * We use this hashed waitqueue instead of a normal wait_queue_t, so   * we can wake only the relevant ones (hashed queues may be shared).   * @@ -96,6 +123,10 @@ struct futex_q {  	/* For fd, sigio sent using these: */  	int fd;  	struct file *filp; + +	/* Optional priority inheritance state: */ +	struct futex_pi_state *pi_state; +	struct task_struct *task;  };  /* @@ -259,6 +290,232 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)  }  /* + * Fault handling. Called with current->mm->mmap_sem held. 
+ */ +static int futex_handle_fault(unsigned long address, int attempt) +{ +	struct vm_area_struct * vma; +	struct mm_struct *mm = current->mm; + +	if (attempt >= 2 || !(vma = find_vma(mm, address)) || +	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) +		return -EFAULT; + +	switch (handle_mm_fault(mm, vma, address, 1)) { +	case VM_FAULT_MINOR: +		current->min_flt++; +		break; +	case VM_FAULT_MAJOR: +		current->maj_flt++; +		break; +	default: +		return -EFAULT; +	} +	return 0; +} + +/* + * PI code: + */ +static int refill_pi_state_cache(void) +{ +	struct futex_pi_state *pi_state; + +	if (likely(current->pi_state_cache)) +		return 0; + +	pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); + +	if (!pi_state) +		return -ENOMEM; + +	memset(pi_state, 0, sizeof(*pi_state)); +	INIT_LIST_HEAD(&pi_state->list); +	/* pi_mutex gets initialized later */ +	pi_state->owner = NULL; +	atomic_set(&pi_state->refcount, 1); + +	current->pi_state_cache = pi_state; + +	return 0; +} + +static struct futex_pi_state * alloc_pi_state(void) +{ +	struct futex_pi_state *pi_state = current->pi_state_cache; + +	WARN_ON(!pi_state); +	current->pi_state_cache = NULL; + +	return pi_state; +} + +static void free_pi_state(struct futex_pi_state *pi_state) +{ +	if (!atomic_dec_and_test(&pi_state->refcount)) +		return; + +	/* +	 * If pi_state->owner is NULL, the owner is most probably dying +	 * and has cleaned up the pi_state already +	 */ +	if (pi_state->owner) { +		spin_lock_irq(&pi_state->owner->pi_lock); +		list_del_init(&pi_state->list); +		spin_unlock_irq(&pi_state->owner->pi_lock); + +		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); +	} + +	if (current->pi_state_cache) +		kfree(pi_state); +	else { +		/* +		 * pi_state->list is already empty. +		 * clear pi_state->owner. +		 * refcount is at 0 - put it back to 1. +		 */ +		pi_state->owner = NULL; +		atomic_set(&pi_state->refcount, 1); +		current->pi_state_cache = pi_state; +	} +} + +/* + * Look up the task based on what TID userspace gave us. + * We dont trust it. + */ +static struct task_struct * futex_find_get_task(pid_t pid) +{ +	struct task_struct *p; + +	read_lock(&tasklist_lock); +	p = find_task_by_pid(pid); +	if (!p) +		goto out_unlock; +	if ((current->euid != p->euid) && (current->euid != p->uid)) { +		p = NULL; +		goto out_unlock; +	} +	if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) { +		p = NULL; +		goto out_unlock; +	} +	get_task_struct(p); +out_unlock: +	read_unlock(&tasklist_lock); + +	return p; +} + +/* + * This task is holding PI mutexes at exit time => bad. + * Kernel cleans up PI-state, but userspace is likely hosed. + * (Robust-futex cleanup is separate and might save the day for userspace.) 
+ */ +void exit_pi_state_list(struct task_struct *curr) +{ +	struct futex_hash_bucket *hb; +	struct list_head *next, *head = &curr->pi_state_list; +	struct futex_pi_state *pi_state; +	union futex_key key; + +	/* +	 * We are a ZOMBIE and nobody can enqueue itself on +	 * pi_state_list anymore, but we have to be careful +	 * versus waiters unqueueing themselfs +	 */ +	spin_lock_irq(&curr->pi_lock); +	while (!list_empty(head)) { + +		next = head->next; +		pi_state = list_entry(next, struct futex_pi_state, list); +		key = pi_state->key; +		spin_unlock_irq(&curr->pi_lock); + +		hb = hash_futex(&key); +		spin_lock(&hb->lock); + +		spin_lock_irq(&curr->pi_lock); +		if (head->next != next) { +			spin_unlock(&hb->lock); +			continue; +		} + +		list_del_init(&pi_state->list); + +		WARN_ON(pi_state->owner != curr); + +		pi_state->owner = NULL; +		spin_unlock_irq(&curr->pi_lock); + +		rt_mutex_unlock(&pi_state->pi_mutex); + +		spin_unlock(&hb->lock); + +		spin_lock_irq(&curr->pi_lock); +	} +	spin_unlock_irq(&curr->pi_lock); +} + +static int +lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) +{ +	struct futex_pi_state *pi_state = NULL; +	struct futex_q *this, *next; +	struct list_head *head; +	struct task_struct *p; +	pid_t pid; + +	head = &hb->chain; + +	list_for_each_entry_safe(this, next, head, list) { +		if (match_futex (&this->key, &me->key)) { +			/* +			 * Another waiter already exists - bump up +			 * the refcount and return its pi_state: +			 */ +			pi_state = this->pi_state; +			atomic_inc(&pi_state->refcount); +			me->pi_state = pi_state; + +			return 0; +		} +	} + +	/* +	 * We are the first waiter - try to look up the real owner and +	 * attach the new pi_state to it: +	 */ +	pid = uval & FUTEX_TID_MASK; +	p = futex_find_get_task(pid); +	if (!p) +		return -ESRCH; + +	pi_state = alloc_pi_state(); + +	/* +	 * Initialize the pi_mutex in locked state and make 'p' +	 * the owner of it: +	 */ +	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); + +	/* Store the key for possible exit cleanups: */ +	pi_state->key = me->key; + +	spin_lock_irq(&p->pi_lock); +	list_add(&pi_state->list, &p->pi_state_list); +	pi_state->owner = p; +	spin_unlock_irq(&p->pi_lock); + +	put_task_struct(p); + +	me->pi_state = pi_state; + +	return 0; +} + +/*   * The hash bucket lock must be held when this is called.   * Afterwards, the futex_q must not be accessed.   */ @@ -285,6 +542,70 @@ static void wake_futex(struct futex_q *q)  	q->lock_ptr = NULL;  } +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) +{ +	struct task_struct *new_owner; +	struct futex_pi_state *pi_state = this->pi_state; +	u32 curval, newval; + +	if (!pi_state) +		return -EINVAL; + +	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); + +	/* +	 * This happens when we have stolen the lock and the original +	 * pending owner did not enqueue itself back on the rt_mutex. +	 * Thats not a tragedy. We know that way, that a lock waiter +	 * is on the fly. We make the futex_q waiter the pending owner. +	 */ +	if (!new_owner) +		new_owner = this->task; + +	/* +	 * We pass it to the next owner. (The WAITERS bit is always +	 * kept enabled while there is PI state around. We must also +	 * preserve the owner died bit.) 
+	 */ +	newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; + +	inc_preempt_count(); +	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); +	dec_preempt_count(); + +	if (curval == -EFAULT) +		return -EFAULT; +	if (curval != uval) +		return -EINVAL; + +	list_del_init(&pi_state->owner->pi_state_list); +	list_add(&pi_state->list, &new_owner->pi_state_list); +	pi_state->owner = new_owner; +	rt_mutex_unlock(&pi_state->pi_mutex); + +	return 0; +} + +static int unlock_futex_pi(u32 __user *uaddr, u32 uval) +{ +	u32 oldval; + +	/* +	 * There is no waiter, so we unlock the futex. The owner died +	 * bit has not to be preserved here. We are the owner: +	 */ +	inc_preempt_count(); +	oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); +	dec_preempt_count(); + +	if (oldval == -EFAULT) +		return oldval; +	if (oldval != uval) +		return -EAGAIN; + +	return 0; +} +  /*   * Wake up all waiters hashed on the physical page that is mapped   * to this virtual address: @@ -309,6 +630,8 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)  	list_for_each_entry_safe(this, next, head, list) {  		if (match_futex (&this->key, &key)) { +			if (this->pi_state) +				return -EINVAL;  			wake_futex(this);  			if (++ret >= nr_wake)  				break; @@ -385,27 +708,9 @@ retry:  		 * still holding the mmap_sem.  		 */  		if (attempt++) { -			struct vm_area_struct * vma; -			struct mm_struct *mm = current->mm; -			unsigned long address = (unsigned long)uaddr2; - -			ret = -EFAULT; -			if (attempt >= 2 || -			    !(vma = find_vma(mm, address)) || -			    vma->vm_start > address || -			    !(vma->vm_flags & VM_WRITE)) +			if (futex_handle_fault((unsigned long)uaddr2, +					       attempt))  				goto out; - -			switch (handle_mm_fault(mm, vma, address, 1)) { -			case VM_FAULT_MINOR: -				current->min_flt++; -				break; -			case VM_FAULT_MAJOR: -				current->maj_flt++; -				break; -			default: -				goto out; -			}  			goto retry;  		} @@ -572,6 +877,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)  static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)  {  	list_add_tail(&q->list, &hb->chain); +	q->task = current;  	spin_unlock(&hb->lock);  } @@ -626,6 +932,9 @@ static int unqueue_me(struct futex_q *q)  		}  		WARN_ON(list_empty(&q->list));  		list_del(&q->list); + +		BUG_ON(q->pi_state); +  		spin_unlock(lock_ptr);  		ret = 1;  	} @@ -634,16 +943,36 @@ static int unqueue_me(struct futex_q *q)  	return ret;  } +/* + * PI futexes can not be requeued and must remove themself from the + * hash bucket. The hash bucket lock is held on entry and dropped here. + */ +static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) +{ +	WARN_ON(list_empty(&q->list)); +	list_del(&q->list); + +	BUG_ON(!q->pi_state); +	free_pi_state(q->pi_state); +	q->pi_state = NULL; + +	spin_unlock(&hb->lock); + +	drop_key_refs(&q->key); +} +  static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)  { -	DECLARE_WAITQUEUE(wait, current); +	struct task_struct *curr = current; +	DECLARE_WAITQUEUE(wait, curr);  	struct futex_hash_bucket *hb;  	struct futex_q q;  	u32 uval;  	int ret; +	q.pi_state = NULL;   retry: -	down_read(¤t->mm->mmap_sem); +	down_read(&curr->mm->mmap_sem);  	ret = get_futex_key(uaddr, &q.key);  	if (unlikely(ret != 0)) @@ -680,7 +1009,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)  		 * If we would have faulted, release mmap_sem, fault it in and  		 * start all over again.  		 
*/ -		up_read(¤t->mm->mmap_sem); +		up_read(&curr->mm->mmap_sem);  		ret = get_user(uval, uaddr); @@ -688,11 +1017,9 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)  			goto retry;  		return ret;  	} -	if (uval != val) { -		ret = -EWOULDBLOCK; -		queue_unlock(&q, hb); -		goto out_release_sem; -	} +	ret = -EWOULDBLOCK; +	if (uval != val) +		goto out_unlock_release_sem;  	/* Only actually queue if *uaddr contained val.  */  	__queue_me(&q, hb); @@ -700,8 +1027,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)  	/*  	 * Now the futex is queued and we have checked the data, we  	 * don't want to hold mmap_sem while we sleep. -	 */	 -	up_read(¤t->mm->mmap_sem); +	 */ +	up_read(&curr->mm->mmap_sem);  	/*  	 * There might have been scheduling since the queue_me(), as we @@ -739,8 +1066,415 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)  	 */  	return -EINTR; + out_unlock_release_sem: +	queue_unlock(&q, hb); +   out_release_sem: +	up_read(&curr->mm->mmap_sem); +	return ret; +} + +/* + * Userspace tried a 0 -> TID atomic transition of the futex value + * and failed. The kernel side here does the whole locking operation: + * if there are waiters then it will block, it does PI, etc. (Due to + * races the kernel might see a 0 value of the futex too.) + */ +static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, +			    struct hrtimer_sleeper *to) +{ +	struct task_struct *curr = current; +	struct futex_hash_bucket *hb; +	u32 uval, newval, curval; +	struct futex_q q; +	int ret, attempt = 0; + +	if (refill_pi_state_cache()) +		return -ENOMEM; + +	q.pi_state = NULL; + retry: +	down_read(&curr->mm->mmap_sem); + +	ret = get_futex_key(uaddr, &q.key); +	if (unlikely(ret != 0)) +		goto out_release_sem; + +	hb = queue_lock(&q, -1, NULL); + + retry_locked: +	/* +	 * To avoid races, we attempt to take the lock here again +	 * (by doing a 0 -> TID atomic cmpxchg), while holding all +	 * the locks. It will most likely not succeed. +	 */ +	newval = current->pid; + +	inc_preempt_count(); +	curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); +	dec_preempt_count(); + +	if (unlikely(curval == -EFAULT)) +		goto uaddr_faulted; + +	/* We own the lock already */ +	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { +		if (!detect && 0) +			force_sig(SIGKILL, current); +		ret = -EDEADLK; +		goto out_unlock_release_sem; +	} + +	/* +	 * Surprise - we got the lock. Just return +	 * to userspace: +	 */ +	if (unlikely(!curval)) +		goto out_unlock_release_sem; + +	uval = curval; +	newval = uval | FUTEX_WAITERS; + +	inc_preempt_count(); +	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); +	dec_preempt_count(); + +	if (unlikely(curval == -EFAULT)) +		goto uaddr_faulted; +	if (unlikely(curval != uval)) +		goto retry_locked; + +	/* +	 * We dont have the lock. Look up the PI state (or create it if +	 * we are the first waiter): +	 */ +	ret = lookup_pi_state(uval, hb, &q); + +	if (unlikely(ret)) { +		/* +		 * There were no waiters and the owner task lookup +		 * failed. When the OWNER_DIED bit is set, then we +		 * know that this is a robust futex and we actually +		 * take the lock. This is safe as we are protected by +		 * the hash bucket lock. We also set the waiters bit +		 * unconditionally here, to simplify glibc handling of +		 * multiple tasks racing to acquire the lock and +		 * cleanup the problems which were left by the dead +		 * owner. 
+		 */ +		if (curval & FUTEX_OWNER_DIED) { +			uval = newval; +			newval = current->pid | +				FUTEX_OWNER_DIED | FUTEX_WAITERS; + +			inc_preempt_count(); +			curval = futex_atomic_cmpxchg_inatomic(uaddr, +							       uval, newval); +			dec_preempt_count(); + +			if (unlikely(curval == -EFAULT)) +				goto uaddr_faulted; +			if (unlikely(curval != uval)) +				goto retry_locked; +			ret = 0; +		} +		goto out_unlock_release_sem; +	} + +	/* +	 * Only actually queue now that the atomic ops are done: +	 */ +	__queue_me(&q, hb); + +	/* +	 * Now the futex is queued and we have checked the data, we +	 * don't want to hold mmap_sem while we sleep. +	 */ +	up_read(&curr->mm->mmap_sem); + +	WARN_ON(!q.pi_state); +	/* +	 * Block on the PI mutex: +	 */ +	if (!trylock) +		ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); +	else { +		ret = rt_mutex_trylock(&q.pi_state->pi_mutex); +		/* Fixup the trylock return value: */ +		ret = ret ? 0 : -EWOULDBLOCK; +	} + +	down_read(&curr->mm->mmap_sem); +	hb = queue_lock(&q, -1, NULL); + +	/* +	 * Got the lock. We might not be the anticipated owner if we +	 * did a lock-steal - fix up the PI-state in that case. +	 */ +	if (!ret && q.pi_state->owner != curr) { +		u32 newtid = current->pid | FUTEX_WAITERS; + +		/* Owner died? */ +		if (q.pi_state->owner != NULL) { +			spin_lock_irq(&q.pi_state->owner->pi_lock); +			list_del_init(&q.pi_state->list); +			spin_unlock_irq(&q.pi_state->owner->pi_lock); +		} else +			newtid |= FUTEX_OWNER_DIED; + +		q.pi_state->owner = current; + +		spin_lock_irq(¤t->pi_lock); +		list_add(&q.pi_state->list, ¤t->pi_state_list); +		spin_unlock_irq(¤t->pi_lock); + +		/* Unqueue and drop the lock */ +		unqueue_me_pi(&q, hb); +		up_read(&curr->mm->mmap_sem); +		/* +		 * We own it, so we have to replace the pending owner +		 * TID. This must be atomic as we have preserve the +		 * owner died bit here. +		 */ +		ret = get_user(uval, uaddr); +		while (!ret) { +			newval = (uval & FUTEX_OWNER_DIED) | newtid; +			curval = futex_atomic_cmpxchg_inatomic(uaddr, +							       uval, newval); +			if (curval == -EFAULT) +				ret = -EFAULT; +			if (curval == uval) +				break; +			uval = curval; +		} +	} else { +		/* +		 * Catch the rare case, where the lock was released +		 * when we were on the way back before we locked +		 * the hash bucket. +		 */ +		if (ret && q.pi_state->owner == curr) { +			if (rt_mutex_trylock(&q.pi_state->pi_mutex)) +				ret = 0; +		} +		/* Unqueue and drop the lock */ +		unqueue_me_pi(&q, hb); +		up_read(&curr->mm->mmap_sem); +	} + +	if (!detect && ret == -EDEADLK && 0) +		force_sig(SIGKILL, current); + +	return ret; + + out_unlock_release_sem: +	queue_unlock(&q, hb); + + out_release_sem: +	up_read(&curr->mm->mmap_sem); +	return ret; + + uaddr_faulted: +	/* +	 * We have to r/w  *(int __user *)uaddr, but we can't modify it +	 * non-atomically.  Therefore, if get_user below is not +	 * enough, we need to handle the fault ourselves, while +	 * still holding the mmap_sem. 
+	 */ +	if (attempt++) { +		if (futex_handle_fault((unsigned long)uaddr, attempt)) +			goto out_unlock_release_sem; + +		goto retry_locked; +	} + +	queue_unlock(&q, hb); +	up_read(&curr->mm->mmap_sem); + +	ret = get_user(uval, uaddr); +	if (!ret && (uval != -EFAULT)) +		goto retry; + +	return ret; +} + +/* + * Restart handler + */ +static long futex_lock_pi_restart(struct restart_block *restart) +{ +	struct hrtimer_sleeper timeout, *to = NULL; +	int ret; + +	restart->fn = do_no_restart_syscall; + +	if (restart->arg2 || restart->arg3) { +		to = &timeout; +		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); +		hrtimer_init_sleeper(to, current); +		to->timer.expires.tv64 = ((u64)restart->arg1 << 32) | +			(u64) restart->arg0; +	} + +	pr_debug("lock_pi restart: %p, %d (%d)\n", +		 (u32 __user *)restart->arg0, current->pid); + +	ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1, +			       0, to); + +	if (ret != -EINTR) +		return ret; + +	restart->fn = futex_lock_pi_restart; + +	/* The other values are filled in */ +	return -ERESTART_RESTARTBLOCK; +} + +/* + * Called from the syscall entry below. + */ +static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, +			 long nsec, int trylock) +{ +	struct hrtimer_sleeper timeout, *to = NULL; +	struct restart_block *restart; +	int ret; + +	if (sec != MAX_SCHEDULE_TIMEOUT) { +		to = &timeout; +		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); +		hrtimer_init_sleeper(to, current); +		to->timer.expires = ktime_set(sec, nsec); +	} + +	ret = do_futex_lock_pi(uaddr, detect, trylock, to); + +	if (ret != -EINTR) +		return ret; + +	pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid); + +	restart = ¤t_thread_info()->restart_block; +	restart->fn = futex_lock_pi_restart; +	restart->arg0 = (unsigned long) uaddr; +	restart->arg1 = detect; +	if (to) { +		restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF; +		restart->arg3 = to->timer.expires.tv64 >> 32; +	} else +		restart->arg2 = restart->arg3 = 0; + +	return -ERESTART_RESTARTBLOCK; +} + +/* + * Userspace attempted a TID -> 0 atomic transition, and failed. + * This is the in-kernel slowpath: we look up the PI state (if any), + * and do the rt-mutex unlock. + */ +static int futex_unlock_pi(u32 __user *uaddr) +{ +	struct futex_hash_bucket *hb; +	struct futex_q *this, *next; +	u32 uval; +	struct list_head *head; +	union futex_key key; +	int ret, attempt = 0; + +retry: +	if (get_user(uval, uaddr)) +		return -EFAULT; +	/* +	 * We release only a lock we actually own: +	 */ +	if ((uval & FUTEX_TID_MASK) != current->pid) +		return -EPERM; +	/* +	 * First take all the futex related locks: +	 */ +	down_read(¤t->mm->mmap_sem); + +	ret = get_futex_key(uaddr, &key); +	if (unlikely(ret != 0)) +		goto out; + +	hb = hash_futex(&key); +	spin_lock(&hb->lock); + +retry_locked: +	/* +	 * To avoid races, try to do the TID -> 0 atomic transition +	 * again. 
If it succeeds then we can return without waking +	 * anyone else up: +	 */ +	inc_preempt_count(); +	uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); +	dec_preempt_count(); + +	if (unlikely(uval == -EFAULT)) +		goto pi_faulted; +	/* +	 * Rare case: we managed to release the lock atomically, +	 * no need to wake anyone else up: +	 */ +	if (unlikely(uval == current->pid)) +		goto out_unlock; + +	/* +	 * Ok, other tasks may need to be woken up - check waiters +	 * and do the wakeup if necessary: +	 */ +	head = &hb->chain; + +	list_for_each_entry_safe(this, next, head, list) { +		if (!match_futex (&this->key, &key)) +			continue; +		ret = wake_futex_pi(uaddr, uval, this); +		/* +		 * The atomic access to the futex value +		 * generated a pagefault, so retry the +		 * user-access and the wakeup: +		 */ +		if (ret == -EFAULT) +			goto pi_faulted; +		goto out_unlock; +	} +	/* +	 * No waiters - kernel unlocks the futex: +	 */ +	ret = unlock_futex_pi(uaddr, uval); +	if (ret == -EFAULT) +		goto pi_faulted; + +out_unlock: +	spin_unlock(&hb->lock); +out:  	up_read(¤t->mm->mmap_sem); + +	return ret; + +pi_faulted: +	/* +	 * We have to r/w  *(int __user *)uaddr, but we can't modify it +	 * non-atomically.  Therefore, if get_user below is not +	 * enough, we need to handle the fault ourselves, while +	 * still holding the mmap_sem. +	 */ +	if (attempt++) { +		if (futex_handle_fault((unsigned long)uaddr, attempt)) +			goto out_unlock; + +		goto retry_locked; +	} + +	spin_unlock(&hb->lock); +	up_read(¤t->mm->mmap_sem); + +	ret = get_user(uval, uaddr); +	if (!ret && (uval != -EFAULT)) +		goto retry; +  	return ret;  } @@ -819,6 +1553,7 @@ static int futex_fd(u32 __user *uaddr, int signal)  		err = -ENOMEM;  		goto error;  	} +	q->pi_state = NULL;  	down_read(¤t->mm->mmap_sem);  	err = get_futex_key(uaddr, &q->key); @@ -856,7 +1591,7 @@ error:   * Implementation: user-space maintains a per-thread list of locks it   * is holding. Upon do_exit(), the kernel carefully walks this list,   * and marks all locks that are owned by this thread with the - * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is + * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is   * always manipulated with the lock held, so the list is private and   * per-thread. Userspace also maintains a per-thread 'list_op_pending'   * field, to allow the kernel to clean up if the thread dies after @@ -931,7 +1666,7 @@ err_unlock:   */  int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)  { -	u32 uval; +	u32 uval, nval;  retry:  	if (get_user(uval, uaddr)) @@ -948,8 +1683,12 @@ retry:  		 * thread-death.) The rest of the cleanup is done in  		 * userspace.  		 
*/ -		if (futex_atomic_cmpxchg_inatomic(uaddr, uval, -					 uval | FUTEX_OWNER_DIED) != uval) +		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, +						     uval | FUTEX_OWNER_DIED); +		if (nval == -EFAULT) +			return -1; + +		if (nval != uval)  			goto retry;  		if (uval & FUTEX_WAITERS) @@ -994,7 +1733,7 @@ void exit_robust_list(struct task_struct *curr)  	while (entry != &head->list) {  		/*  		 * A pending lock might already be on the list, so -		 * dont process it twice: +		 * don't process it twice:  		 */  		if (entry != pending)  			if (handle_futex_death((void *)entry + futex_offset, @@ -1040,6 +1779,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,  	case FUTEX_WAKE_OP:  		ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);  		break; +	case FUTEX_LOCK_PI: +		ret = futex_lock_pi(uaddr, val, timeout, val2, 0); +		break; +	case FUTEX_UNLOCK_PI: +		ret = futex_unlock_pi(uaddr); +		break; +	case FUTEX_TRYLOCK_PI: +		ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); +		break;  	default:  		ret = -ENOSYS;  	} @@ -1055,17 +1803,22 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,  	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;  	u32 val2 = 0; -	if (utime && (op == FUTEX_WAIT)) { +	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {  		if (copy_from_user(&t, utime, sizeof(t)) != 0)  			return -EFAULT;  		if (!timespec_valid(&t))  			return -EINVAL; -		timeout = timespec_to_jiffies(&t) + 1; +		if (op == FUTEX_WAIT) +			timeout = timespec_to_jiffies(&t) + 1; +		else { +			timeout = t.tv_sec; +			val2 = t.tv_nsec; +		}  	}  	/*  	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.  	 */ -	if (op >= FUTEX_REQUEUE) +	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)  		val2 = (u32) (unsigned long) utime;  	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 7e57c31670a..d1d92b441fb 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -129,14 +129,19 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,  	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;  	int val2 = 0; -	if (utime && (op == FUTEX_WAIT)) { +	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {  		if (get_compat_timespec(&t, utime))  			return -EFAULT;  		if (!timespec_valid(&t))  			return -EINVAL; -		timeout = timespec_to_jiffies(&t) + 1; +		if (op == FUTEX_WAIT) +			timeout = timespec_to_jiffies(&t) + 1; +		else { +			timeout = t.tv_sec; +			val2 = t.tv_nsec; +		}  	} -	if (op >= FUTEX_REQUEUE) +	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)  		val2 = (int) (unsigned long) utime;  	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index e068024eeff..9c75856e791 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -112,4 +112,12 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)  	return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;  } +/* + * PI-futex support (proxy locking functions, etc.): + */ +extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); +extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, +				       struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, +				  struct task_struct *proxy_owner);  #endif  |
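
The FUTEX_LOCK_PI/FUTEX_UNLOCK_PI operations added above only cover the contended slowpath; the uncontended case is a plain 0 -> TID (and TID -> 0) cmpxchg done entirely in userspace, exactly as the comments in futex_lock_pi()/futex_unlock_pi() describe. A minimal userspace sketch of that protocol, assuming the FUTEX_LOCK_PI/FUTEX_UNLOCK_PI constants from <linux/futex.h>; the sys_futex() wrapper and the pi_lock()/pi_unlock() helpers are made-up names for illustration, not part of this patch:

	#include <errno.h>
	#include <stdint.h>
	#include <time.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/futex.h>

	static int sys_futex(uint32_t *uaddr, int op, uint32_t val,
			     const struct timespec *timeout)
	{
		return syscall(SYS_futex, uaddr, op, val, timeout, NULL, 0);
	}

	static void pi_lock(uint32_t *futex, uint32_t tid)
	{
		/* Uncontended fast path: 0 -> TID, no syscall needed. */
		if (__sync_bool_compare_and_swap(futex, 0, tid))
			return;

		/*
		 * Contended: the kernel sets FUTEX_WAITERS, attaches or
		 * creates the pi_state and blocks on the rt_mutex
		 * (do_futex_lock_pi() above).
		 */
		while (sys_futex(futex, FUTEX_LOCK_PI, 0, NULL) != 0 &&
		       errno == EINTR)
			;
	}

	static void pi_unlock(uint32_t *futex, uint32_t tid)
	{
		/*
		 * Uncontended fast path: TID -> 0.  This fails once
		 * FUTEX_WAITERS (or FUTEX_OWNER_DIED) is set in the word.
		 */
		if (__sync_bool_compare_and_swap(futex, tid, 0))
			return;

		/* Kernel hands the futex to the top waiter (futex_unlock_pi()). */
		sys_futex(futex, FUTEX_UNLOCK_PI, 0, NULL);
	}

The value stored in the futex word must be the caller's kernel TID (gettid()), since the kernel uses the TID bits of the word to look up the owner task and apply priority inheritance to it.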
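One detail worth noting from the sys_futex()/compat_sys_futex() hunks above: FUTEX_WAIT keeps its relative, jiffies-based timeout, while FUTEX_LOCK_PI passes tv_sec/tv_nsec through and arms an absolute CLOCK_REALTIME hrtimer (HRTIMER_ABS). A hedged sketch of what that means for callers, reusing the hypothetical sys_futex() wrapper and includes from the previous sketch:

	/* Lock with a deadline 'rel_ms' milliseconds from now. */
	static int pi_lock_with_deadline(uint32_t *futex, unsigned int rel_ms)
	{
		struct timespec deadline;

		/*
		 * FUTEX_LOCK_PI interprets the timespec as an *absolute*
		 * CLOCK_REALTIME deadline, so convert the relative delay
		 * into wall-clock time; FUTEX_WAIT, by contrast, still
		 * takes a relative timespec here.
		 */
		clock_gettime(CLOCK_REALTIME, &deadline);
		deadline.tv_sec  += rel_ms / 1000;
		deadline.tv_nsec += (long)(rel_ms % 1000) * 1000000L;
		if (deadline.tv_nsec >= 1000000000L) {
			deadline.tv_sec++;
			deadline.tv_nsec -= 1000000000L;
		}

		/* Returns 0 on success, -1 with errno (e.g. ETIMEDOUT) otherwise. */
		return sys_futex(futex, FUTEX_LOCK_PI, 0, &deadline);
	}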