futex: Handle faults correctly for PI futexes

commit 34b1a1ce1458f50ef27c54e28eb9b1947012907a upstream

fixup_pi_state_owner() tries to ensure that the state of the rtmutex,
pi_state and the user space value related to the PI futex are consistent
before returning to user space. In case that the user space value update
faults and the fault cannot be resolved by faulting the page in via
fault_in_user_writeable() the function returns with -EFAULT and leaves
the rtmutex and pi_state owner state inconsistent.

A subsequent futex_unlock_pi() operates on the inconsistent pi_state and
releases the rtmutex despite not owning it which can corrupt the RB tree of
the rtmutex and cause a subsequent kernel stack use after free.

It was suggested to loop forever in fixup_pi_state_owner() if the fault
cannot be resolved, but that results in runaway tasks which is especially
undesired when the problem happens due to a programming error and not due
to malice.

As the user space value cannot be fixed up, the proper solution is to make
the rtmutex and the pi_state consistent so both have the same owner. This
leaves the user space value out of sync. Any subsequent operation on the
futex will fail because the 10th rule of PI futexes (pi_state owner and
user space value are consistent) has been violated.

As a consequence this removes the inept attempts of 'fixing' the situation
in case that the current task owns the rtmutex when returning with an
unresolvable fault by unlocking the rtmutex which left pi_state::owner and
rtmutex::owner out of sync in a different and only slightly less dangerous
way.

Fixes: 1b7558e457ed ("futexes: fix fault handling in futex_lock_pi")
Reported-by: gzobqq@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
diff --git a/kernel/futex.c b/kernel/futex.c
index e04bb97..224adcd 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1034,7 +1034,8 @@
  *	FUTEX_OWNER_DIED bit. See [4]
  *
  * [10] There is no transient state which leaves owner and user space
- *	TID out of sync.
+ *	TID out of sync. Except one error case where the kernel is denied
+ *	write access to the user address, see fixup_pi_state_owner().
  *
  *
  * Serialization and lifetime rules:
@@ -2596,6 +2597,24 @@
 	if (!err)
 		goto retry;
 
+	/*
+	 * fault_in_user_writeable() failed so user state is immutable. At
+	 * best we can make the kernel state consistent but user state will
+	 * be most likely hosed and any subsequent unlock operation will be
+	 * rejected due to PI futex rule [10].
+	 *
+	 * Ensure that the rtmutex owner is also the pi_state owner despite
+	 * the user space value claiming something different. There is no
+	 * point in unlocking the rtmutex if current is the owner as it
+	 * would need to wait until the next waiter has taken the rtmutex
+	 * to guarantee consistent state. Keep it simple. Userspace asked
+	 * for this wreckaged state.
+	 *
+	 * The rtmutex has an owner - either current or some other
+	 * task. See the EAGAIN loop above.
+	 */
+	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
+
 	return err;
 }
 
@@ -2885,7 +2904,6 @@
 			 ktime_t *time, int trylock)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
-	struct futex_pi_state *pi_state = NULL;
 	struct task_struct *exiting = NULL;
 	struct rt_mutex_waiter rt_waiter;
 	struct futex_hash_bucket *hb;
@@ -3028,23 +3046,9 @@
 	if (res)
 		ret = (res < 0) ? res : 0;
 
-	/*
-	 * If fixup_owner() faulted and was unable to handle the fault, unlock
-	 * it and return the fault to userspace.
-	 */
-	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
-		pi_state = q.pi_state;
-		get_pi_state(pi_state);
-	}
-
 	/* Unqueue and drop the lock */
 	unqueue_me_pi(&q);
 
-	if (pi_state) {
-		rt_mutex_futex_unlock(&pi_state->pi_mutex);
-		put_pi_state(pi_state);
-	}
-
 	goto out_put_key;
 
 out_unlock_put_key:
@@ -3310,7 +3314,6 @@
 				 u32 __user *uaddr2)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
-	struct futex_pi_state *pi_state = NULL;
 	struct rt_mutex_waiter rt_waiter;
 	struct futex_hash_bucket *hb;
 	union futex_key key2 = FUTEX_KEY_INIT;
@@ -3395,10 +3398,6 @@
 		if (q.pi_state && (q.pi_state->owner != current)) {
 			spin_lock(q.lock_ptr);
 			ret = fixup_pi_state_owner(uaddr2, &q, current);
-			if (ret < 0 && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
-				pi_state = q.pi_state;
-				get_pi_state(pi_state);
-			}
 			/*
 			 * Drop the reference to the pi state which
 			 * the requeue_pi() code acquired for us.
@@ -3440,25 +3439,10 @@
 		if (res)
 			ret = (res < 0) ? res : 0;
 
-		/*
-		 * If fixup_pi_state_owner() faulted and was unable to handle
-		 * the fault, unlock the rt_mutex and return the fault to
-		 * userspace.
-		 */
-		if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
-			pi_state = q.pi_state;
-			get_pi_state(pi_state);
-		}
-
 		/* Unqueue and drop the lock. */
 		unqueue_me_pi(&q);
 	}
 
-	if (pi_state) {
-		rt_mutex_futex_unlock(&pi_state->pi_mutex);
-		put_pi_state(pi_state);
-	}
-
 	if (ret == -EINTR) {
 		/*
 		 * We've already been requeued, but cannot restart by calling