ANDROID: userfaultfd: allow SPF for UFFD_FEATURE_SIGBUS on private+anon

Currently we bail out of the speculative page fault (SPF) path as soon
as we detect that the fault address falls in a userfaultfd-registered
vma. However, when userfaultfd is used with the UFFD_FEATURE_SIGBUS
feature, handle_userfault() does little more than return
VM_FAULT_SIGBUS, which SPF can deliver without falling back to the
mmap_lock path. This patch therefore allows MISSING userfaults on
private anonymous mappings to be handled speculatively when
UFFD_FEATURE_SIGBUS is in use.
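
For reference, a minimal userspace sketch of the setup this enables
for SPF: a private anonymous mapping registered for MISSING faults on
a uffd created with UFFD_FEATURE_SIGBUS (illustrative only; error
handling is trimmed and nothing here is taken from the GC itself):

  #include <fcntl.h>
  #include <linux/userfaultfd.h>
  #include <sys/ioctl.h>
  #include <sys/mman.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static int setup_sigbus_uffd(void *area, size_t len)
  {
  	/* Ask for SIGBUS delivery instead of uffd read() events */
  	struct uffdio_api api = { .api = UFFD_API,
  				  .features = UFFD_FEATURE_SIGBUS };
  	struct uffdio_register reg = {
  		.range = { .start = (unsigned long)area, .len = len },
  		.mode = UFFDIO_REGISTER_MODE_MISSING,
  	};
  	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

  	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) ||
  	    ioctl(uffd, UFFDIO_REGISTER, &reg))
  		return -1;
  	/* MISSING faults in [area, area + len) now raise SIGBUS */
  	return uffd;
  }

  int main(void)
  {
  	size_t len = 2UL << 20;
  	void *area = mmap(NULL, len, PROT_READ | PROT_WRITE,
  			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

  	if (area == MAP_FAILED)
  		return 1;
  	return setup_sigbus_uffd(area, len) < 0;
  }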

With this patch we observe a >99% SPF success rate for userfaults
raised during the userfaultfd GC's compaction phase. This effectively
eliminates the uninterruptible sleep time previously spent in
do_page_fault() on these userfaults.
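
On the userspace side the fault then arrives as a SIGBUS whose si_addr
is the faulting address: a handler resolves the missing page and the
faulting access restarts when the handler returns. A minimal sketch,
assuming 4K pages, the uffd from the sketch above, and zero-fill via
UFFDIO_COPY (the GC's real handler does its compaction work here
instead):

  #include <linux/userfaultfd.h>
  #include <signal.h>
  #include <sys/ioctl.h>

  #define PAGE_SZ 4096UL	/* assumes 4K pages */

  static int uffd;	/* set to setup_sigbus_uffd()'s return value */

  static void sigbus_handler(int sig, siginfo_t *info, void *uc)
  {
  	static char page[PAGE_SZ] __attribute__((aligned(PAGE_SZ)));
  	struct uffdio_copy copy = {
  		.dst = (unsigned long)info->si_addr & ~(PAGE_SZ - 1),
  		.src = (unsigned long)page,
  		.len = PAGE_SZ,
  	};

  	/*
  	 * Resolve the missing page; the faulting instruction restarts
  	 * once the handler returns. ioctl() is not on the POSIX
  	 * async-signal-safe list, but this is how the SIGBUS flow is
  	 * driven in practice on Linux.
  	 */
  	ioctl(uffd, UFFDIO_COPY, &copy);
  }

  static void install_sigbus_handler(void)
  {
  	struct sigaction sa = {
  		.sa_sigaction = sigbus_handler,
  		.sa_flags = SA_SIGINFO,
  	};

  	sigaction(SIGBUS, &sa, NULL);
  }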

Bug: 320478828
Bug: 328787046
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
Change-Id: Ic7fde0fde03602b35179bc0cf891ddbbc434190f
(cherry picked from commit 582c6d188ec138d8ed9c6ef235bf5698d80d7d6b)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 2089266..6ecedf4 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -71,6 +71,7 @@
 	bool mmap_changing;
 	/* mm with one ore more vmas attached to this userfaultfd_ctx */
 	struct mm_struct *mm;
+	struct rcu_head rcu_head;
 };
 
 struct userfaultfd_fork_ctx {
@@ -156,6 +157,13 @@
 	refcount_inc(&ctx->refcount);
 }
 
+static void __free_userfaultfd_ctx(struct rcu_head *head)
+{
+	struct userfaultfd_ctx *ctx = container_of(head, struct userfaultfd_ctx,
+						   rcu_head);
+	kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+}
+
 /**
  * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
  * context.
@@ -176,7 +184,7 @@
 		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
 		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
 		mmdrop(ctx->mm);
-		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+		call_rcu(&ctx->rcu_head, __free_userfaultfd_ctx);
 	}
 }
 
@@ -350,6 +358,24 @@
 	return TASK_UNINTERRUPTIBLE;
 }
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+bool userfaultfd_using_sigbus(struct vm_area_struct *vma)
+{
+	struct userfaultfd_ctx *ctx;
+	bool ret;
+
+	/*
+	 * Do it inside RCU section to ensure that the ctx doesn't
+	 * disappear under us.
+	 */
+	rcu_read_lock();
+	ctx = rcu_dereference(vma->vm_userfaultfd_ctx.ctx);
+	ret = ctx && (ctx->features & UFFD_FEATURE_SIGBUS);
+	rcu_read_unlock();
+	return ret;
+}
+#endif
+
 /*
  * The locking rules involved in returning VM_FAULT_RETRY depending on
  * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
@@ -394,7 +420,8 @@
 	 */
 	mmap_assert_locked(mm);
 
-	ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
+	ctx = rcu_dereference_protected(vmf->vma->vm_userfaultfd_ctx.ctx,
+					lockdep_is_held(&mm->mmap_lock));
 	if (!ctx)
 		goto out;
 
@@ -611,8 +638,10 @@
 		/* the various vma->vm_userfaultfd_ctx still points to it */
 		mmap_write_lock(mm);
 		for (vma = mm->mmap; vma; vma = vma->vm_next)
-			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
-				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+			if (rcu_access_pointer(vma->vm_userfaultfd_ctx.ctx) ==
+			    release_new_ctx) {
+				rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx,
+						   NULL);
 				vma->vm_flags &= ~__VM_UFFD_FLAGS;
 			}
 		mmap_write_unlock(mm);
@@ -642,10 +671,13 @@
 	struct userfaultfd_ctx *ctx = NULL, *octx;
 	struct userfaultfd_fork_ctx *fctx;
 
-	octx = vma->vm_userfaultfd_ctx.ctx;
+	octx = rcu_dereference_protected(
+			vma->vm_userfaultfd_ctx.ctx,
+			lockdep_is_held(&vma->vm_mm->mmap_lock));
+
 	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
 		vm_write_begin(vma);
-		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, NULL);
 		WRITE_ONCE(vma->vm_flags,
 			   vma->vm_flags & ~__VM_UFFD_FLAGS);
 		vm_write_end(vma);
@@ -684,7 +716,7 @@
 		list_add_tail(&fctx->list, fcs);
 	}
 
-	vma->vm_userfaultfd_ctx.ctx = ctx;
+	rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, ctx);
 	return 0;
 }
 
@@ -717,7 +749,8 @@
 {
 	struct userfaultfd_ctx *ctx;
 
-	ctx = vma->vm_userfaultfd_ctx.ctx;
+	ctx = rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx,
+					lockdep_is_held(&vma->vm_mm->mmap_lock));
 
 	if (!ctx)
 		return;
@@ -728,7 +761,7 @@
 		WRITE_ONCE(ctx->mmap_changing, true);
 	} else {
 		/* Drop uffd context if remap feature not enabled */
-		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, NULL);
 		vma->vm_flags &= ~__VM_UFFD_FLAGS;
 	}
 }
@@ -765,7 +798,8 @@
 	struct userfaultfd_ctx *ctx;
 	struct userfaultfd_wait_queue ewq;
 
-	ctx = vma->vm_userfaultfd_ctx.ctx;
+	ctx = rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx,
+					lockdep_is_held(&mm->mmap_lock));
 	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
 		return true;
 
@@ -803,7 +837,9 @@
 {
 	for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
 		struct userfaultfd_unmap_ctx *unmap_ctx;
-		struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+		struct userfaultfd_ctx *ctx =
+			rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx,
+						  lockdep_is_held(&vma->vm_mm->mmap_lock));
 
 		if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
 		    has_unmap_ctx(ctx, unmaps, start, end))
@@ -868,10 +904,13 @@
 	mmap_write_lock(mm);
 	prev = NULL;
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		struct userfaultfd_ctx *cur_uffd_ctx =
+				rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx,
+							  lockdep_is_held(&mm->mmap_lock));
 		cond_resched();
-		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
+		BUG_ON(!!cur_uffd_ctx ^
 		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
-		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
+		if (cur_uffd_ctx != ctx) {
 			prev = vma;
 			continue;
 		}
@@ -887,7 +926,7 @@
 			prev = vma;
 		vm_write_begin(vma);
 		WRITE_ONCE(vma->vm_flags, new_flags);
-		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, NULL);
 		vm_write_end(vma);
 	}
 	mmap_write_unlock(mm);
@@ -1350,9 +1389,12 @@
 	found = false;
 	basic_ioctls = false;
 	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+		struct userfaultfd_ctx *cur_uffd_ctx =
+				rcu_dereference_protected(cur->vm_userfaultfd_ctx.ctx,
+							  lockdep_is_held(&mm->mmap_lock));
 		cond_resched();
 
-		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+		BUG_ON(!!cur_uffd_ctx ^
 		       !!(cur->vm_flags & __VM_UFFD_FLAGS));
 
 		/* check not compatible vmas */
@@ -1395,8 +1437,7 @@
 		 * wouldn't know which one to deliver the userfaults to.
 		 */
 		ret = -EBUSY;
-		if (cur->vm_userfaultfd_ctx.ctx &&
-		    cur->vm_userfaultfd_ctx.ctx != ctx)
+		if (cur_uffd_ctx && cur_uffd_ctx != ctx)
 			goto out_unlock;
 
 		/*
@@ -1414,18 +1455,20 @@
 
 	ret = 0;
 	do {
+		struct userfaultfd_ctx *cur_uffd_ctx =
+				rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx,
+							  lockdep_is_held(&mm->mmap_lock));
 		cond_resched();
 
 		BUG_ON(!vma_can_userfault(vma, vm_flags));
-		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
-		       vma->vm_userfaultfd_ctx.ctx != ctx);
+		BUG_ON(cur_uffd_ctx && cur_uffd_ctx != ctx);
 		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
 
 		/*
 		 * Nothing to do: this vma is already registered into this
 		 * userfaultfd and with the right tracking mode too.
 		 */
-		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
+		if (cur_uffd_ctx == ctx &&
 		    (vma->vm_flags & vm_flags) == vm_flags)
 			goto skip;
 
@@ -1461,7 +1504,7 @@
 		 */
 		vm_write_begin(vma);
 		WRITE_ONCE(vma->vm_flags, new_flags);
-		vma->vm_userfaultfd_ctx.ctx = ctx;
+		rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, ctx);
 		vm_write_end(vma);
 
 		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
@@ -1561,7 +1604,7 @@
 	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
 		cond_resched();
 
-		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
+		BUG_ON(!!rcu_access_pointer(cur->vm_userfaultfd_ctx.ctx) ^
 		       !!(cur->vm_flags & __VM_UFFD_FLAGS));
 
 		/*
@@ -1583,6 +1626,9 @@
 
 	ret = 0;
 	do {
+		struct userfaultfd_ctx *cur_uffd_ctx =
+				rcu_dereference_protected(vma->vm_userfaultfd_ctx.ctx,
+							  lockdep_is_held(&mm->mmap_lock));
 		cond_resched();
 
 		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
@@ -1591,7 +1637,7 @@
 		 * Nothing to do: this vma is already registered into this
 		 * userfaultfd and with the right tracking mode too.
 		 */
-		if (!vma->vm_userfaultfd_ctx.ctx)
+		if (!cur_uffd_ctx)
 			goto skip;
 
 		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
@@ -1610,7 +1656,7 @@
 			struct userfaultfd_wake_range range;
 			range.start = start;
 			range.len = vma_end - start;
-			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
+			wake_userfault(cur_uffd_ctx, &range);
 		}
 
 		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
@@ -1640,7 +1686,7 @@
 		 */
 		vm_write_begin(vma);
 		WRITE_ONCE(vma->vm_flags, new_flags);
-		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		rcu_assign_pointer(vma->vm_userfaultfd_ctx.ctx, NULL);
 		vm_write_end(vma);
 
 	skip:
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 80126b5..2648ec4d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -297,7 +297,7 @@
 #ifdef CONFIG_USERFAULTFD
 #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
 struct vm_userfaultfd_ctx {
-	struct userfaultfd_ctx *ctx;
+	struct userfaultfd_ctx __rcu *ctx;
 };
 #else /* CONFIG_USERFAULTFD */
 #define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 331d2cc..43902f3 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -36,6 +36,9 @@
 extern int sysctl_unprivileged_userfaultfd;
 
 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+extern bool userfaultfd_using_sigbus(struct vm_area_struct *vma);
+#endif
 
 /*
  * The mode of operation for __mcopy_atomic and its helpers.
@@ -75,7 +78,7 @@
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
 					struct vm_userfaultfd_ctx vm_ctx)
 {
-	return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
+	return rcu_access_pointer(vma->vm_userfaultfd_ctx.ctx) == vm_ctx.ctx;
 }
 
 /*
@@ -154,6 +157,13 @@
 	return VM_FAULT_SIGBUS;
 }
 
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+static inline bool userfaultfd_using_sigbus(struct vm_area_struct *vma)
+{
+	return false;
+}
+#endif
+
 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
 					struct vm_userfaultfd_ctx vm_ctx)
 {
diff --git a/mm/memory.c b/mm/memory.c
index ea4ebb9..7c09fb6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5006,6 +5006,7 @@
 	pud_t pudval;
 	int seq;
 	vm_fault_t ret;
+	bool uffd_missing_sigbus = false;
 
 	/* Clear flags that may lead to release the mmap_sem to retry */
 	flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
@@ -5018,20 +5019,31 @@
 		return VM_FAULT_RETRY;
 	}
 
-	if (!vmf_allows_speculation(&vmf))
-		return VM_FAULT_RETRY;
-
 	vmf.vma_flags = READ_ONCE(vmf.vma->vm_flags);
 	vmf.vma_page_prot = READ_ONCE(vmf.vma->vm_page_prot);
 
 #ifdef CONFIG_USERFAULTFD
-	/* Can't call userland page fault handler in the speculative path */
+	/*
+	 * Only support SPF for SIGBUS+MISSING userfaults in private anonymous
+	 * VMAs. Everything else must be retried with mmap_lock.
+	 */
 	if (unlikely(vmf.vma_flags & __VM_UFFD_FLAGS)) {
-		trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
-		return VM_FAULT_RETRY;
+		uffd_missing_sigbus = vma_is_anonymous(vmf.vma) &&
+					(vmf.vma_flags & VM_UFFD_MISSING) &&
+					userfaultfd_using_sigbus(vmf.vma);
+		if (!uffd_missing_sigbus) {
+			trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
+			return VM_FAULT_RETRY;
+		}
+		/* Not having anon_vma implies that the PTE is missing */
+		if (!vmf.vma->anon_vma)
+			return VM_FAULT_SIGBUS;
 	}
 #endif
 
+	if (!vmf_allows_speculation(&vmf))
+		return VM_FAULT_RETRY;
+
 	if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP) {
 		/*
 		 * This could be detected by the check address against VMA's
@@ -5149,6 +5161,9 @@
 
 	local_irq_enable();
 
+	if (!vmf.pte && uffd_missing_sigbus)
+		return VM_FAULT_SIGBUS;
+
 	/*
 	 * We need to re-validate the VMA after checking the bounds, otherwise
 	 * we might have a false positive on the bounds.
@@ -5184,7 +5199,12 @@
 out_walk:
 	trace_spf_vma_notsup(_RET_IP_, vmf.vma, address);
 	local_irq_enable();
-	return VM_FAULT_RETRY;
+	/*
+	 * A failing page-table walk is similar to a missing page, so give
+	 * the SIGBUS+MISSING userfault an opportunity to handle it before
+	 * retrying with mmap_lock.
+	 */
+	return uffd_missing_sigbus ? VM_FAULT_SIGBUS : VM_FAULT_RETRY;
 
 out_segv:
 	trace_spf_vma_access(_RET_IP_, vmf.vma, address);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index fa707e5..8ab1237 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -42,7 +42,7 @@
 	 * enforce the VM_MAYWRITE check done at uffd registration
 	 * time.
 	 */
-	if (!dst_vma->vm_userfaultfd_ctx.ctx)
+	if (!rcu_access_pointer(dst_vma->vm_userfaultfd_ctx.ctx))
 		return NULL;
 
 	return dst_vma;