public inbox for [email protected]
 help / color / mirror / Atom feed
From: Nadav Amit <[email protected]>
To: [email protected]
Cc: Nadav Amit <[email protected]>, Jens Axboe <[email protected]>,
	Andrea Arcangeli <[email protected]>,
	Peter Xu <[email protected]>,
	Alexander Viro <[email protected]>,
	[email protected], [email protected],
	[email protected]
Subject: [RFC PATCH 11/13] fs/userfaultfd: complete write asynchronously
Date: Sat, 28 Nov 2020 16:45:46 -0800	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

From: Nadav Amit <[email protected]>

Userfaultfd writes can now be used for copy/zeroing. When using iouring
with userfaultfd, performing the copying/zeroing on the faulting thread
instead of the handler/iouring thread has several advantages:

(1) The data of the faulting thread will be available on the local
caches, which would make subsequent memory accesses faster.

(2) find_vma() would be able to use the vma-cache, which cannot be done
from a different process or io-uring kernel thread.

(3) The page is more likely to be allocated on the correct NUMA node.

To do so, userfaultfd work queue structs are extended to hold the
information that is required for the faulting thread to copy/zero. The
handler wakes one of the faulting threads to perform the copy/zero and
that thread wakes the other threads after the zero/copy is done.

Cc: Jens Axboe <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Peter Xu <[email protected]>
Cc: Alexander Viro <[email protected]>
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Signed-off-by: Nadav Amit <[email protected]>
---
 fs/userfaultfd.c | 241 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 178 insertions(+), 63 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index eae6ac303951..5c22170544e3 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -105,58 +105,71 @@ struct userfaultfd_unmap_ctx {
 	struct list_head list;
 };
 
+struct userfaultfd_wake_info {
+	__u64 mode;
+	struct kiocb *iocb_callback;
+	struct iov_iter from;
+	unsigned long start;
+	unsigned long len;
+	bool copied;
+};
+
 struct userfaultfd_wait_queue {
 	struct uffd_msg msg;
 	wait_queue_entry_t wq;
 	struct userfaultfd_ctx *ctx;
+	struct userfaultfd_wake_info wake_info;
 	bool waken;
 };
 
-struct userfaultfd_wake_range {
-	unsigned long start;
-	unsigned long len;
-};
+
 
 static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
 				     int wake_flags, void *key)
 {
-	struct userfaultfd_wake_range *range = key;
-	int ret;
+	struct userfaultfd_wake_info *wake_info = key;
 	struct userfaultfd_wait_queue *uwq;
 	unsigned long start, len;
+	int ret = 0;
 
 	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
-	ret = 0;
 	/* len == 0 means wake all */
-	start = range->start;
-	len = range->len;
+	start = wake_info->start;
+	len = wake_info->len;
 	if (len && (start > uwq->msg.arg.pagefault.address ||
 		    start + len <= uwq->msg.arg.pagefault.address))
 		goto out;
 
-	smp_store_mb(uwq->waken, true);
+	uwq->wake_info = *wake_info;
+
+	if (wake_info->iocb_callback)
+		wake_info->copied = true;
+
+	/* Ensure uwq->wake_info is visible to handle_userfault() before waken */
+	smp_wmb();
+
+	WRITE_ONCE(uwq->waken, true);
 
 	/*
 	 * The Program-Order guarantees provided by the scheduler
 	 * ensure uwq->waken is visible before the task is woken.
 	 */
 	ret = wake_up_state(wq->private, mode);
-	if (ret) {
-		/*
-		 * Wake only once, autoremove behavior.
-		 *
-		 * After the effect of list_del_init is visible to the other
-		 * CPUs, the waitqueue may disappear from under us, see the
-		 * !list_empty_careful() in handle_userfault().
-		 *
-		 * try_to_wake_up() has an implicit smp_mb(), and the
-		 * wq->private is read before calling the extern function
-		 * "wake_up_state" (which in turns calls try_to_wake_up).
-		 */
-		list_del_init(&wq->entry);
-	}
+
+	/*
+	 * Wake only once, autoremove behavior.
+	 *
+	 * After the effect of list_del_init is visible to the other
+	 * CPUs, the waitqueue may disappear from under us, see the
+	 * !list_empty_careful() in handle_userfault().
+	 *
+	 * try_to_wake_up() has an implicit smp_mb(), and the
+	 * wq->private is read before calling the extern function
+	 * "wake_up_state" (which in turns calls try_to_wake_up).
+	 */
+	list_del_init(&wq->entry);
 out:
-	return ret;
+	return ret || wake_info->copied;
 }
 
 /**
@@ -384,6 +397,9 @@ static bool userfaultfd_get_async_complete_locked(struct userfaultfd_ctx *ctx,
 	return true;
 }
 
+static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
+					   struct userfaultfd_wake_info *wake_info);
+
 static bool userfaultfd_get_async_complete(struct userfaultfd_ctx *ctx,
 				struct kiocb **iocb, struct iov_iter *iter)
 {
@@ -414,6 +430,43 @@ static void userfaultfd_copy_async_msg(struct kiocb *iocb,
 	iter->kvec = NULL;
 }
 
+static void userfaultfd_complete_write(struct userfaultfd_ctx *ctx,
+					       struct userfaultfd_wait_queue *uwq)
+{
+	struct kiocb *iocb = uwq->wake_info.iocb_callback;
+	const struct kvec *kvec = uwq->wake_info.from.kvec;
+	bool zeropage = uwq->wake_info.mode & UFFDIO_WRITE_MODE_ZEROPAGE;
+	u64 mode = uwq->wake_info.mode &
+		(UFFDIO_WRITE_MODE_DONTWAKE | UFFDIO_WRITE_MODE_WP);
+	int r;
+
+	if (zeropage)
+		r = mfill_zeropage(ctx->mm, uwq->wake_info.start,
+			&uwq->wake_info.from, &ctx->mmap_changing);
+	else
+		r = mcopy_atomic(ctx->mm, uwq->wake_info.start,
+			&uwq->wake_info.from, &ctx->mmap_changing, mode);
+
+	/*
+	 * If we failed, do not wake the others, but if there was a partial
+	 * write, still wake others.
+	 */
+	if (r < 0)
+		goto out;
+
+	/* The callees should not do any copying */
+	uwq->wake_info.iocb_callback = NULL;
+	uwq->wake_info.from.kvec = NULL;
+	wake_userfault(ctx, &uwq->wake_info);
+out:
+	/*
+	 * Complete the operation only after waking the other threads as done
+	 * in the synchronous case.
+	 */
+	iocb->ki_complete(iocb, r, 0);
+	kfree(kvec);
+}
+
 /*
  * The locking rules involved in returning VM_FAULT_RETRY depending on
  * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
@@ -548,6 +601,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 			ctx->features);
 	uwq.ctx = ctx;
 	uwq.waken = false;
+	uwq.wake_info.iocb_callback = NULL;
 
 	blocking_state = userfaultfd_get_blocking_state(vmf->flags);
 
@@ -569,7 +623,11 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	 */
 	spin_lock(&wqh->lock);
 
-	__add_wait_queue(wqh, &uwq.wq);
+	/* Exclusive on the fault_wqh, not on the fault_pending_wqh */
+	if (async)
+		__add_wait_queue_exclusive(wqh, &uwq.wq);
+	else
+		__add_wait_queue(wqh, &uwq.wq);
 
 	/* Ensure it is queued before userspace is informed. */
 	smp_wmb();
@@ -612,6 +670,11 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 				cpu_relax();
 				cond_resched();
 			}
+			/*
+			 * Ensure writes from userfaultfd_wake_function into uwq
+			 * are visible.
+			 */
+			smp_rmb();
 		} else
 			schedule();
 	}
@@ -650,6 +713,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 		local_irq_enable();
 	}
 
+	/* Complete copy/zero after the entry is no longer on the queue. */
+	if (uwq.wake_info.iocb_callback)
+		userfaultfd_complete_write(ctx, &uwq);
+
 	/*
 	 * ctx may go away after this if the userfault pseudo fd is
 	 * already released.
@@ -1004,7 +1071,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 	struct mm_struct *mm = ctx->mm;
 	struct vm_area_struct *vma, *prev;
 	/* len == 0 means wake all */
-	struct userfaultfd_wake_range range = { .len = 0, };
+	struct userfaultfd_wake_info wake_info = { 0 };
 	unsigned long new_flags;
 
 	WRITE_ONCE(ctx->released, true);
@@ -1052,8 +1119,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 	 * the fault_*wqh.
 	 */
 	spin_lock_irq(&ctx->fault_pending_wqh.lock);
-	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
-	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
+	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &wake_info);
+	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 0, &wake_info);
 	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 
 	userfaultfd_cancel_async_reads(ctx);
@@ -1294,7 +1361,7 @@ static ssize_t userfaultfd_ctx_read(struct kiocb *iocb,
 			 * anyway.
 			 */
 			list_del(&uwq->wq.entry);
-			add_wait_queue(&ctx->fault_wqh, &uwq->wq);
+			add_wait_queue_exclusive(&ctx->fault_wqh, &uwq->wq);
 
 			write_seqcount_end(&ctx->refile_seq);
 
@@ -1459,20 +1526,20 @@ static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to)
 }
 
 static void __wake_userfault(struct userfaultfd_ctx *ctx,
-			     struct userfaultfd_wake_range *range)
+			     struct userfaultfd_wake_info *wake_info)
 {
 	spin_lock_irq(&ctx->fault_pending_wqh.lock);
 	/* wake all in the range and autoremove */
 	if (waitqueue_active(&ctx->fault_pending_wqh))
 		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
-				     range);
+				     wake_info);
 	if (waitqueue_active(&ctx->fault_wqh))
-		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
+		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 0, wake_info);
 	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
 }
 
 static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
-					   struct userfaultfd_wake_range *range)
+					   struct userfaultfd_wake_info *wake_info)
 {
 	unsigned seq;
 	bool need_wakeup;
@@ -1499,7 +1566,7 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
 		cond_resched();
 	} while (read_seqcount_retry(&ctx->refile_seq, seq));
 	if (need_wakeup)
-		__wake_userfault(ctx, range);
+		__wake_userfault(ctx, wake_info);
 }
 
 static __always_inline int validate_range(struct mm_struct *mm,
@@ -1524,14 +1591,57 @@ static __always_inline int validate_range(struct mm_struct *mm,
 	return 0;
 }
 
+static int userfaultfd_remote_mcopy(struct kiocb *iocb, __u64 dst,
+				    struct iov_iter *from, __u64 mode)
+{
+	struct file *file = iocb->ki_filp;
+	struct userfaultfd_ctx *ctx = file->private_data;
+	struct userfaultfd_wake_info wake_info = {
+		.iocb_callback = iocb,
+		.mode = mode,
+		.start = dst,
+		.len = iov_iter_count(from),
+		.copied = false,
+	};
+	int ret = -EAGAIN;
+
+	if (mode & UFFDIO_COPY_MODE_DONTWAKE)
+		goto out;
+
+	if (!iov_iter_is_bvec(from) && !iov_iter_is_kvec(from))
+		goto out;
+
+	/*
+	 * Check without a lock. If we are mistaken, the mcopy would be
+	 * performed locally.
+	 */
+	if (!waitqueue_active(&ctx->fault_wqh))
+		goto out;
+
+	dup_iter(&wake_info.from, from, GFP_KERNEL);
+
+	/* wake one in the range and autoremove */
+	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &wake_info);
+
+	if (!wake_info.copied) {
+		kfree(wake_info.from.kvec);
+		goto out;
+	}
+
+	ret = -EIOCBQUEUED;
+out:
+	return ret;
+}
+
 ssize_t userfaultfd_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
-	struct userfaultfd_wake_range range;
+	struct userfaultfd_wake_info wake_info = { 0 };
 	struct userfaultfd_ctx *ctx = file->private_data;
 	size_t len = iov_iter_count(from);
 	__u64 dst = iocb->ki_pos & PAGE_MASK;
 	unsigned long mode = iocb->ki_pos & ~PAGE_MASK;
+	int no_wait = file->f_flags & O_NONBLOCK;
 	bool zeropage;
 	__s64 ret;
 
@@ -1563,25 +1673,30 @@ ssize_t userfaultfd_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret)
 		goto out;
 
-	if (mmget_not_zero(ctx->mm)) {
+	if (!mmget_not_zero(ctx->mm))
+		return -ESRCH;
+
+	ret = -EAGAIN;
+	if (no_wait && !is_sync_kiocb(iocb))
+		ret = userfaultfd_remote_mcopy(iocb, dst, from, mode);
+	if (ret == -EAGAIN) {
 		if (zeropage)
 			ret = mfill_zeropage(ctx->mm, dst, from,
 					     &ctx->mmap_changing);
 		else
 			ret = mcopy_atomic(ctx->mm, dst, from,
 					   &ctx->mmap_changing, mode);
-		mmput(ctx->mm);
-	} else {
-		return -ESRCH;
 	}
+	mmput(ctx->mm);
+
 	if (ret < 0)
 		goto out;
 
 	/* len == 0 would wake all */
-	range.len = ret;
+	wake_info.len = ret;
 	if (!(mode & UFFDIO_COPY_MODE_DONTWAKE)) {
-		range.start = dst;
-		wake_userfault(ctx, &range);
+		wake_info.start = dst;
+		wake_userfault(ctx, &wake_info);
 	}
 out:
 	return ret;
@@ -1916,7 +2031,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 			 * permanently and it avoids userland to call
 			 * UFFDIO_WAKE explicitly.
 			 */
-			struct userfaultfd_wake_range range;
+			struct userfaultfd_wake_info range;
 			range.start = start;
 			range.len = vma_end - start;
 			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
@@ -1971,7 +2086,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
 {
 	int ret;
 	struct uffdio_range uffdio_wake;
-	struct userfaultfd_wake_range range;
+	struct userfaultfd_wake_info wake_info = { 0 };
 	const void __user *buf = (void __user *)arg;
 
 	ret = -EFAULT;
@@ -1982,16 +2097,16 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
 	if (ret)
 		goto out;
 
-	range.start = uffdio_wake.start;
-	range.len = uffdio_wake.len;
+	wake_info.start = uffdio_wake.start;
+	wake_info.len = uffdio_wake.len;
 
 	/*
 	 * len == 0 means wake all and we don't want to wake all here,
 	 * so check it again to be sure.
 	 */
-	VM_BUG_ON(!range.len);
+	VM_BUG_ON(!wake_info.len);
 
-	wake_userfault(ctx, &range);
+	wake_userfault(ctx, &wake_info);
 	ret = 0;
 
 out:
@@ -2004,7 +2119,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 	__s64 ret;
 	struct uffdio_copy uffdio_copy;
 	struct uffdio_copy __user *user_uffdio_copy;
-	struct userfaultfd_wake_range range;
+	struct userfaultfd_wake_info wake_info = { 0 };
 	struct iov_iter iter;
 	struct iovec iov;
 
@@ -2052,12 +2167,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 		goto out;
 	BUG_ON(!ret);
 	/* len == 0 would wake all */
-	range.len = ret;
+	wake_info.len = ret;
 	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
-		range.start = uffdio_copy.dst;
-		wake_userfault(ctx, &range);
+		wake_info.start = uffdio_copy.dst;
+		wake_userfault(ctx, &wake_info);
 	}
-	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
+	ret = wake_info.len == uffdio_copy.len ? 0 : -EAGAIN;
 out:
 	return ret;
 }
@@ -2068,7 +2183,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 	__s64 ret;
 	struct uffdio_zeropage uffdio_zeropage;
 	struct uffdio_zeropage __user *user_uffdio_zeropage;
-	struct userfaultfd_wake_range range;
+	struct userfaultfd_wake_info wake_info = { 0 };
 	struct iov_iter iter;
 	struct iovec iov;
 
@@ -2108,12 +2223,12 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 		goto out;
 	/* len == 0 would wake all */
 	BUG_ON(!ret);
-	range.len = ret;
+	wake_info.len = ret;
 	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
-		range.start = uffdio_zeropage.range.start;
-		wake_userfault(ctx, &range);
+		wake_info.start = uffdio_zeropage.range.start;
+		wake_userfault(ctx, &wake_info);
 	}
-	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
+	ret = wake_info.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
 out:
 	return ret;
 }
@@ -2124,7 +2239,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 	int ret;
 	struct uffdio_writeprotect uffdio_wp;
 	struct uffdio_writeprotect __user *user_uffdio_wp;
-	struct userfaultfd_wake_range range;
+	struct userfaultfd_wake_info wake_info = { 0 };
 	bool mode_wp, mode_dontwake;
 
 	if (READ_ONCE(ctx->mmap_changing))
@@ -2158,9 +2273,9 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 		return ret;
 
 	if (!mode_wp && !mode_dontwake) {
-		range.start = uffdio_wp.range.start;
-		range.len = uffdio_wp.range.len;
-		wake_userfault(ctx, &range);
+		wake_info.start = uffdio_wp.range.start;
+		wake_info.len = uffdio_wp.range.len;
+		wake_userfault(ctx, &wake_info);
 	}
 	return ret;
 }
-- 
2.25.1


  parent reply	other threads:[~2020-11-29  0:51 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-11-29  0:45 [RFC PATCH 00/13] fs/userfaultfd: support iouring and polling Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 01/13] fs/userfaultfd: fix wrong error code on WP & !VM_MAYWRITE Nadav Amit
2020-12-01 21:22   ` Mike Kravetz
2020-12-21 19:01     ` Peter Xu
2020-11-29  0:45 ` [RFC PATCH 02/13] fs/userfaultfd: fix wrong file usage with iouring Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 03/13] selftests/vm/userfaultfd: wake after copy failure Nadav Amit
2020-12-21 19:28   ` Peter Xu
2020-12-21 19:51     ` Nadav Amit
2020-12-21 20:52       ` Peter Xu
2020-12-21 20:54         ` Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 04/13] fs/userfaultfd: simplify locks in userfaultfd_ctx_read Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 05/13] fs/userfaultfd: introduce UFFD_FEATURE_POLL Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 06/13] iov_iter: support atomic copy_page_from_iter_iovec() Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 07/13] fs/userfaultfd: support read_iter to use io_uring Nadav Amit
2020-11-30 18:20   ` Jens Axboe
2020-11-30 19:23     ` Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 08/13] fs/userfaultfd: complete reads asynchronously Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 09/13] fs/userfaultfd: use iov_iter for copy/zero Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 10/13] fs/userfaultfd: add write_iter() interface Nadav Amit
2020-11-29  0:45 ` Nadav Amit [this message]
2020-12-02  7:12   ` [RFC PATCH 11/13] fs/userfaultfd: complete write asynchronously Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 12/13] fs/userfaultfd: kmem-cache for wait-queue objects Nadav Amit
2020-11-30 19:51   ` Nadav Amit
2020-11-29  0:45 ` [RFC PATCH 13/13] selftests/vm/userfaultfd: iouring and polling tests Nadav Amit

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox