From: Bernd Schubert <[email protected]>
To: Miklos Szeredi <[email protected]>, Jens Axboe <[email protected]>,
Pavel Begunkov <[email protected]>,
[email protected]
Cc: [email protected], [email protected],
Joanne Koong <[email protected]>,
Josef Bacik <[email protected]>,
Amir Goldstein <[email protected]>,
Bernd Schubert <[email protected]>
Subject: [PATCH RFC v3 17/17] fuse: {uring} Pin the user buffer
Date: Sun, 01 Sep 2024 15:37:11 +0200 [thread overview]
Message-ID: <20240901-b4-fuse-uring-rfcv3-without-mmap-v3-17-9207f7391444@ddn.com> (raw)
In-Reply-To: <20240901-b4-fuse-uring-rfcv3-without-mmap-v3-0-9207f7391444@ddn.com>
This is to allow copying into the buffer from the application
without the need to copy in ring context (and with that,
the need that the ring task is active in kernel space).
Also absolutely needed for now to avoid this teardown issue
1525.905504] KASAN: null-ptr-deref in range [0x00000000000001a0-0x00000000000001a7]
[ 1525.910431] CPU: 15 PID: 183 Comm: kworker/15:1 Tainted: G O 6.10.0+ #48
[ 1525.916449] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[ 1525.922470] Workqueue: events io_fallback_req_func
[ 1525.925840] RIP: 0010:__lock_acquire+0x74/0x7b80
[ 1525.929010] Code: 89 bc 24 80 00 00 00 0f 85 1c 5f 00 00 83 3d 6e 80 b0 02 00 0f 84 1d 12 00 00 83 3d 65 c7 67 02 00 74 27 48 89 f8 48 c1 e8 03 <42> 80 3c 30 00 74 0d e8 50 44 42 00 48 8b bc 24 80 00 00 00 48 c7
[ 1525.942211] RSP: 0018:ffff88810b2af490 EFLAGS: 00010002
[ 1525.945672] RAX: 0000000000000034 RBX: 0000000000000000 RCX: 0000000000000001
[ 1525.950421] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00000000000001a0
[ 1525.955200] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000
[ 1525.959979] R10: dffffc0000000000 R11: fffffbfff07b1cbe R12: 0000000000000000
[ 1525.964252] R13: 0000000000000001 R14: dffffc0000000000 R15: 0000000000000001
[ 1525.968225] FS: 0000000000000000(0000) GS:ffff88875b200000(0000) knlGS:0000000000000000
[ 1525.973932] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1525.976694] CR2: 00005555b6a381f0 CR3: 000000012f5f1000 CR4: 00000000000006f0
[ 1525.980030] Call Trace:
[ 1525.981371] <TASK>
[ 1525.982567] ? __die_body+0x66/0xb0
[ 1525.984376] ? die_addr+0xc1/0x100
[ 1525.986111] ? exc_general_protection+0x1c6/0x330
[ 1525.988401] ? asm_exc_general_protection+0x22/0x30
[ 1525.990864] ? __lock_acquire+0x74/0x7b80
[ 1525.992901] ? mark_lock+0x9f/0x360
[ 1525.994635] ? __lock_acquire+0x1420/0x7b80
[ 1525.996629] ? attach_entity_load_avg+0x47d/0x550
[ 1525.998765] ? hlock_conflict+0x5a/0x1f0
[ 1526.000515] ? __bfs+0x2dc/0x5a0
[ 1526.001993] lock_acquire+0x1fb/0x3d0
[ 1526.004727] ? gup_fast_fallback+0x13f/0x1d80
[ 1526.006586] ? gup_fast_fallback+0x13f/0x1d80
[ 1526.008412] gup_fast_fallback+0x158/0x1d80
[ 1526.010170] ? gup_fast_fallback+0x13f/0x1d80
[ 1526.011999] ? __lock_acquire+0x2b07/0x7b80
[ 1526.013793] __iov_iter_get_pages_alloc+0x36e/0x980
[ 1526.015876] ? do_raw_spin_unlock+0x5a/0x8a0
[ 1526.017734] iov_iter_get_pages2+0x56/0x70
[ 1526.019491] fuse_copy_fill+0x48e/0x980 [fuse]
[ 1526.021400] fuse_copy_args+0x174/0x6a0 [fuse]
[ 1526.023199] fuse_uring_prepare_send+0x319/0x6c0 [fuse]
[ 1526.025178] fuse_uring_send_req_in_task+0x42/0x100 [fuse]
[ 1526.027163] io_fallback_req_func+0xb4/0x170
[ 1526.028737] ? process_scheduled_works+0x75b/0x1160
[ 1526.030445] process_scheduled_works+0x85c/0x1160
[ 1526.032073] worker_thread+0x8ba/0xce0
[ 1526.033388] kthread+0x23e/0x2b0
[ 1526.035404] ? pr_cont_work_flush+0x290/0x290
[ 1526.036958] ? kthread_blkcg+0xa0/0xa0
[ 1526.038321] ret_from_fork+0x30/0x60
[ 1526.039600] ? kthread_blkcg+0xa0/0xa0
[ 1526.040942] ret_from_fork_asm+0x11/0x20
[ 1526.042353] </TASK>
Signed-off-by: Bernd Schubert <[email protected]>
---
fs/fuse/dev.c | 9 +++
fs/fuse/dev_uring.c | 186 ++++++++++++++++++++++++++++++++------------------
fs/fuse/dev_uring_i.h | 15 ++--
fs/fuse/fuse_dev_i.h | 2 +
4 files changed, 143 insertions(+), 69 deletions(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 9f0f2120b1fa..492bb95fde4e 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -769,6 +769,15 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
cs->pipebufs++;
cs->nr_segs++;
}
+ } else if (cs->is_uring) {
+ cs->pg = cs->ring.pages[cs->ring.page_idx++];
+ /*
+ * non stricly needed, just to avoid a uring exception in
+ * fuse_copy_finish
+ */
+ get_page(cs->pg);
+ cs->len = PAGE_SIZE;
+ cs->offset = 0;
} else {
size_t off;
err = iov_iter_get_pages2(cs->iter, &page, PAGE_SIZE, 1, &off);
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index a65c5d08fce1..4cc0facaaae3 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -29,6 +29,9 @@
#include <linux/topology.h>
#include <linux/io_uring/cmd.h>
+#define FUSE_RING_HEADER_PG 0
+#define FUSE_RING_PAYLOAD_PG 1
+
struct fuse_uring_cmd_pdu {
struct fuse_ring_ent *ring_ent;
};
@@ -250,6 +253,21 @@ static void fuse_uring_stop_fuse_req_end(struct fuse_ring_ent *ent)
fuse_request_end(req);
}
+/*
+ * Copy from memmap.c, should be exported
+ */
+static void io_pages_free(struct page ***pages, int npages)
+{
+ struct page **page_array = *pages;
+
+ if (!page_array)
+ return;
+
+ unpin_user_pages(page_array, npages);
+ kvfree(page_array);
+ *pages = NULL;
+}
+
/*
* Release a request/entry on connection tear down
*/
@@ -275,6 +293,8 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent,
if (ent->fuse_req)
fuse_uring_stop_fuse_req_end(ent);
+ io_pages_free(&ent->user_pages, ent->nr_user_pages);
+
ent->state = FRRS_FREED;
}
@@ -417,6 +437,7 @@ static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
goto seterr;
}
+ /* FIXME copied from dev.c, check what 512 means */
if (oh->error <= -512 || oh->error > 0) {
err = -EINVAL;
goto seterr;
@@ -465,53 +486,41 @@ static int fuse_uring_out_header_has_err(struct fuse_out_header *oh,
static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
struct fuse_req *req,
- struct fuse_ring_ent *ent)
+ struct fuse_ring_ent *ent,
+ struct fuse_ring_req *rreq)
{
- struct fuse_ring_req __user *rreq = ent->rreq;
struct fuse_copy_state cs;
struct fuse_args *args = req->args;
struct iov_iter iter;
- int err;
- int res_arg_len;
+ int res_arg_len, err;
- err = copy_from_user(&res_arg_len, &rreq->in_out_arg_len,
- sizeof(res_arg_len));
- if (err)
- return err;
-
- err = import_ubuf(ITER_SOURCE, (void __user *)&rreq->in_out_arg,
- ent->max_arg_len, &iter);
- if (err)
- return err;
+ res_arg_len = rreq->in_out_arg_len;
fuse_copy_init(&cs, 0, &iter);
cs.is_uring = 1;
+ cs.ring.pages = &ent->user_pages[FUSE_RING_PAYLOAD_PG];
cs.req = req;
- return fuse_copy_out_args(&cs, args, res_arg_len);
+ err = fuse_copy_out_args(&cs, args, res_arg_len);
+
+ return err;
}
- /*
- * Copy data from the req to the ring buffer
- */
+/*
+ * Copy data from the req to the ring buffer
+ */
static int fuse_uring_copy_to_ring(struct fuse_ring *ring, struct fuse_req *req,
- struct fuse_ring_ent *ent)
+ struct fuse_ring_ent *ent,
+ struct fuse_ring_req *rreq)
{
- struct fuse_ring_req __user *rreq = ent->rreq;
struct fuse_copy_state cs;
struct fuse_args *args = req->args;
- int err, res;
+ int err;
struct iov_iter iter;
- err = import_ubuf(ITER_DEST, (void __user *)&rreq->in_out_arg,
- ent->max_arg_len, &iter);
- if (err) {
- pr_info("Import user buffer failed\n");
- return err;
- }
-
fuse_copy_init(&cs, 1, &iter);
cs.is_uring = 1;
+ cs.ring.pages = &ent->user_pages[FUSE_RING_PAYLOAD_PG];
cs.req = req;
err = fuse_copy_args(&cs, args->in_numargs, args->in_pages,
(struct fuse_arg *)args->in_args, 0);
@@ -520,10 +529,7 @@ static int fuse_uring_copy_to_ring(struct fuse_ring *ring, struct fuse_req *req,
return err;
}
- BUILD_BUG_ON((sizeof(rreq->in_out_arg_len) != sizeof(cs.ring.offset)));
- res = copy_to_user(&rreq->in_out_arg_len, &cs.ring.offset,
- sizeof(rreq->in_out_arg_len));
- err = res > 0 ? -EFAULT : res;
+ rreq->in_out_arg_len = cs.ring.offset;
return err;
}
@@ -531,11 +537,11 @@ static int fuse_uring_copy_to_ring(struct fuse_ring *ring, struct fuse_req *req,
static int
fuse_uring_prepare_send(struct fuse_ring_ent *ring_ent)
{
- struct fuse_ring_req *rreq = ring_ent->rreq;
+ struct fuse_ring_req *rreq = NULL;
struct fuse_ring_queue *queue = ring_ent->queue;
struct fuse_ring *ring = queue->ring;
struct fuse_req *req = ring_ent->fuse_req;
- int err = 0, res;
+ int err = 0;
if (WARN_ON(ring_ent->state != FRRS_FUSE_REQ)) {
pr_err("qid=%d tag=%d ring-req=%p buf_req=%p invalid state %d on send\n",
@@ -551,25 +557,27 @@ fuse_uring_prepare_send(struct fuse_ring_ent *ring_ent)
__func__, queue->qid, ring_ent->tag, ring_ent->state,
req->in.h.opcode, req->in.h.unique);
+ rreq = kmap_local_page(ring_ent->user_pages[FUSE_RING_HEADER_PG]);
+
/* copy the request */
- err = fuse_uring_copy_to_ring(ring, req, ring_ent);
+ err = fuse_uring_copy_to_ring(ring, req, ring_ent, rreq);
if (unlikely(err)) {
pr_info("Copy to ring failed: %d\n", err);
goto err;
}
/* copy fuse_in_header */
- res = copy_to_user(&rreq->in, &req->in.h, sizeof(rreq->in));
- err = res > 0 ? -EFAULT : res;
- if (err)
- goto err;
+ rreq->in = req->in.h;
+ err = 0;
set_bit(FR_SENT, &req->flags);
- return 0;
-
+out:
+ if (rreq)
+ kunmap_local(rreq);
+ return err;
err:
fuse_uring_req_end(ring_ent, true, err);
- return err;
+ goto out;
}
/*
@@ -682,16 +690,13 @@ static void fuse_uring_commit(struct fuse_ring_ent *ring_ent,
{
struct fuse_ring *ring = ring_ent->queue->ring;
struct fuse_conn *fc = ring->fc;
- struct fuse_ring_req *rreq = ring_ent->rreq;
+ struct fuse_ring_req *rreq;
struct fuse_req *req = ring_ent->fuse_req;
ssize_t err = 0;
bool set_err = false;
- err = copy_from_user(&req->out.h, &rreq->out, sizeof(req->out.h));
- if (err) {
- req->out.h.error = err;
- goto out;
- }
+ rreq = kmap_local_page(ring_ent->user_pages[FUSE_RING_HEADER_PG]);
+ req->out.h = rreq->out;
err = fuse_uring_out_header_has_err(&req->out.h, req, fc);
if (err) {
@@ -701,7 +706,8 @@ static void fuse_uring_commit(struct fuse_ring_ent *ring_ent,
goto out;
}
- err = fuse_uring_copy_from_ring(ring, req, ring_ent);
+ err = fuse_uring_copy_from_ring(ring, req, ring_ent, rreq);
+ kunmap_local(rreq);
if (err)
set_err = true;
@@ -830,6 +836,46 @@ __must_hold(ring_ent->queue->lock)
return 0;
}
+/*
+ * Copy from memmap.c, should be exported there
+ */
+static struct page **io_pin_pages(unsigned long uaddr, unsigned long len,
+ int *npages)
+{
+ unsigned long start, end, nr_pages;
+ struct page **pages;
+ int ret;
+
+ end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = uaddr >> PAGE_SHIFT;
+ nr_pages = end - start;
+ if (WARN_ON_ONCE(!nr_pages))
+ return ERR_PTR(-EINVAL);
+
+ pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+ pages);
+ /* success, mapped all pages */
+ if (ret == nr_pages) {
+ *npages = nr_pages;
+ return pages;
+ }
+
+ /* partial map, or didn't map anything */
+ if (ret >= 0) {
+ /* if we did partial map, release any pages we did get */
+ if (ret)
+ unpin_user_pages(pages, ret);
+ ret = -EFAULT;
+ }
+ kvfree(pages);
+ return ERR_PTR(ret);
+}
+
+
/* FUSE_URING_REQ_FETCH handler */
static int fuse_uring_fetch(struct fuse_ring_ent *ring_ent,
struct io_uring_cmd *cmd, unsigned int issue_flags)
@@ -837,39 +883,48 @@ static int fuse_uring_fetch(struct fuse_ring_ent *ring_ent,
{
struct fuse_ring *ring = ring_ent->queue->ring;
struct fuse_ring_queue *queue = ring_ent->queue;
- int ret;
+ int err;
/* No other bit must be set here */
- ret = -EINVAL;
+ err = -EINVAL;
if (ring_ent->state != FRRS_INIT)
- goto err;
+ goto err_unlock;
/*
* FUSE_URING_REQ_FETCH is an initialization exception, needs
* state override
*/
ring_ent->state = FRRS_USERSPACE;
- ret = fuse_ring_ring_ent_unset_userspace(ring_ent);
- if (ret != 0) {
- pr_info_ratelimited(
- "qid=%d tag=%d register req state %d expected %d",
- queue->qid, ring_ent->tag, ring_ent->state,
- FRRS_INIT);
+ fuse_ring_ring_ent_unset_userspace(ring_ent);
+
+ err = _fuse_uring_fetch(ring_ent, cmd, issue_flags);
+ if (err)
+ goto err_unlock;
+
+ spin_unlock(&queue->lock);
+
+ /* must not hold the queue->lock */
+ ring_ent->user_pages = io_pin_pages(ring_ent->user_buf,
+ ring_ent->user_buf_len,
+ &ring_ent->nr_user_pages);
+ if (IS_ERR(ring_ent->user_pages)) {
+ err = PTR_ERR(ring_ent->user_pages);
+ pr_info("qid=%d ent=%d pin-res=%d\n",
+ queue->qid, ring_ent->tag, err);
goto err;
}
- ret = _fuse_uring_fetch(ring_ent, cmd, issue_flags);
- if (ret)
- goto err;
-
/*
* The ring entry is registered now and needs to be handled
* for shutdown.
*/
atomic_inc(&ring->queue_refs);
-err:
+ return 0;
+
+err_unlock:
spin_unlock(&queue->lock);
- return ret;
+err:
+ return err;
}
/**
@@ -920,7 +975,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
if (unlikely(fc->aborted || queue->stopped))
goto err_unlock;
- ring_ent->rreq = (void __user *)cmd_req->buf_ptr;
+ ring_ent->user_buf = cmd_req->buf_ptr;
+ ring_ent->user_buf_len = cmd_req->buf_len;
+
ring_ent->max_arg_len = cmd_req->buf_len -
offsetof(struct fuse_ring_req, in_out_arg);
ret = -EINVAL;
@@ -930,7 +987,6 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
goto err_unlock;
}
- ring_ent->rreq = (void __user *)cmd_req->buf_ptr;
ring_ent->max_arg_len = cmd_req->buf_len -
offsetof(struct fuse_ring_req, in_out_arg);
if (cmd_req->buf_len < ring->req_buf_sz) {
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index f1247ee57dc4..2e43b2e9bcf2 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -60,10 +60,17 @@ struct fuse_ring_ent {
/* fuse_req assigned to the ring entry */
struct fuse_req *fuse_req;
- /*
- * buffer provided by fuse server
- */
- struct fuse_ring_req __user *rreq;
+ /* buffer provided by fuse server */
+ unsigned long __user user_buf;
+
+ /* length of user_buf */
+ size_t user_buf_len;
+
+ /* mapped user_buf pages */
+ struct page **user_pages;
+
+ /* number of user pages */
+ int nr_user_pages;
/* struct fuse_ring_req::in_out_arg size*/
size_t max_arg_len;
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
index 0fbb4f28261c..63e0e5dcb9f4 100644
--- a/fs/fuse/fuse_dev_i.h
+++ b/fs/fuse/fuse_dev_i.h
@@ -32,6 +32,8 @@ struct fuse_copy_state {
struct {
/* overall offset with the user buffer */
unsigned int offset;
+ struct page **pages;
+ int page_idx;
} ring;
};
--
2.43.0
next prev parent reply other threads:[~2024-09-01 15:09 UTC|newest]
Thread overview: 37+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-09-01 13:36 [PATCH RFC v3 00/17] fuse: fuse-over-io-uring Bernd Schubert
2024-09-01 13:36 ` [PATCH RFC v3 01/17] fuse: rename to fuse_dev_end_requests and make non-static Bernd Schubert
2024-09-01 13:36 ` [PATCH RFC v3 02/17] fuse: Move fuse_get_dev to header file Bernd Schubert
2024-09-01 13:36 ` [PATCH RFC v3 03/17] fuse: Move request bits Bernd Schubert
2024-09-01 13:36 ` [PATCH RFC v3 04/17] fuse: Add fuse-io-uring design documentation Bernd Schubert
2024-09-01 13:36 ` [PATCH RFC v3 05/17] fuse: Add a uring config ioctl Bernd Schubert
2024-09-04 0:43 ` Joanne Koong
2024-09-04 22:24 ` Bernd Schubert
2024-09-06 19:23 ` Joanne Koong
2024-09-01 13:37 ` [PATCH RFC v3 06/17] fuse: Add the queue configuration ioctl Bernd Schubert
2024-09-04 22:23 ` Joanne Koong
2024-09-04 22:38 ` Bernd Schubert
2024-09-04 22:42 ` Joanne Koong
2024-09-01 13:37 ` [PATCH RFC v3 07/17] fuse: {uring} Add a dev_release exception for fuse-over-io-uring Bernd Schubert
2024-09-01 13:37 ` [PATCH RFC v3 08/17] fuse: {uring} Handle SQEs - register commands Bernd Schubert
2024-09-04 15:40 ` Jens Axboe
2024-09-01 13:37 ` [PATCH RFC v3 09/17] fuse: Make fuse_copy non static Bernd Schubert
2024-09-01 13:37 ` [PATCH RFC v3 10/17] fuse: Add buffer offset for uring into fuse_copy_state Bernd Schubert
2024-09-01 13:37 ` [PATCH RFC v3 11/17] fuse: {uring} Add uring sqe commit and fetch support Bernd Schubert
2024-09-01 13:37 ` [PATCH RFC v3 12/17] fuse: {uring} Handle teardown of ring entries Bernd Schubert
2024-09-01 13:37 ` [PATCH RFC v3 13/17] fuse: {uring} Add a ring queue and send method Bernd Schubert
2024-09-01 13:37 ` [PATCH RFC v3 14/17] fuse: {uring} Allow to queue to the ring Bernd Schubert
2024-09-01 13:37 ` [PATCH RFC v3 15/17] ate: 2024-08-30 15:43:32 +0100 Bernd Schubert
2024-09-04 15:43 ` Jens Axboe
2024-09-04 15:54 ` Bernd Schubert
2024-09-01 13:37 ` [PATCH RFC v3 16/17] fuse: {uring} Handle IO_URING_F_TASK_DEAD Bernd Schubert
2024-09-01 13:37 ` Bernd Schubert [this message]
2024-09-04 15:47 ` [PATCH RFC v3 17/17] fuse: {uring} Pin the user buffer Jens Axboe
2024-09-04 16:08 ` Bernd Schubert
2024-09-04 16:16 ` Jens Axboe
2024-09-04 19:25 ` Bernd Schubert
2024-09-04 19:40 ` Jens Axboe
2024-09-05 21:04 ` Bernd Schubert
2024-09-04 18:59 ` Jens Axboe
2024-09-04 16:42 ` [PATCH RFC v3 00/17] fuse: fuse-over-io-uring Jens Axboe
2024-09-04 19:37 ` Bernd Schubert
2024-09-04 19:41 ` Jens Axboe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240901-b4-fuse-uring-rfcv3-without-mmap-v3-17-9207f7391444@ddn.com \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox