* [PATCH 1/5] io_uring: add general async offload context
2019-12-03 2:54 [PATCHSET v2 0/5] io_uring: ensure all needed read/write data is stable Jens Axboe
@ 2019-12-03 2:54 ` Jens Axboe
2019-12-03 2:54 ` [PATCH 2/5] io_uring: ensure async punted read/write requests copy iovec Jens Axboe
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2019-12-03 2:54 UTC (permalink / raw)
To: io-uring; +Cc: netdev, Jens Axboe
Right now we just copy the sqe for async offload, but we want to store
more context across an async punt. In preparation for doing so, put the
sqe copy inside a structure that we can expand. With this pointer added,
we can get rid of REQ_F_FREE_SQE, as that is now indicated by whether
req->io is NULL or not.
No functional changes in this patch.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 56 +++++++++++++++++++++++++++++----------------------
1 file changed, 32 insertions(+), 24 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a91743e1fa2c..bbbd9f664b1e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -308,6 +308,10 @@ struct io_timeout {
struct io_timeout_data *data;
};
+struct io_async_ctx {
+ struct io_uring_sqe sqe;
+};
+
/*
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
@@ -323,6 +327,7 @@ struct io_kiocb {
};
const struct io_uring_sqe *sqe;
+ struct io_async_ctx *io;
struct file *ring_file;
int ring_fd;
bool has_user;
@@ -353,7 +358,6 @@ struct io_kiocb {
#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
#define REQ_F_INFLIGHT 16384 /* on inflight list */
#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
-#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */
u64 user_data;
u32 result;
u32 sequence;
@@ -806,6 +810,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
}
got_it:
+ req->io = NULL;
req->ring_file = NULL;
req->file = NULL;
req->ctx = ctx;
@@ -836,8 +841,8 @@ static void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- if (req->flags & REQ_F_FREE_SQE)
- kfree(req->sqe);
+ if (req->io)
+ kfree(req->io);
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
fput(req->file);
if (req->flags & REQ_F_INFLIGHT) {
@@ -1079,9 +1084,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
* completions for those, only batch free for fixed
* file and non-linked commands.
*/
- if (((req->flags &
- (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) ==
- REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) {
+ if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
+ REQ_F_FIXED_FILE) && !io_is_fallback_req(req) &&
+ !req->io) {
reqs[to_free++] = req;
if (to_free == ARRAY_SIZE(reqs))
io_free_req_many(ctx, reqs, &to_free);
@@ -2259,7 +2264,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (!poll->wait)
return -ENOMEM;
- req->sqe = NULL;
+ req->io = NULL;
INIT_IO_WORK(&req->work, io_poll_complete_work);
events = READ_ONCE(sqe->poll_events);
poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
@@ -2602,27 +2607,27 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
static int io_req_defer(struct io_kiocb *req)
{
- struct io_uring_sqe *sqe_copy;
struct io_ring_ctx *ctx = req->ctx;
+ struct io_async_ctx *io;
/* Still need defer if there is pending req in defer list. */
if (!req_need_defer(req) && list_empty(&ctx->defer_list))
return 0;
- sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
- if (!sqe_copy)
+ io = kmalloc(sizeof(*io), GFP_KERNEL);
+ if (!io)
return -EAGAIN;
spin_lock_irq(&ctx->completion_lock);
if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
spin_unlock_irq(&ctx->completion_lock);
- kfree(sqe_copy);
+ kfree(io);
return 0;
}
- memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy));
- req->flags |= REQ_F_FREE_SQE;
- req->sqe = sqe_copy;
+ memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
+ req->sqe = &io->sqe;
+ req->io = io;
trace_io_uring_defer(ctx, req, req->user_data);
list_add_tail(&req->list, &ctx->defer_list);
@@ -2955,14 +2960,16 @@ static void __io_queue_sqe(struct io_kiocb *req)
*/
if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
(req->flags & REQ_F_MUST_PUNT))) {
- struct io_uring_sqe *sqe_copy;
+ struct io_async_ctx *io;
- sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
- if (!sqe_copy)
+ io = kmalloc(sizeof(*io), GFP_KERNEL);
+ if (!io)
goto err;
- req->sqe = sqe_copy;
- req->flags |= REQ_F_FREE_SQE;
+ memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
+
+ req->sqe = &io->sqe;
+ req->io = io;
if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
ret = io_grab_files(req);
@@ -3063,7 +3070,7 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
*/
if (*link) {
struct io_kiocb *prev = *link;
- struct io_uring_sqe *sqe_copy;
+ struct io_async_ctx *io;
if (req->sqe->flags & IOSQE_IO_DRAIN)
(*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
@@ -3079,14 +3086,15 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
}
}
- sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
- if (!sqe_copy) {
+ io = kmalloc(sizeof(*io), GFP_KERNEL);
+ if (!io) {
ret = -EAGAIN;
goto err_req;
}
- req->sqe = sqe_copy;
- req->flags |= REQ_F_FREE_SQE;
+ memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
+ req->sqe = &io->sqe;
+ req->io = io;
trace_io_uring_link(ctx, req, prev);
list_add_tail(&req->list, &prev->link_list);
} else if (req->sqe->flags & IOSQE_IO_LINK) {
--
2.24.0
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 2/5] io_uring: ensure async punted read/write requests copy iovec
2019-12-03 2:54 [PATCHSET v2 0/5] io_uring: ensure all needed read/write data is stable Jens Axboe
2019-12-03 2:54 ` [PATCH 1/5] io_uring: add general async offload context Jens Axboe
@ 2019-12-03 2:54 ` Jens Axboe
2019-12-03 2:54 ` [PATCH 3/5] io_uring: ensure async punted sendmsg/recvmsg requests copy data Jens Axboe
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2019-12-03 2:54 UTC (permalink / raw)
To: io-uring; +Cc: netdev, Jens Axboe, 李通洲
Currently we don't copy the iovecs when we punt to async context. This
can be problematic for applications that store the iovec on the stack,
as they often assume that it's safe to let the iovec go out of scope
as soon as IO submission has been called. This isn't always safe, as we
will re-copy the iovec once we're in async context.
Make this 100% safe by copying the iovec just once. With this change,
applications may safely store the iovec on the stack for all cases.
Reported-by: 李通洲 <[email protected]>
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 241 +++++++++++++++++++++++++++++++++++++-------------
1 file changed, 179 insertions(+), 62 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index bbbd9f664b1e..bd8fab9277d6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -308,8 +308,18 @@ struct io_timeout {
struct io_timeout_data *data;
};
+struct io_async_rw {
+ struct iovec fast_iov[UIO_FASTIOV];
+ struct iovec *iov;
+ ssize_t nr_segs;
+ ssize_t size;
+};
+
struct io_async_ctx {
struct io_uring_sqe sqe;
+ union {
+ struct io_async_rw rw;
+ };
};
/*
@@ -1415,15 +1425,6 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
if (S_ISREG(file_inode(req->file)->i_mode))
req->flags |= REQ_F_ISREG;
- /*
- * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
- * we know to async punt it even if it was opened O_NONBLOCK
- */
- if (force_nonblock && !io_file_supports_async(req->file)) {
- req->flags |= REQ_F_MUST_PUNT;
- return -EAGAIN;
- }
-
kiocb->ki_pos = READ_ONCE(sqe->off);
kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -1592,6 +1593,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
return io_import_fixed(req->ctx, rw, sqe, iter);
}
+ if (req->io) {
+ struct io_async_rw *iorw = &req->io->rw;
+
+ *iovec = iorw->iov;
+ iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
+ if (iorw->iov == iorw->fast_iov)
+ *iovec = NULL;
+ return iorw->size;
+ }
+
if (!req->has_user)
return -EFAULT;
@@ -1662,6 +1673,50 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
return ret;
}
+static void io_req_map_io(struct io_kiocb *req, ssize_t io_size,
+ struct iovec *iovec, struct iovec *fast_iov,
+ struct iov_iter *iter)
+{
+ req->io->rw.nr_segs = iter->nr_segs;
+ req->io->rw.size = io_size;
+ req->io->rw.iov = iovec;
+ if (!req->io->rw.iov) {
+ req->io->rw.iov = req->io->rw.fast_iov;
+ memcpy(req->io->rw.iov, fast_iov,
+ sizeof(struct iovec) * iter->nr_segs);
+ }
+}
+
+static int io_setup_async_io(struct io_kiocb *req, ssize_t io_size,
+ struct iovec *iovec, struct iovec *fast_iov,
+ struct iov_iter *iter)
+{
+ req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
+ if (req->io) {
+ io_req_map_io(req, io_size, iovec, fast_iov, iter);
+ memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe));
+ req->sqe = &req->io->sqe;
+ return 0;
+ }
+
+ return -ENOMEM;
+}
+
+static int io_read_prep(struct io_kiocb *req, struct iovec **iovec,
+ struct iov_iter *iter, bool force_nonblock)
+{
+ ssize_t ret;
+
+ ret = io_prep_rw(req, force_nonblock);
+ if (ret)
+ return ret;
+
+ if (unlikely(!(req->file->f_mode & FMODE_READ)))
+ return -EBADF;
+
+ return io_import_iovec(READ, req, iovec, iter);
+}
+
static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
@@ -1670,23 +1725,31 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
struct iov_iter iter;
struct file *file;
size_t iov_count;
- ssize_t read_size, ret;
+ ssize_t io_size, ret;
- ret = io_prep_rw(req, force_nonblock);
- if (ret)
- return ret;
- file = kiocb->ki_filp;
-
- if (unlikely(!(file->f_mode & FMODE_READ)))
- return -EBADF;
-
- ret = io_import_iovec(READ, req, &iovec, &iter);
- if (ret < 0)
- return ret;
+ if (!req->io) {
+ ret = io_read_prep(req, &iovec, &iter, force_nonblock);
+ if (ret < 0)
+ return ret;
+ } else {
+ ret = io_import_iovec(READ, req, &iovec, &iter);
+ if (ret < 0)
+ return ret;
+ }
- read_size = ret;
+ file = req->file;
+ io_size = ret;
if (req->flags & REQ_F_LINK)
- req->result = read_size;
+ req->result = io_size;
+
+ /*
+ * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
+ * we know to async punt it even if it was opened O_NONBLOCK
+ */
+ if (force_nonblock && !io_file_supports_async(file)) {
+ req->flags |= REQ_F_MUST_PUNT;
+ goto copy_iov;
+ }
iov_count = iov_iter_count(&iter);
ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
@@ -1708,18 +1771,40 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
*/
if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
(req->flags & REQ_F_ISREG) &&
- ret2 > 0 && ret2 < read_size)
+ ret2 > 0 && ret2 < io_size)
ret2 = -EAGAIN;
/* Catch -EAGAIN return for forced non-blocking submission */
- if (!force_nonblock || ret2 != -EAGAIN)
+ if (!force_nonblock || ret2 != -EAGAIN) {
kiocb_done(kiocb, ret2, nxt, req->in_async);
- else
- ret = -EAGAIN;
+ } else {
+copy_iov:
+ ret = io_setup_async_io(req, io_size, iovec,
+ inline_vecs, &iter);
+ if (ret)
+ goto out_free;
+ return -EAGAIN;
+ }
}
+out_free:
kfree(iovec);
return ret;
}
+static int io_write_prep(struct io_kiocb *req, struct iovec **iovec,
+ struct iov_iter *iter, bool force_nonblock)
+{
+ ssize_t ret;
+
+ ret = io_prep_rw(req, force_nonblock);
+ if (ret)
+ return ret;
+
+ if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
+ return -EBADF;
+
+ return io_import_iovec(WRITE, req, iovec, iter);
+}
+
static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
@@ -1728,29 +1813,36 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
struct iov_iter iter;
struct file *file;
size_t iov_count;
- ssize_t ret;
+ ssize_t ret, io_size;
- ret = io_prep_rw(req, force_nonblock);
- if (ret)
- return ret;
+ if (!req->io) {
+ ret = io_write_prep(req, &iovec, &iter, force_nonblock);
+ if (ret < 0)
+ return ret;
+ } else {
+ ret = io_import_iovec(WRITE, req, &iovec, &iter);
+ if (ret < 0)
+ return ret;
+ }
file = kiocb->ki_filp;
- if (unlikely(!(file->f_mode & FMODE_WRITE)))
- return -EBADF;
-
- ret = io_import_iovec(WRITE, req, &iovec, &iter);
- if (ret < 0)
- return ret;
-
+ io_size = ret;
if (req->flags & REQ_F_LINK)
- req->result = ret;
+ req->result = io_size;
- iov_count = iov_iter_count(&iter);
+ /*
+ * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
+ * we know to async punt it even if it was opened O_NONBLOCK
+ */
+ if (force_nonblock && !io_file_supports_async(req->file)) {
+ req->flags |= REQ_F_MUST_PUNT;
+ goto copy_iov;
+ }
- ret = -EAGAIN;
if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
- goto out_free;
+ goto copy_iov;
+ iov_count = iov_iter_count(&iter);
ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
if (!ret) {
ssize_t ret2;
@@ -1774,10 +1866,16 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
ret2 = call_write_iter(file, kiocb, &iter);
else
ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
- if (!force_nonblock || ret2 != -EAGAIN)
+ if (!force_nonblock || ret2 != -EAGAIN) {
kiocb_done(kiocb, ret2, nxt, req->in_async);
- else
- ret = -EAGAIN;
+ } else {
+copy_iov:
+ ret = io_setup_async_io(req, io_size, iovec,
+ inline_vecs, &iter);
+ if (ret)
+ goto out_free;
+ return -EAGAIN;
+ }
}
out_free:
kfree(iovec);
@@ -2605,10 +2703,40 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
return 0;
}
+static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io)
+{
+ struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+ struct iov_iter iter;
+ ssize_t ret;
+
+ memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
+ req->sqe = &io->sqe;
+
+ switch (io->sqe.opcode) {
+ case IORING_OP_READV:
+ ret = io_read_prep(req, &iovec, &iter, true);
+ break;
+ case IORING_OP_WRITEV:
+ ret = io_write_prep(req, &iovec, &iter, true);
+ break;
+ default:
+ req->io = io;
+ return 0;
+ }
+
+ if (ret < 0)
+ return ret;
+
+ req->io = io;
+ io_req_map_io(req, ret, iovec, inline_vecs, &iter);
+ return 0;
+}
+
static int io_req_defer(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_async_ctx *io;
+ int ret;
/* Still need defer if there is pending req in defer list. */
if (!req_need_defer(req) && list_empty(&ctx->defer_list))
@@ -2625,9 +2753,9 @@ static int io_req_defer(struct io_kiocb *req)
return 0;
}
- memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
- req->sqe = &io->sqe;
- req->io = io;
+ ret = io_req_defer_prep(req, io);
+ if (ret < 0)
+ return ret;
trace_io_uring_defer(ctx, req, req->user_data);
list_add_tail(&req->list, &ctx->defer_list);
@@ -2960,17 +3088,6 @@ static void __io_queue_sqe(struct io_kiocb *req)
*/
if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
(req->flags & REQ_F_MUST_PUNT))) {
- struct io_async_ctx *io;
-
- io = kmalloc(sizeof(*io), GFP_KERNEL);
- if (!io)
- goto err;
-
- memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
-
- req->sqe = &io->sqe;
- req->io = io;
-
if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
ret = io_grab_files(req);
if (ret)
@@ -3092,9 +3209,9 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
goto err_req;
}
- memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
- req->sqe = &io->sqe;
- req->io = io;
+ ret = io_req_defer_prep(req, io);
+ if (ret)
+ goto err_req;
trace_io_uring_link(ctx, req, prev);
list_add_tail(&req->list, &prev->link_list);
} else if (req->sqe->flags & IOSQE_IO_LINK) {
--
2.24.0
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 3/5] io_uring: ensure async punted sendmsg/recvmsg requests copy data
2019-12-03 2:54 [PATCHSET v2 0/5] io_uring: ensure all needed read/write data is stable Jens Axboe
2019-12-03 2:54 ` [PATCH 1/5] io_uring: add general async offload context Jens Axboe
2019-12-03 2:54 ` [PATCH 2/5] io_uring: ensure async punted read/write requests copy iovec Jens Axboe
@ 2019-12-03 2:54 ` Jens Axboe
2019-12-03 2:54 ` [PATCH 4/5] io_uring: ensure async punted connect " Jens Axboe
2019-12-03 2:54 ` [PATCH 5/5] io_uring: mark us with IORING_FEAT_SUBMIT_STABLE Jens Axboe
4 siblings, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2019-12-03 2:54 UTC (permalink / raw)
To: io-uring; +Cc: netdev, Jens Axboe
Just like commit bd26dacbd5ce for read/write requests, this one ensures
that the msghdr data is fully copied if we need to punt a recvmsg or
sendmsg system call to async context.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 145 +++++++++++++++++++++++++++++++++++------
include/linux/socket.h | 15 +++--
net/socket.c | 60 +++++------------
3 files changed, 152 insertions(+), 68 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index bd8fab9277d6..11d181ed2076 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -308,6 +308,13 @@ struct io_timeout {
struct io_timeout_data *data;
};
+struct io_async_msghdr {
+ struct iovec fast_iov[UIO_FASTIOV];
+ struct iovec *iov;
+ struct sockaddr __user *uaddr;
+ struct msghdr msg;
+};
+
struct io_async_rw {
struct iovec fast_iov[UIO_FASTIOV];
struct iovec *iov;
@@ -319,6 +326,7 @@ struct io_async_ctx {
struct io_uring_sqe sqe;
union {
struct io_async_rw rw;
+ struct io_async_msghdr msg;
};
};
@@ -1991,12 +1999,21 @@ static int io_sync_file_range(struct io_kiocb *req,
return 0;
}
-#if defined(CONFIG_NET)
-static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **nxt, bool force_nonblock,
- long (*fn)(struct socket *, struct user_msghdr __user *,
- unsigned int))
+static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
{
+ const struct io_uring_sqe *sqe = req->sqe;
+ struct user_msghdr __user *msg;
+ unsigned flags;
+
+ flags = READ_ONCE(sqe->msg_flags);
+ msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr);
+ return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov);
+}
+
+static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_kiocb **nxt, bool force_nonblock)
+{
+#if defined(CONFIG_NET)
struct socket *sock;
int ret;
@@ -2005,7 +2022,9 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
sock = sock_from_file(req->file, &ret);
if (sock) {
- struct user_msghdr __user *msg;
+ struct io_async_ctx io, *copy;
+ struct sockaddr_storage addr;
+ struct msghdr *kmsg;
unsigned flags;
flags = READ_ONCE(sqe->msg_flags);
@@ -2014,41 +2033,119 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
else if (force_nonblock)
flags |= MSG_DONTWAIT;
- msg = (struct user_msghdr __user *) (unsigned long)
- READ_ONCE(sqe->addr);
+ if (req->io) {
+ kmsg = &req->io->msg.msg;
+ kmsg->msg_name = &addr;
+ } else {
+ kmsg = &io.msg.msg;
+ kmsg->msg_name = &addr;
+ io.msg.iov = io.msg.fast_iov;
+ ret = io_sendmsg_prep(req, &io);
+ if (ret)
+ goto out;
+ }
- ret = fn(sock, msg, flags);
- if (force_nonblock && ret == -EAGAIN)
+ ret = __sys_sendmsg_sock(sock, kmsg, flags);
+ if (force_nonblock && ret == -EAGAIN) {
+ copy = kmalloc(sizeof(*copy), GFP_KERNEL);
+ if (!copy) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(©->msg, &io.msg, sizeof(copy->msg));
+ req->io = copy;
+ memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe));
+ req->sqe = &req->io->sqe;
return ret;
+ }
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
+out:
io_cqring_add_event(req, ret);
if (ret < 0 && (req->flags & REQ_F_LINK))
req->flags |= REQ_F_FAIL_LINK;
io_put_req_find_next(req, nxt);
return 0;
-}
-#endif
-
-static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
- struct io_kiocb **nxt, bool force_nonblock)
-{
-#if defined(CONFIG_NET)
- return io_send_recvmsg(req, sqe, nxt, force_nonblock,
- __sys_sendmsg_sock);
#else
return -EOPNOTSUPP;
#endif
}
+static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
+{
+ const struct io_uring_sqe *sqe = req->sqe;
+ struct user_msghdr __user *msg;
+ unsigned flags;
+
+ flags = READ_ONCE(sqe->msg_flags);
+ msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr);
+ return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr,
+ &io->msg.iov);
+}
+
static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_kiocb **nxt, bool force_nonblock)
{
#if defined(CONFIG_NET)
- return io_send_recvmsg(req, sqe, nxt, force_nonblock,
- __sys_recvmsg_sock);
+ struct socket *sock;
+ int ret;
+
+ if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
+ sock = sock_from_file(req->file, &ret);
+ if (sock) {
+ struct user_msghdr __user *msg;
+ struct io_async_ctx io, *copy;
+ struct sockaddr_storage addr;
+ struct msghdr *kmsg;
+ unsigned flags;
+
+ flags = READ_ONCE(sqe->msg_flags);
+ if (flags & MSG_DONTWAIT)
+ req->flags |= REQ_F_NOWAIT;
+ else if (force_nonblock)
+ flags |= MSG_DONTWAIT;
+
+ msg = (struct user_msghdr __user *) (unsigned long)
+ READ_ONCE(sqe->addr);
+ if (req->io) {
+ kmsg = &req->io->msg.msg;
+ kmsg->msg_name = &addr;
+ } else {
+ kmsg = &io.msg.msg;
+ kmsg->msg_name = &addr;
+ io.msg.iov = io.msg.fast_iov;
+ ret = io_recvmsg_prep(req, &io);
+ if (ret)
+ goto out;
+ }
+
+ ret = __sys_recvmsg_sock(sock, kmsg, msg, io.msg.uaddr, flags);
+ if (force_nonblock && ret == -EAGAIN) {
+ copy = kmalloc(sizeof(*copy), GFP_KERNEL);
+ if (!copy) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(copy, &io, sizeof(*copy));
+ req->io = copy;
+ memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe));
+ req->sqe = &req->io->sqe;
+ return ret;
+ }
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ }
+
+out:
+ io_cqring_add_event(req, ret);
+ if (ret < 0 && (req->flags & REQ_F_LINK))
+ req->flags |= REQ_F_FAIL_LINK;
+ io_put_req_find_next(req, nxt);
+ return 0;
#else
return -EOPNOTSUPP;
#endif
@@ -2719,6 +2816,12 @@ static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io)
case IORING_OP_WRITEV:
ret = io_write_prep(req, &iovec, &iter, true);
break;
+ case IORING_OP_SENDMSG:
+ ret = io_sendmsg_prep(req, io);
+ break;
+ case IORING_OP_RECVMSG:
+ ret = io_recvmsg_prep(req, io);
+ break;
default:
req->io = io;
return 0;
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 4bde63021c09..903507fb901f 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -378,12 +378,19 @@ extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
unsigned int vlen, unsigned int flags,
bool forbid_cmsg_compat);
-extern long __sys_sendmsg_sock(struct socket *sock,
- struct user_msghdr __user *msg,
+extern long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
unsigned int flags);
-extern long __sys_recvmsg_sock(struct socket *sock,
- struct user_msghdr __user *msg,
+extern long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
+ struct user_msghdr __user *umsg,
+ struct sockaddr __user *uaddr,
unsigned int flags);
+extern int sendmsg_copy_msghdr(struct msghdr *msg,
+ struct user_msghdr __user *umsg, unsigned flags,
+ struct iovec **iov);
+extern int recvmsg_copy_msghdr(struct msghdr *msg,
+ struct user_msghdr __user *umsg, unsigned flags,
+ struct sockaddr __user **uaddr,
+ struct iovec **iov);
/* helpers which do the actual work for syscalls */
extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
diff --git a/net/socket.c b/net/socket.c
index ea28cbb9e2e7..0fb0820edeec 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2346,9 +2346,9 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys,
return err;
}
-static int sendmsg_copy_msghdr(struct msghdr *msg,
- struct user_msghdr __user *umsg, unsigned flags,
- struct iovec **iov)
+int sendmsg_copy_msghdr(struct msghdr *msg,
+ struct user_msghdr __user *umsg, unsigned flags,
+ struct iovec **iov)
{
int err;
@@ -2390,27 +2390,14 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
/*
* BSD sendmsg interface
*/
-long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *umsg,
+long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
unsigned int flags)
{
- struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
- struct sockaddr_storage address;
- struct msghdr msg = { .msg_name = &address };
- ssize_t err;
-
- err = sendmsg_copy_msghdr(&msg, umsg, flags, &iov);
- if (err)
- return err;
/* disallow ancillary data requests from this path */
- if (msg.msg_control || msg.msg_controllen) {
- err = -EINVAL;
- goto out;
- }
+ if (msg->msg_control || msg->msg_controllen)
+ return -EINVAL;
- err = ____sys_sendmsg(sock, &msg, flags, NULL, 0);
-out:
- kfree(iov);
- return err;
+ return ____sys_sendmsg(sock, msg, flags, NULL, 0);
}
long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
@@ -2516,10 +2503,10 @@ SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
return __sys_sendmmsg(fd, mmsg, vlen, flags, true);
}
-static int recvmsg_copy_msghdr(struct msghdr *msg,
- struct user_msghdr __user *umsg, unsigned flags,
- struct sockaddr __user **uaddr,
- struct iovec **iov)
+int recvmsg_copy_msghdr(struct msghdr *msg,
+ struct user_msghdr __user *umsg, unsigned flags,
+ struct sockaddr __user **uaddr,
+ struct iovec **iov)
{
ssize_t err;
@@ -2609,28 +2596,15 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
* BSD recvmsg interface
*/
-long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *umsg,
- unsigned int flags)
+long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
+ struct user_msghdr __user *umsg,
+ struct sockaddr __user *uaddr, unsigned int flags)
{
- struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
- struct sockaddr_storage address;
- struct msghdr msg = { .msg_name = &address };
- struct sockaddr __user *uaddr;
- ssize_t err;
-
- err = recvmsg_copy_msghdr(&msg, umsg, flags, &uaddr, &iov);
- if (err)
- return err;
/* disallow ancillary data requests from this path */
- if (msg.msg_control || msg.msg_controllen) {
- err = -EINVAL;
- goto out;
- }
+ if (msg->msg_control || msg->msg_controllen)
+ return -EINVAL;
- err = ____sys_recvmsg(sock, &msg, umsg, uaddr, flags, 0);
-out:
- kfree(iov);
- return err;
+ return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0);
}
long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
--
2.24.0
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 4/5] io_uring: ensure async punted connect requests copy data
2019-12-03 2:54 [PATCHSET v2 0/5] io_uring: ensure all needed read/write data is stable Jens Axboe
` (2 preceding siblings ...)
2019-12-03 2:54 ` [PATCH 3/5] io_uring: ensure async punted sendmsg/recvmsg requests copy data Jens Axboe
@ 2019-12-03 2:54 ` Jens Axboe
2019-12-03 2:54 ` [PATCH 5/5] io_uring: mark us with IORING_FEAT_SUBMIT_STABLE Jens Axboe
4 siblings, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2019-12-03 2:54 UTC (permalink / raw)
To: io-uring; +Cc: netdev, Jens Axboe
Just like commit bd26dacbd5ce for read/write requests, this one ensures
that the sockaddr data has been copied for IORING_OP_CONNECT if we need
to punt the request to async context.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 47 ++++++++++++++++++++++++++++++++++++++----
include/linux/socket.h | 5 ++---
net/socket.c | 16 +++++++-------
3 files changed, 53 insertions(+), 15 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 11d181ed2076..d5cd338ac8bf 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -308,6 +308,10 @@ struct io_timeout {
struct io_timeout_data *data;
};
+struct io_async_connect {
+ struct sockaddr_storage address;
+};
+
struct io_async_msghdr {
struct iovec fast_iov[UIO_FASTIOV];
struct iovec *iov;
@@ -327,6 +331,7 @@ struct io_async_ctx {
union {
struct io_async_rw rw;
struct io_async_msghdr msg;
+ struct io_async_connect connect;
};
};
@@ -2187,11 +2192,22 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
#endif
}
+static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io)
+{
+ const struct io_uring_sqe *sqe = req->sqe;
+ struct sockaddr __user *addr;
+ int addr_len;
+
+ addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
+ addr_len = READ_ONCE(sqe->addr2);
+ return move_addr_to_kernel(addr, addr_len, &io->connect.address);
+}
+
static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_kiocb **nxt, bool force_nonblock)
{
#if defined(CONFIG_NET)
- struct sockaddr __user *addr;
+ struct io_async_ctx __io, *io;
unsigned file_flags;
int addr_len, ret;
@@ -2200,15 +2216,35 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
return -EINVAL;
- addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
addr_len = READ_ONCE(sqe->addr2);
file_flags = force_nonblock ? O_NONBLOCK : 0;
- ret = __sys_connect_file(req->file, addr, addr_len, file_flags);
- if (ret == -EAGAIN && force_nonblock)
+ if (req->io) {
+ io = req->io;
+ } else {
+ ret = io_connect_prep(req, &__io);
+ if (ret)
+ goto out;
+ io = &__io;
+ }
+
+ ret = __sys_connect_file(req->file, &io->connect.address, addr_len,
+ file_flags);
+ if (ret == -EAGAIN && force_nonblock) {
+ io = kmalloc(sizeof(*io), GFP_KERNEL);
+ if (!io) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(&io->connect, &__io.connect, sizeof(io->connect));
+ req->io = io;
+ memcpy(&io->sqe, req->sqe, sizeof(*req->sqe));
+ req->sqe = &io->sqe;
return -EAGAIN;
+ }
if (ret == -ERESTARTSYS)
ret = -EINTR;
+out:
if (ret < 0 && (req->flags & REQ_F_LINK))
req->flags |= REQ_F_FAIL_LINK;
io_cqring_add_event(req, ret);
@@ -2822,6 +2858,9 @@ static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io)
case IORING_OP_RECVMSG:
ret = io_recvmsg_prep(req, io);
break;
+ case IORING_OP_CONNECT:
+ ret = io_connect_prep(req, io);
+ break;
default:
req->io = io;
return 0;
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 903507fb901f..2d2313403101 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -406,9 +406,8 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags);
extern int __sys_socket(int family, int type, int protocol);
extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen);
-extern int __sys_connect_file(struct file *file,
- struct sockaddr __user *uservaddr, int addrlen,
- int file_flags);
+extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr,
+ int addrlen, int file_flags);
extern int __sys_connect(int fd, struct sockaddr __user *uservaddr,
int addrlen);
extern int __sys_listen(int fd, int backlog);
diff --git a/net/socket.c b/net/socket.c
index 0fb0820edeec..b343db1489bd 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1826,26 +1826,22 @@ SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
* include the -EINPROGRESS status for such sockets.
*/
-int __sys_connect_file(struct file *file, struct sockaddr __user *uservaddr,
+int __sys_connect_file(struct file *file, struct sockaddr_storage *address,
int addrlen, int file_flags)
{
struct socket *sock;
- struct sockaddr_storage address;
int err;
sock = sock_from_file(file, &err);
if (!sock)
goto out;
- err = move_addr_to_kernel(uservaddr, addrlen, &address);
- if (err < 0)
- goto out;
err =
- security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
+ security_socket_connect(sock, (struct sockaddr *)address, addrlen);
if (err)
goto out;
- err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
+ err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen,
sock->file->f_flags | file_flags);
out:
return err;
@@ -1858,7 +1854,11 @@ int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
f = fdget(fd);
if (f.file) {
- ret = __sys_connect_file(f.file, uservaddr, addrlen, 0);
+ struct sockaddr_storage address;
+
+ ret = move_addr_to_kernel(uservaddr, addrlen, &address);
+ if (!ret)
+ ret = __sys_connect_file(f.file, &address, addrlen, 0);
if (f.flags)
fput(f.file);
}
--
2.24.0
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 5/5] io_uring: mark us with IORING_FEAT_SUBMIT_STABLE
2019-12-03 2:54 [PATCHSET v2 0/5] io_uring: ensure all needed read/write data is stable Jens Axboe
` (3 preceding siblings ...)
2019-12-03 2:54 ` [PATCH 4/5] io_uring: ensure async punted connect " Jens Axboe
@ 2019-12-03 2:54 ` Jens Axboe
4 siblings, 0 replies; 6+ messages in thread
From: Jens Axboe @ 2019-12-03 2:54 UTC (permalink / raw)
To: io-uring; +Cc: netdev, Jens Axboe
If this flag is set, applications can be certain that any data for
async offload has been consumed when the kernel has consumed the
SQE.
Signed-off-by: Jens Axboe <[email protected]>
---
fs/io_uring.c | 3 ++-
include/uapi/linux/io_uring.h | 1 +
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index d5cd338ac8bf..cc3bfa13a1f3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5063,7 +5063,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
if (ret < 0)
goto err;
- p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP;
+ p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
+ IORING_FEAT_SUBMIT_STABLE;
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4637ed1d9949..eabccb46edd1 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -157,6 +157,7 @@ struct io_uring_params {
*/
#define IORING_FEAT_SINGLE_MMAP (1U << 0)
#define IORING_FEAT_NODROP (1U << 1)
+#define IORING_FEAT_SUBMIT_STABLE (1U << 2)
/*
* io_uring_register(2) opcodes and arguments
--
2.24.0
^ permalink raw reply related [flat|nested] 6+ messages in thread