* [PATCH 2/2] io_uring: add support for passing fixed file descriptors
2022-06-17 13:45 [PATCHSET RFC for-next 0/2] Add direct descriptor ring passing Jens Axboe
@ 2022-06-17 13:45 ` Jens Axboe
2022-06-18 11:02 ` Hao Xu
0 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2022-06-17 13:45 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, Jens Axboe
With IORING_OP_MSG_RING, one ring can send a message to another ring.
Extend that support to also allow sending a fixed file descriptor to
that ring, enabling one ring to pass a registered descriptor to another
one.
Arguments are extended to pass in:
sqe->addr3 fixed file slot in source ring
sqe->file_index fixed file slot in destination ring
IORING_OP_MSG_RING is extended to take a command argument in sqe->addr.
If set to zero (or IORING_MSG_DATA), it sends just a message like before.
If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according
to the above arguments.
Undecided:
- Should we post a cqe with the send, or require that the sender
just link a separate IORING_OP_MSG_RING? This makes error
handling easier, as we cannot easily retract the installed
file descriptor if the target CQ ring is full. Right now we do
fill a CQE. If the request completes with -EOVERFLOW, then the
sender must re-send a CQE if the target must get notified.
- Add an IORING_MSG_MOVE_FD which moves the descriptor, removing
it from the source ring when installed in the target? Again
error handling is difficult.
Signed-off-by: Jens Axboe <[email protected]>
---
include/uapi/linux/io_uring.h | 8 +++
io_uring/msg_ring.c | 122 ++++++++++++++++++++++++++++++++--
2 files changed, 123 insertions(+), 7 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 8715f0942ec2..dbdaeef3ea89 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -264,6 +264,14 @@ enum io_uring_op {
*/
#define IORING_ACCEPT_MULTISHOT (1U << 0)
+/*
+ * IORING_OP_MSG_RING command types, stored in sqe->addr
+ */
+enum {
+ IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */
+ IORING_MSG_SEND_FD, /* send a registered fd to another ring */
+};
+
/*
* IO completion data structure (Completion Queue Entry)
*/
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index b02be2349652..e9d6fb25d141 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -3,46 +3,154 @@
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
+#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
+#include "rsrc.h"
+#include "filetable.h"
#include "msg_ring.h"
struct io_msg {
struct file *file;
u64 user_data;
u32 len;
+ u32 cmd;
+ u32 src_fd;
+ u32 dst_fd;
};
+static int io_msg_ring_data(struct io_kiocb *req)
+{
+ struct io_ring_ctx *target_ctx = req->file->private_data;
+ struct io_msg *msg = io_kiocb_to_cmd(req);
+
+ if (msg->src_fd || msg->dst_fd)
+ return -EINVAL;
+
+ if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+ return 0;
+
+ return -EOVERFLOW;
+}
+
+static void io_double_unlock_ctx(struct io_ring_ctx *ctx,
+ struct io_ring_ctx *octx,
+ unsigned int issue_flags)
+{
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_unlock(&ctx->uring_lock);
+ mutex_unlock(&octx->uring_lock);
+}
+
+static int io_double_lock_ctx(struct io_ring_ctx *ctx,
+ struct io_ring_ctx *octx,
+ unsigned int issue_flags)
+{
+ /*
+ * To ensure proper ordering between the two ctxs, we can only
+ * attempt a trylock on the target. If that fails and we already have
+ * the source ctx lock, punt to io-wq.
+ */
+ if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+ if (!mutex_trylock(&octx->uring_lock))
+ return -EAGAIN;
+ return 0;
+ }
+
+ /* Always grab smallest value ctx first. */
+ if (ctx < octx) {
+ mutex_lock(&ctx->uring_lock);
+ mutex_lock(&octx->uring_lock);
+ } else if (ctx > octx) {
+ mutex_lock(&octx->uring_lock);
+ mutex_lock(&ctx->uring_lock);
+ }
+
+ return 0;
+}
+
+static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_ring_ctx *target_ctx = req->file->private_data;
+ struct io_msg *msg = io_kiocb_to_cmd(req);
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned long file_ptr;
+ struct file *src_file;
+ int ret;
+
+ if (target_ctx == ctx)
+ return -EINVAL;
+
+ ret = io_double_lock_ctx(ctx, target_ctx, issue_flags);
+ if (unlikely(ret))
+ return ret;
+
+ ret = -EBADF;
+ if (unlikely(msg->src_fd >= ctx->nr_user_files))
+ goto err_unlock;
+
+ msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files);
+ file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr;
+ src_file = (struct file *) (file_ptr & FFS_MASK);
+ get_file(src_file);
+
+ ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
+ if (ret < 0) {
+ fput(src_file);
+ goto err_unlock;
+ }
+
+ /*
+ * If this fails, the target still received the file descriptor but
+ * wasn't notified of the fact. This means that if this request
+ * completes with -EOVERFLOW, then the sender must ensure that a
+ * later IORING_OP_MSG_RING delivers the message.
+ */
+ if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+ ret = -EOVERFLOW;
+err_unlock:
+ io_double_unlock_ctx(ctx, target_ctx, issue_flags);
+ return ret;
+}
+
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_msg *msg = io_kiocb_to_cmd(req);
- if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in ||
- sqe->buf_index || sqe->personality))
+ if (unlikely(sqe->rw_flags || sqe->buf_index || sqe->personality))
return -EINVAL;
msg->user_data = READ_ONCE(sqe->off);
msg->len = READ_ONCE(sqe->len);
+ msg->cmd = READ_ONCE(sqe->addr);
+ msg->src_fd = READ_ONCE(sqe->addr3);
+ msg->dst_fd = READ_ONCE(sqe->file_index);
return 0;
}
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_msg *msg = io_kiocb_to_cmd(req);
- struct io_ring_ctx *target_ctx;
int ret;
ret = -EBADFD;
if (!io_is_uring_fops(req->file))
goto done;
- ret = -EOVERFLOW;
- target_ctx = req->file->private_data;
- if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
- ret = 0;
+ switch (msg->cmd) {
+ case IORING_MSG_DATA:
+ ret = io_msg_ring_data(req);
+ break;
+ case IORING_MSG_SEND_FD:
+ ret = io_msg_send_fd(req, issue_flags);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
done:
if (ret < 0)
--
2.35.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] io_uring: add support for passing fixed file descriptors
2022-06-17 13:45 ` [PATCH 2/2] io_uring: add support for passing fixed file descriptors Jens Axboe
@ 2022-06-18 11:02 ` Hao Xu
2022-06-18 11:34 ` Jens Axboe
0 siblings, 1 reply; 12+ messages in thread
From: Hao Xu @ 2022-06-18 11:02 UTC (permalink / raw)
To: Jens Axboe, io-uring; +Cc: asml.silence
On 6/17/22 21:45, Jens Axboe wrote:
> With IORING_OP_MSG_RING, one ring can send a message to another ring.
> Extend that support to also allow sending a fixed file descriptor to
> that ring, enabling one ring to pass a registered descriptor to another
> one.
>
> Arguments are extended to pass in:
>
> sqe->addr3 fixed file slot in source ring
> sqe->file_index fixed file slot in destination ring
>
> IORING_OP_MSG_RING is extended to take a command argument in sqe->addr.
> If set to zero (or IORING_MSG_DATA), it sends just a message like before.
> If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according
> to the above arguments.
>
> Undecided:
> - Should we post a cqe with the send, or require that the sender
> just link a separate IORING_OP_MSG_RING? This makes error
> handling easier, as we cannot easily retract the installed
> file descriptor if the target CQ ring is full. Right now we do
> fill a CQE. If the request completes with -EOVERFLOW, then the
> sender must re-send a CQE if the target must get notified.
Hi Jens,
Since we have the open/accept direct feature, this may be useful. But I
just can't think of a real case where people use two rings and need to do
operations on the same fd.
Assume there are real cases, then filling a cqe is necessary since users
need to first make sure the desired fd is registered before doing
something to it.
A downside is users have to take care to do fd delivery especially
when slot resource is in short supply in target_ctx.
ctx target_ctx
msg1(fd1 to target slot x)
msg2(fd2 to target slot x)
get cqe of msg1
do something to fd1 by access slot x
the msg2 is issued not at the right time. In short not only ctx needs to
fill a cqe to target_ctx to inform that the file has been registered
but also the target_ctx has to tell ctx that "my slot x is free now
for you to deliver fd". So I guess users are inclined to allocate a
big fixed table and deliver fds to target_ctx in different slots,
which is ok, but is a limitation anyway.
>
> - Add an IORING_MSG_MOVE_FD which moves the descriptor, removing
> it from the source ring when installed in the target? Again
> error handling is difficult.
>
> Signed-off-by: Jens Axboe <[email protected]>
> ---
> include/uapi/linux/io_uring.h | 8 +++
> io_uring/msg_ring.c | 122 ++++++++++++++++++++++++++++++++--
> 2 files changed, 123 insertions(+), 7 deletions(-)
>
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 8715f0942ec2..dbdaeef3ea89 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -264,6 +264,14 @@ enum io_uring_op {
> */
> #define IORING_ACCEPT_MULTISHOT (1U << 0)
>
> +/*
> + * IORING_OP_MSG_RING command types, stored in sqe->addr
> + */
> +enum {
> + IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */
> + IORING_MSG_SEND_FD, /* send a registered fd to another ring */
> +};
> +
> /*
> * IO completion data structure (Completion Queue Entry)
> */
> diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
> index b02be2349652..e9d6fb25d141 100644
> --- a/io_uring/msg_ring.c
> +++ b/io_uring/msg_ring.c
> @@ -3,46 +3,154 @@
> #include <linux/errno.h>
> #include <linux/file.h>
> #include <linux/slab.h>
> +#include <linux/nospec.h>
> #include <linux/io_uring.h>
>
> #include <uapi/linux/io_uring.h>
>
> #include "io_uring.h"
> +#include "rsrc.h"
> +#include "filetable.h"
> #include "msg_ring.h"
>
> struct io_msg {
> struct file *file;
> u64 user_data;
> u32 len;
> + u32 cmd;
> + u32 src_fd;
> + u32 dst_fd;
> };
>
> +static int io_msg_ring_data(struct io_kiocb *req)
> +{
> + struct io_ring_ctx *target_ctx = req->file->private_data;
> + struct io_msg *msg = io_kiocb_to_cmd(req);
> +
> + if (msg->src_fd || msg->dst_fd)
> + return -EINVAL;
> +
> + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
> + return 0;
> +
> + return -EOVERFLOW;
> +}
> +
> +static void io_double_unlock_ctx(struct io_ring_ctx *ctx,
> + struct io_ring_ctx *octx,
> + unsigned int issue_flags)
> +{
> + if (issue_flags & IO_URING_F_UNLOCKED)
> + mutex_unlock(&ctx->uring_lock);
> + mutex_unlock(&octx->uring_lock);
> +}
> +
> +static int io_double_lock_ctx(struct io_ring_ctx *ctx,
> + struct io_ring_ctx *octx,
> + unsigned int issue_flags)
> +{
> + /*
> + * To ensure proper ordering between the two ctxs, we can only
> + * attempt a trylock on the target. If that fails and we already have
> + * the source ctx lock, punt to io-wq.
> + */
> + if (!(issue_flags & IO_URING_F_UNLOCKED)) {
> + if (!mutex_trylock(&octx->uring_lock))
> + return -EAGAIN;
> + return 0;
> + }
> +
> + /* Always grab smallest value ctx first. */
> + if (ctx < octx) {
> + mutex_lock(&ctx->uring_lock);
> + mutex_lock(&octx->uring_lock);
> + } else if (ctx > octx) {
Would a simple else work?
if (a < b) {
lock(a); lock(b);
} else {
lock(b);lock(a);
}
since a doesn't equal b
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] io_uring: add support for passing fixed file descriptors
2022-06-18 11:02 ` Hao Xu
@ 2022-06-18 11:34 ` Jens Axboe
2022-06-18 12:47 ` Hao Xu
0 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2022-06-18 11:34 UTC (permalink / raw)
To: Hao Xu, io-uring; +Cc: asml.silence
On 6/18/22 5:02 AM, Hao Xu wrote:
> On 6/17/22 21:45, Jens Axboe wrote:
>> With IORING_OP_MSG_RING, one ring can send a message to another ring.
>> Extend that support to also allow sending a fixed file descriptor to
>> that ring, enabling one ring to pass a registered descriptor to another
>> one.
>>
>> Arguments are extended to pass in:
>>
>> sqe->addr3 fixed file slot in source ring
>> sqe->file_index fixed file slot in destination ring
>>
>> IORING_OP_MSG_RING is extended to take a command argument in sqe->addr.
>> If set to zero (or IORING_MSG_DATA), it sends just a message like before.
>> If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according
>> to the above arguments.
>>
>> Undecided:
>> - Should we post a cqe with the send, or require that the sender
>> just link a separate IORING_OP_MSG_RING? This makes error
>> handling easier, as we cannot easily retract the installed
>> file descriptor if the target CQ ring is full. Right now we do
>> fill a CQE. If the request completes with -EOVERFLOW, then the
>> sender must re-send a CQE if the target must get notified.
>
> Hi Jens,
> Since we are have open/accept direct feature, this may be useful. But I
> just can't think of a real case that people use two rings and need to do
> operations to same fd.
The two cases that people bring up as missing for direct descriptors
that you can currently do with a real fd is:
1) Server needs to be shut down or restarted, pass file descriptors to
another one
2) Backend is split, and one accepts connections, while others then get
the fd passed and handle the actual connection.
Both of those are classic SCM_RIGHTS use cases, and it's not possible to
support them with direct descriptors today.
> Assume there are real cases, then filling a cqe is necessary since users
> need to first make sure the desired fd is registered before doing
> something to it.
Right, my question here was really whether it should be bundled with the
IORING_MSG_SEND_FD operation, or whether the issuer of that should also
be responsible for then posting a "normal" IORING_OP_MSG_SEND to the
target ring to notify it of the fact that an fd has been sent to it.
If the operation is split like the latter, then it makes the error
handling a bit easier as we eliminate one failing part of the existing
MSG_SEND_FD.
You could then also pass a number of descriptors and then post a single
OP_MSG_SEND with some data that tells you which descriptors were passed.
For the basic use case of just passing a single descriptor, what the
code currently does is probably the sanest approach - send the fd, post
a cqe.
> A downside is users have to take care to do fd delivery especially
> when slot resource is in short supply in target_ctx.
>
> ctx target_ctx
> msg1(fd1 to target slot x)
>
> msg2(fd2 to target slot x)
>
> get cqe of msg1
> do something to fd1 by access slot x
>
>
> the msg2 is issued not at the right time. In short not only ctx needs to
> fill a cqe to target_ctx to inform that the file has been registered
> but also the target_ctx has to tell ctx that "my slot x is free now
> for you to deliver fd". So I guess users are inclined to allocate a
> big fixed table and deliver fds to target_ctx in different slots,
> Which is ok but anyway a limitation.
I suspect the common use case would be to use the alloc feature, since
the sender generally has no way of knowing which slots are free on the
target ring.
>> +static int io_double_lock_ctx(struct io_ring_ctx *ctx,
>> + struct io_ring_ctx *octx,
>> + unsigned int issue_flags)
>> +{
>> + /*
>> + * To ensure proper ordering between the two ctxs, we can only
>> + * attempt a trylock on the target. If that fails and we already have
>> + * the source ctx lock, punt to io-wq.
>> + */
>> + if (!(issue_flags & IO_URING_F_UNLOCKED)) {
>> + if (!mutex_trylock(&octx->uring_lock))
>> + return -EAGAIN;
>> + return 0;
>> + }
>> +
>> + /* Always grab smallest value ctx first. */
>> + if (ctx < octx) {
>> + mutex_lock(&ctx->uring_lock);
>> + mutex_lock(&octx->uring_lock);
>> + } else if (ctx > octx) {
>
>
> Would a simple else work?
> if (a < b) {
> lock(a); lock(b);
> } else {
> lock(b);lock(a);
> }
>
> since a doesn't equal b
Yes that'd be fine, I think I added the a == b pre-check a bit later in
the process. I'll change this to an else instead.
--
Jens Axboe
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] io_uring: add support for passing fixed file descriptors
2022-06-18 11:34 ` Jens Axboe
@ 2022-06-18 12:47 ` Hao Xu
2022-06-18 12:50 ` Jens Axboe
0 siblings, 1 reply; 12+ messages in thread
From: Hao Xu @ 2022-06-18 12:47 UTC (permalink / raw)
To: Jens Axboe, io-uring; +Cc: asml.silence
On 6/18/22 19:34, Jens Axboe wrote:
> On 6/18/22 5:02 AM, Hao Xu wrote:
>> On 6/17/22 21:45, Jens Axboe wrote:
>>> With IORING_OP_MSG_RING, one ring can send a message to another ring.
>>> Extend that support to also allow sending a fixed file descriptor to
>>> that ring, enabling one ring to pass a registered descriptor to another
>>> one.
>>>
>>> Arguments are extended to pass in:
>>>
>>> sqe->addr3 fixed file slot in source ring
>>> sqe->file_index fixed file slot in destination ring
>>>
>>> IORING_OP_MSG_RING is extended to take a command argument in sqe->addr.
>>> If set to zero (or IORING_MSG_DATA), it sends just a message like before.
>>> If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according
>>> to the above arguments.
>>>
>>> Undecided:
>>> - Should we post a cqe with the send, or require that the sender
>>> just link a separate IORING_OP_MSG_RING? This makes error
>>> handling easier, as we cannot easily retract the installed
>>> file descriptor if the target CQ ring is full. Right now we do
>>> fill a CQE. If the request completes with -EOVERFLOW, then the
>>> sender must re-send a CQE if the target must get notified.
>>
>> Hi Jens,
>> Since we are have open/accept direct feature, this may be useful. But I
>> just can't think of a real case that people use two rings and need to do
>> operations to same fd.
>
> The two cases that people bring up as missing for direct descriptors
> that you can currently do with a real fd is:
>
> 1) Server needs to be shutdown or restarted, pass file descriptors to
> another onei
>
> 2) Backend is split, and one accepts connections, while others then get
> the fd passed and handle the actual connection.
>
> Both of those are classic SCM_RIGHTS use cases, and it's not possible to
> support them with direct descriptors today.
I see, thanks for the detailed explanation.
>
>> Assume there are real cases, then filling a cqe is necessary since users
>> need to first make sure the desired fd is registered before doing
>> something to it.
>
> Right, my quesion here was really whether it should be bundled with the
> IORING_MSG_SEND_FD operation, or whether the issuer of that should also
> be responsible for then posting a "normal" IORING_OP_MSG_SEND to the
> target ring to notify it if the fact that an fd has been sent to it.
>
> If the operation is split like the latter, then it makes the error
> handling a bit easier as we eliminate one failing part of the existing
> MSG_SEND_FD.
>
> You could then also pass a number of descriptors and then post a single
> OP_MSG_SEND with some data that tells you which descriptors were passed.
>
> For the basic use case of just passing a single descriptor, what the
> code currently does is probably the sanest approach - send the fd, post
> a cqe.
>
>> A downside is users have to take care to do fd delivery especially
>> when slot resource is in short supply in target_ctx.
>>
>> ctx target_ctx
>> msg1(fd1 to target slot x)
>>
>> msg2(fd2 to target slot x)
>>
>> get cqe of msg1
>> do something to fd1 by access slot x
>>
>>
>> the msg2 is issued not at the right time. In short not only ctx needs to
>> fill a cqe to target_ctx to inform that the file has been registered
>> but also the target_ctx has to tell ctx that "my slot x is free now
>> for you to deliver fd". So I guess users are inclined to allocate a
>> big fixed table and deliver fds to target_ctx in different slots,
>> Which is ok but anyway a limitation.
>
> I suspect the common use case would be to use the alloc feature, since
> the sender generally has no way of knowing which slots are free on the
> target ring.
I mean the sender may not easily know which value to set for
msg->dst_fd — it's not about the alloc feature.
>
>>> +static int io_double_lock_ctx(struct io_ring_ctx *ctx,
>>> + struct io_ring_ctx *octx,
>>> + unsigned int issue_flags)
>>> +{
>>> + /*
>>> + * To ensure proper ordering between the two ctxs, we can only
>>> + * attempt a trylock on the target. If that fails and we already have
>>> + * the source ctx lock, punt to io-wq.
>>> + */
>>> + if (!(issue_flags & IO_URING_F_UNLOCKED)) {
>>> + if (!mutex_trylock(&octx->uring_lock))
>>> + return -EAGAIN;
>>> + return 0;
>>> + }
>>> +
>>> + /* Always grab smallest value ctx first. */
>>> + if (ctx < octx) {
>>> + mutex_lock(&ctx->uring_lock);
>>> + mutex_lock(&octx->uring_lock);
>>> + } else if (ctx > octx) {
>>
>>
>> Would a simple else work?
>> if (a < b) {
>> lock(a); lock(b);
>> } else {
>> lock(b);lock(a);
>> }
>>
>> since a doesn't equal b
>
> Yes that'd be fine, I think I added the a == b pre-check a bit later in
> the process. I'll change this to an else instead.
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] io_uring: add support for passing fixed file descriptors
2022-06-18 12:47 ` Hao Xu
@ 2022-06-18 12:50 ` Jens Axboe
2022-06-18 13:09 ` Hao Xu
0 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2022-06-18 12:50 UTC (permalink / raw)
To: Hao Xu, io-uring; +Cc: asml.silence
On 6/18/22 6:47 AM, Hao Xu wrote:
> On 6/18/22 19:34, Jens Axboe wrote:
>> On 6/18/22 5:02 AM, Hao Xu wrote:
>>> On 6/17/22 21:45, Jens Axboe wrote:
>>>> With IORING_OP_MSG_RING, one ring can send a message to another ring.
>>>> Extend that support to also allow sending a fixed file descriptor to
>>>> that ring, enabling one ring to pass a registered descriptor to another
>>>> one.
>>>>
>>>> Arguments are extended to pass in:
>>>>
>>>> sqe->addr3 fixed file slot in source ring
>>>> sqe->file_index fixed file slot in destination ring
>>>>
>>>> IORING_OP_MSG_RING is extended to take a command argument in sqe->addr.
>>>> If set to zero (or IORING_MSG_DATA), it sends just a message like before.
>>>> If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according
>>>> to the above arguments.
>>>>
>>>> Undecided:
>>>> - Should we post a cqe with the send, or require that the sender
>>>> just link a separate IORING_OP_MSG_RING? This makes error
>>>> handling easier, as we cannot easily retract the installed
>>>> file descriptor if the target CQ ring is full. Right now we do
>>>> fill a CQE. If the request completes with -EOVERFLOW, then the
>>>> sender must re-send a CQE if the target must get notified.
>>>
>>> Hi Jens,
>>> Since we are have open/accept direct feature, this may be useful. But I
>>> just can't think of a real case that people use two rings and need to do
>>> operations to same fd.
>>
>> The two cases that people bring up as missing for direct descriptors
>> that you can currently do with a real fd is:
>>
>> 1) Server needs to be shutdown or restarted, pass file descriptors to
>> another onei
>>
>> 2) Backend is split, and one accepts connections, while others then get
>> the fd passed and handle the actual connection.
>>
>> Both of those are classic SCM_RIGHTS use cases, and it's not possible to
>> support them with direct descriptors today.
>
> I see, thanks for detail explanation.
I should put that in the commit message in fact. Will do so.
>>> Assume there are real cases, then filling a cqe is necessary since users
>>> need to first make sure the desired fd is registered before doing
>>> something to it.
>>
>> Right, my quesion here was really whether it should be bundled with the
>> IORING_MSG_SEND_FD operation, or whether the issuer of that should also
>> be responsible for then posting a "normal" IORING_OP_MSG_SEND to the
>> target ring to notify it if the fact that an fd has been sent to it.
>>
>> If the operation is split like the latter, then it makes the error
>> handling a bit easier as we eliminate one failing part of the existing
>> MSG_SEND_FD.
>>
>> You could then also pass a number of descriptors and then post a single
>> OP_MSG_SEND with some data that tells you which descriptors were passed.
>>
>> For the basic use case of just passing a single descriptor, what the
>> code currently does is probably the sanest approach - send the fd, post
>> a cqe.
>>
>>> A downside is users have to take care to do fd delivery especially
>>> when slot resource is in short supply in target_ctx.
>>>
>>> ctx target_ctx
>>> msg1(fd1 to target slot x)
>>>
>>> msg2(fd2 to target slot x)
>>>
>>> get cqe of msg1
>>> do something to fd1 by access slot x
>>>
>>>
>>> the msg2 is issued not at the right time. In short not only ctx needs to
>>> fill a cqe to target_ctx to inform that the file has been registered
>>> but also the target_ctx has to tell ctx that "my slot x is free now
>>> for you to deliver fd". So I guess users are inclined to allocate a
>>> big fixed table and deliver fds to target_ctx in different slots,
>>> Which is ok but anyway a limitation.
>>
>> I suspect the common use case would be to use the alloc feature, since
>> the sender generally has no way of knowing which slots are free on the
>> target ring.
>
> I mean the sender may not easily know which value to set for
> msg->dst_fd not about the alloc feature.
But isn't that the same? The sender may indeed not have any clue, so the
expected use case is to say "don't care where it ends up, just give me a
free slot".
--
Jens Axboe
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] io_uring: add support for passing fixed file descriptors
2022-06-18 12:50 ` Jens Axboe
@ 2022-06-18 13:09 ` Hao Xu
2022-06-18 13:16 ` Jens Axboe
0 siblings, 1 reply; 12+ messages in thread
From: Hao Xu @ 2022-06-18 13:09 UTC (permalink / raw)
To: Jens Axboe, io-uring; +Cc: asml.silence
On 6/18/22 20:50, Jens Axboe wrote:
> On 6/18/22 6:47 AM, Hao Xu wrote:
>> On 6/18/22 19:34, Jens Axboe wrote:
>>> On 6/18/22 5:02 AM, Hao Xu wrote:
>>>> On 6/17/22 21:45, Jens Axboe wrote:
>>>>> With IORING_OP_MSG_RING, one ring can send a message to another ring.
>>>>> Extend that support to also allow sending a fixed file descriptor to
>>>>> that ring, enabling one ring to pass a registered descriptor to another
>>>>> one.
>>>>>
>>>>> Arguments are extended to pass in:
>>>>>
>>>>> sqe->addr3 fixed file slot in source ring
>>>>> sqe->file_index fixed file slot in destination ring
>>>>>
>>>>> IORING_OP_MSG_RING is extended to take a command argument in sqe->addr.
>>>>> If set to zero (or IORING_MSG_DATA), it sends just a message like before.
>>>>> If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according
>>>>> to the above arguments.
>>>>>
>>>>> Undecided:
>>>>> - Should we post a cqe with the send, or require that the sender
>>>>> just link a separate IORING_OP_MSG_RING? This makes error
>>>>> handling easier, as we cannot easily retract the installed
>>>>> file descriptor if the target CQ ring is full. Right now we do
>>>>> fill a CQE. If the request completes with -EOVERFLOW, then the
>>>>> sender must re-send a CQE if the target must get notified.
>>>>
>>>> Hi Jens,
>>>> Since we are have open/accept direct feature, this may be useful. But I
>>>> just can't think of a real case that people use two rings and need to do
>>>> operations to same fd.
>>>
>>> The two cases that people bring up as missing for direct descriptors
>>> that you can currently do with a real fd is:
>>>
>>> 1) Server needs to be shutdown or restarted, pass file descriptors to
>>> another onei
>>>
>>> 2) Backend is split, and one accepts connections, while others then get
>>> the fd passed and handle the actual connection.
>>>
>>> Both of those are classic SCM_RIGHTS use cases, and it's not possible to
>>> support them with direct descriptors today.
>>
>> I see, thanks for detail explanation.
>
> I should put that in the commit message in fact. Will do so.
>
>>>> Assume there are real cases, then filling a cqe is necessary since users
>>>> need to first make sure the desired fd is registered before doing
>>>> something to it.
>>>
>>> Right, my quesion here was really whether it should be bundled with the
>>> IORING_MSG_SEND_FD operation, or whether the issuer of that should also
>>> be responsible for then posting a "normal" IORING_OP_MSG_SEND to the
>>> target ring to notify it if the fact that an fd has been sent to it.
>>>
>>> If the operation is split like the latter, then it makes the error
>>> handling a bit easier as we eliminate one failing part of the existing
>>> MSG_SEND_FD.
>>>
>>> You could then also pass a number of descriptors and then post a single
>>> OP_MSG_SEND with some data that tells you which descriptors were passed.
[1]
>>>
>>> For the basic use case of just passing a single descriptor, what the
>>> code currently does is probably the sanest approach - send the fd, post
>>> a cqe.
I think it's fine to keep it like this, since we can achieve [1] by a
GROUP_DELIVER flag and set cqe_skip flag for send msg request when it
turns out [1] is indeed necessary.
>>>
>>>> A downside is users have to take care to do fd delivery especially
>>>> when slot resource is in short supply in target_ctx.
>>>>
>>>> ctx target_ctx
>>>> msg1(fd1 to target slot x)
>>>>
>>>> msg2(fd2 to target slot x)
>>>>
>>>> get cqe of msg1
>>>> do something to fd1 by access slot x
>>>>
>>>>
>>>> the msg2 is issued not at the right time. In short not only ctx needs to
>>>> fill a cqe to target_ctx to inform that the file has been registered
>>>> but also the target_ctx has to tell ctx that "my slot x is free now
>>>> for you to deliver fd". So I guess users are inclined to allocate a
>>>> big fixed table and deliver fds to target_ctx in different slots,
>>>> Which is ok but anyway a limitation.
>>>
>>> I suspect the common use case would be to use the alloc feature, since
>>> the sender generally has no way of knowing which slots are free on the
>>> target ring.
>>
>> I mean the sender may not easily know which value to set for
>> msg->dst_fd not about the alloc feature.
>
> But isn't that the same? The sender may indeed not have any clue, so the
> expected use case is to say "don't care where it ends up, just give me a
> free slot".
>
Ah, yes, I read your previous words wrong.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] io_uring: add support for passing fixed file descriptors
2022-06-18 13:09 ` Hao Xu
@ 2022-06-18 13:16 ` Jens Axboe
2022-06-18 13:27 ` Hao Xu
0 siblings, 1 reply; 12+ messages in thread
From: Jens Axboe @ 2022-06-18 13:16 UTC (permalink / raw)
To: Hao Xu, io-uring; +Cc: asml.silence
On 6/18/22 7:09 AM, Hao Xu wrote:
> On 6/18/22 20:50, Jens Axboe wrote:
>> On 6/18/22 6:47 AM, Hao Xu wrote:
>>> On 6/18/22 19:34, Jens Axboe wrote:
>>>> On 6/18/22 5:02 AM, Hao Xu wrote:
>>>>> On 6/17/22 21:45, Jens Axboe wrote:
>>>>>> With IORING_OP_MSG_RING, one ring can send a message to another ring.
>>>>>> Extend that support to also allow sending a fixed file descriptor to
>>>>>> that ring, enabling one ring to pass a registered descriptor to another
>>>>>> one.
>>>>>>
>>>>>> Arguments are extended to pass in:
>>>>>>
>>>>>> sqe->addr3 fixed file slot in source ring
>>>>>> sqe->file_index fixed file slot in destination ring
>>>>>>
>>>>>> IORING_OP_MSG_RING is extended to take a command argument in sqe->addr.
>>>>>> If set to zero (or IORING_MSG_DATA), it sends just a message like before.
>>>>>> If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according
>>>>>> to the above arguments.
>>>>>>
>>>>>> Undecided:
>>>>>> - Should we post a cqe with the send, or require that the sender
>>>>>> just link a separate IORING_OP_MSG_RING? This makes error
>>>>>> handling easier, as we cannot easily retract the installed
>>>>>> file descriptor if the target CQ ring is full. Right now we do
>>>>>> fill a CQE. If the request completes with -EOVERFLOW, then the
>>>>>> sender must re-send a CQE if the target must get notified.
>>>>>
>>>>> Hi Jens,
>>>>> Since we have the open/accept direct feature, this may be useful. But I
>>>>> just can't think of a real case that people use two rings and need to do
>>>>> operations to same fd.
>>>>
>>>> The two cases that people bring up as missing for direct descriptors
>>>> that you can currently do with a real fd is:
>>>>
>>>> 1) Server needs to be shut down or restarted, pass file descriptors to
>>>> another one
>>>>
>>>> 2) Backend is split, and one accepts connections, while others then get
>>>> the fd passed and handle the actual connection.
>>>>
>>>> Both of those are classic SCM_RIGHTS use cases, and it's not possible to
>>>> support them with direct descriptors today.
>>>
>>> I see, thanks for the detailed explanation.
>>
>> I should put that in the commit message in fact. Will do so.
>>
>>>>> Assume there are real cases, then filling a cqe is necessary since users
>>>>> need to first make sure the desired fd is registered before doing
>>>>> something to it.
>>>>
>>>> Right, my question here was really whether it should be bundled with the
>>>> IORING_MSG_SEND_FD operation, or whether the issuer of that should also
>>>> be responsible for then posting a "normal" IORING_OP_MSG_SEND to the
>>>> target ring to notify it of the fact that an fd has been sent to it.
>>>>
>>>> If the operation is split like the latter, then it makes the error
>>>> handling a bit easier as we eliminate one failing part of the existing
>>>> MSG_SEND_FD.
>>>>
>>>> You could then also pass a number of descriptors and then post a single
>>>> OP_MSG_SEND with some data that tells you which descriptors were passed.
>
> [1]
>
>>>>
>>>> For the basic use case of just passing a single descriptor, what the
>>>> code currently does is probably the sanest approach - send the fd, post
>>>> a cqe.
>
> I think it's fine to keep it like this, since we can achieve [1] by a
> GROUP_DELIVER flag and set cqe_skip flag for send msg request when it
> turns out [1] is indeed necessary.
The expected use case is probably CQE_SKIP for using this, as the sender
doesn't care about being notified about a successful send. But for the
target CQE, we'd then need to either have CQE_SKIP implying that we
should skip CQE delivery there too, or we'd need to add an
IORING_OP_MSG_RING flag for that. I think the latter is the cleaner
approach, and it would indeed then allow both use cases. If you're
sending a bunch of fds and would prefer to notify with a single
OP_MSG_RING when they are done, then you'd set that OP_MSG_RING flag
that says "don't post a CQE to the target".
Hence my proposal would be to keep the CQE delivery by default as it
stands in the patch, and add a flag for controlling whether or not
OP_MSG_RING with MSG_SEND posts a CQE to the target ring or not.
Agree?
--
Jens Axboe
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/2] io_uring: add support for passing fixed file descriptors
2022-06-18 13:16 ` Jens Axboe
@ 2022-06-18 13:27 ` Hao Xu
0 siblings, 0 replies; 12+ messages in thread
From: Hao Xu @ 2022-06-18 13:27 UTC (permalink / raw)
To: Jens Axboe, io-uring; +Cc: asml.silence
On 6/18/22 21:16, Jens Axboe wrote:
> On 6/18/22 7:09 AM, Hao Xu wrote:
>> On 6/18/22 20:50, Jens Axboe wrote:
>>> On 6/18/22 6:47 AM, Hao Xu wrote:
>>>> On 6/18/22 19:34, Jens Axboe wrote:
>>>>> On 6/18/22 5:02 AM, Hao Xu wrote:
>>>>>> On 6/17/22 21:45, Jens Axboe wrote:
>>>>>>> With IORING_OP_MSG_RING, one ring can send a message to another ring.
>>>>>>> Extend that support to also allow sending a fixed file descriptor to
>>>>>>> that ring, enabling one ring to pass a registered descriptor to another
>>>>>>> one.
>>>>>>>
>>>>>>> Arguments are extended to pass in:
>>>>>>>
>>>>>>> sqe->addr3 fixed file slot in source ring
>>>>>>> sqe->file_index fixed file slot in destination ring
>>>>>>>
>>>>>>> IORING_OP_MSG_RING is extended to take a command argument in sqe->addr.
>>>>>>> If set to zero (or IORING_MSG_DATA), it sends just a message like before.
>>>>>>> If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according
>>>>>>> to the above arguments.
>>>>>>>
>>>>>>> Undecided:
>>>>>>> - Should we post a cqe with the send, or require that the sender
>>>>>>> just link a separate IORING_OP_MSG_RING? This makes error
>>>>>>> handling easier, as we cannot easily retract the installed
>>>>>>> file descriptor if the target CQ ring is full. Right now we do
>>>>>>> fill a CQE. If the request completes with -EOVERFLOW, then the
>>>>>>> sender must re-send a CQE if the target must get notified.
>>>>>>
>>>>>> Hi Jens,
>>>>>> Since we have the open/accept direct feature, this may be useful. But I
>>>>>> just can't think of a real case that people use two rings and need to do
>>>>>> operations to same fd.
>>>>>
>>>>> The two cases that people bring up as missing for direct descriptors
>>>>> that you can currently do with a real fd is:
>>>>>
>>>>> 1) Server needs to be shut down or restarted, pass file descriptors to
>>>>> another one
>>>>>
>>>>> 2) Backend is split, and one accepts connections, while others then get
>>>>> the fd passed and handle the actual connection.
>>>>>
>>>>> Both of those are classic SCM_RIGHTS use cases, and it's not possible to
>>>>> support them with direct descriptors today.
>>>>
>>>> I see, thanks for the detailed explanation.
>>>
>>> I should put that in the commit message in fact. Will do so.
>>>
>>>>>> Assume there are real cases, then filling a cqe is necessary since users
>>>>>> need to first make sure the desired fd is registered before doing
>>>>>> something to it.
>>>>>
>>>>> Right, my question here was really whether it should be bundled with the
>>>>> IORING_MSG_SEND_FD operation, or whether the issuer of that should also
>>>>> be responsible for then posting a "normal" IORING_OP_MSG_SEND to the
>>>>> target ring to notify it of the fact that an fd has been sent to it.
>>>>>
>>>>> If the operation is split like the latter, then it makes the error
>>>>> handling a bit easier as we eliminate one failing part of the existing
>>>>> MSG_SEND_FD.
>>>>>
>>>>> You could then also pass a number of descriptors and then post a single
>>>>> OP_MSG_SEND with some data that tells you which descriptors were passed.
>>
>> [1]
>>
>>>>>
>>>>> For the basic use case of just passing a single descriptor, what the
>>>>> code currently does is probably the sanest approach - send the fd, post
>>>>> a cqe.
>>
>> I think it's fine to keep it like this, since we can achieve [1] by a
>> GROUP_DELIVER flag and set cqe_skip flag for send msg request when it
>> turns out [1] is indeed necessary.
>
> The expected use case is probably CQE_SKIP for using this, as the sender
> doesn't care about being notified about a successful send. But for the
> target CQE, we'd then need to either have CQE_SKIP implying that we
> should skip CQE delivery there too, or we'd need to add an
Yea, that's what I meant, CQE_SKIP for the target_ctx cqe.
> IORING_OP_MSG_RING flag for that. I think the latter is the cleaner
> approach, and it would indeed then allow both use cases. If you're
> sending a bunch of fds and would prefer to notify with a single
> OP_MSG_RING when they are done, then you'd set that OP_MSG_RING flag
> that says "don't post a CQE to the target".
>
> Hence my proposal would be to keep the CQE delivery by default as it
> stands in the patch, and add a flag for controlling whether or not
> OP_MSG_RING with MSG_SEND posts a CQE to the target ring or not.
>
> Agree?
>
Sounds good.
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 2/2] io_uring: add support for passing fixed file descriptors
2022-06-19 1:59 [PATCHSET v2 for-next 0/2] Add direct descriptor ring passing Jens Axboe
@ 2022-06-19 1:59 ` Jens Axboe
0 siblings, 0 replies; 12+ messages in thread
From: Jens Axboe @ 2022-06-19 1:59 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, Jens Axboe
With IORING_OP_MSG_RING, one ring can send a message to another ring.
Extend that support to also allow sending a fixed file descriptor to
that ring, enabling one ring to pass a registered descriptor to another
one.
Arguments are extended to pass in:
sqe->addr3 fixed file slot in source ring
sqe->file_index fixed file slot in destination ring
IORING_OP_MSG_RING is extended to take a command argument in sqe->addr.
If set to zero (or IORING_MSG_DATA), it sends just a message like before.
If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according
to the above arguments.
Two common use cases for this are:
1) Server needs to be shut down or restarted, pass file descriptors to
another one
2) Backend is split, and one accepts connections, while others then get
the fd passed and handle the actual connection.
Both of those are classic SCM_RIGHTS use cases, and it's not possible to
support them with direct descriptors today.
By default, this will post a CQE to the target ring, similarly to how
IORING_MSG_DATA does it. If IORING_MSG_RING_CQE_SKIP is set, no message
is posted to the target ring. The issuer is expected to notify the
receiver side separately.
Signed-off-by: Jens Axboe <[email protected]>
---
include/uapi/linux/io_uring.h | 17 +++++
io_uring/msg_ring.c | 130 ++++++++++++++++++++++++++++++++--
2 files changed, 140 insertions(+), 7 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 8715f0942ec2..15e54e633ee2 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -47,6 +47,7 @@ struct io_uring_sqe {
__u32 unlink_flags;
__u32 hardlink_flags;
__u32 xattr_flags;
+ __u32 msg_ring_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
@@ -264,6 +265,22 @@ enum io_uring_op {
*/
#define IORING_ACCEPT_MULTISHOT (1U << 0)
+/*
+ * IORING_OP_MSG_RING command types, stored in sqe->addr
+ */
+enum {
+ IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */
+ IORING_MSG_SEND_FD, /* send a registered fd to another ring */
+};
+
+/*
+ * IORING_OP_MSG_RING flags (sqe->msg_ring_flags)
+ *
+ * IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. Not
+ * applicable for IORING_MSG_DATA, obviously.
+ */
+#define IORING_MSG_RING_CQE_SKIP (1U << 0)
+
/*
* IO completion data structure (Completion Queue Entry)
*/
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index b02be2349652..939205b30c8b 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -3,46 +3,162 @@
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
+#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
+#include "rsrc.h"
+#include "filetable.h"
#include "msg_ring.h"
struct io_msg {
struct file *file;
u64 user_data;
u32 len;
+ u32 cmd;
+ u32 src_fd;
+ u32 dst_fd;
+ u32 flags;
};
+static int io_msg_ring_data(struct io_kiocb *req)
+{
+ struct io_ring_ctx *target_ctx = req->file->private_data;
+ struct io_msg *msg = io_kiocb_to_cmd(req);
+
+ if (msg->src_fd || msg->dst_fd || msg->flags)
+ return -EINVAL;
+
+ if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+ return 0;
+
+ return -EOVERFLOW;
+}
+
+static void io_double_unlock_ctx(struct io_ring_ctx *ctx,
+ struct io_ring_ctx *octx,
+ unsigned int issue_flags)
+{
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_unlock(&ctx->uring_lock);
+ mutex_unlock(&octx->uring_lock);
+}
+
+static int io_double_lock_ctx(struct io_ring_ctx *ctx,
+ struct io_ring_ctx *octx,
+ unsigned int issue_flags)
+{
+ /*
+ * To ensure proper ordering between the two ctxs, we can only
+ * attempt a trylock on the target. If that fails and we already have
+ * the source ctx lock, punt to io-wq.
+ */
+ if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+ if (!mutex_trylock(&octx->uring_lock))
+ return -EAGAIN;
+ return 0;
+ }
+
+ /* Always grab smallest value ctx first. We know ctx != octx. */
+ if (ctx < octx) {
+ mutex_lock(&ctx->uring_lock);
+ mutex_lock(&octx->uring_lock);
+ } else {
+ mutex_lock(&octx->uring_lock);
+ mutex_lock(&ctx->uring_lock);
+ }
+
+ return 0;
+}
+
+static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_ring_ctx *target_ctx = req->file->private_data;
+ struct io_msg *msg = io_kiocb_to_cmd(req);
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned long file_ptr;
+ struct file *src_file;
+ int ret;
+
+ if (target_ctx == ctx)
+ return -EINVAL;
+
+ ret = io_double_lock_ctx(ctx, target_ctx, issue_flags);
+ if (unlikely(ret))
+ return ret;
+
+ ret = -EBADF;
+ if (unlikely(msg->src_fd >= ctx->nr_user_files))
+ goto out_unlock;
+
+ msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files);
+ file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr;
+ src_file = (struct file *) (file_ptr & FFS_MASK);
+ get_file(src_file);
+
+ ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
+ if (ret < 0) {
+ fput(src_file);
+ goto out_unlock;
+ }
+
+ if (msg->flags & IORING_MSG_RING_CQE_SKIP)
+ goto out_unlock;
+
+ /*
+ * If this fails, the target still received the file descriptor but
+ * wasn't notified of the fact. This means that if this request
+ * completes with -EOVERFLOW, then the sender must ensure that a
+ * later IORING_OP_MSG_RING delivers the message.
+ */
+ if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+ ret = -EOVERFLOW;
+out_unlock:
+ io_double_unlock_ctx(ctx, target_ctx, issue_flags);
+ return ret;
+}
+
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_msg *msg = io_kiocb_to_cmd(req);
- if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in ||
- sqe->buf_index || sqe->personality))
+ if (unlikely(sqe->buf_index || sqe->personality))
return -EINVAL;
msg->user_data = READ_ONCE(sqe->off);
msg->len = READ_ONCE(sqe->len);
+ msg->cmd = READ_ONCE(sqe->addr);
+ msg->src_fd = READ_ONCE(sqe->addr3);
+ msg->dst_fd = READ_ONCE(sqe->file_index);
+ msg->flags = READ_ONCE(sqe->msg_ring_flags);
+ if (msg->flags & ~IORING_MSG_RING_CQE_SKIP)
+ return -EINVAL;
+
return 0;
}
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_msg *msg = io_kiocb_to_cmd(req);
- struct io_ring_ctx *target_ctx;
int ret;
ret = -EBADFD;
if (!io_is_uring_fops(req->file))
goto done;
- ret = -EOVERFLOW;
- target_ctx = req->file->private_data;
- if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
- ret = 0;
+ switch (msg->cmd) {
+ case IORING_MSG_DATA:
+ ret = io_msg_ring_data(req);
+ break;
+ case IORING_MSG_SEND_FD:
+ ret = io_msg_send_fd(req, issue_flags);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
done:
if (ret < 0)
--
2.35.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCHSET v3] Add direct descriptor ring passing
@ 2022-06-22 23:16 Jens Axboe
2022-06-22 23:16 ` [PATCH 1/2] io_uring: split out fixed file installation and removal Jens Axboe
2022-06-22 23:16 ` [PATCH 2/2] io_uring: add support for passing fixed file descriptors Jens Axboe
0 siblings, 2 replies; 12+ messages in thread
From: Jens Axboe @ 2022-06-22 23:16 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, carter.li, hao.xu
Hi,
One of the things we currently cannot do with direct descriptors is pass
it to another application or ring. This adds support for doing so, through
the IORING_OP_MSG_RING ring-to-ring messaging opcode.
Changes since v2:
- Add flag for controlling whether to post a CQE to the target ring or
not.
--
Jens Axboe
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 1/2] io_uring: split out fixed file installation and removal
2022-06-22 23:16 [PATCHSET v3] Add direct descriptor ring passing Jens Axboe
@ 2022-06-22 23:16 ` Jens Axboe
2022-06-22 23:16 ` [PATCH 2/2] io_uring: add support for passing fixed file descriptors Jens Axboe
1 sibling, 0 replies; 12+ messages in thread
From: Jens Axboe @ 2022-06-22 23:16 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, carter.li, hao.xu, Jens Axboe
Put it with the filetable code, which is where it belongs. While doing
so, have the helpers take a ctx rather than an io_kiocb. It doesn't make
sense to use a request, as it's not an operation on the request itself.
It applies to the ring itself.
Signed-off-by: Jens Axboe <[email protected]>
---
io_uring/filetable.c | 72 +++++++++++++++++++++++++++++++++-----------
io_uring/filetable.h | 3 ++
io_uring/openclose.c | 35 +++------------------
io_uring/openclose.h | 2 +-
io_uring/rsrc.c | 2 +-
5 files changed, 63 insertions(+), 51 deletions(-)
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 534e1a3c625d..abaa5ba7f655 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -58,11 +58,10 @@ void io_free_file_tables(struct io_file_table *table)
table->bitmap = NULL;
}
-static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
- unsigned int issue_flags, u32 slot_index)
+static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
+ u32 slot_index)
__must_hold(&req->ctx->uring_lock)
{
- struct io_ring_ctx *ctx = req->ctx;
bool needs_switch = false;
struct io_fixed_file *file_slot;
int ret;
@@ -108,6 +107,26 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
return ret;
}
+int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,
+ unsigned int file_slot)
+{
+ bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC;
+ int ret;
+
+ if (alloc_slot) {
+ ret = io_file_bitmap_get(ctx);
+ if (unlikely(ret < 0))
+ return ret;
+ file_slot = ret;
+ } else {
+ file_slot--;
+ }
+
+ ret = io_install_fixed_file(ctx, file, file_slot);
+ if (!ret && alloc_slot)
+ ret = file_slot;
+ return ret;
+}
/*
* Note when io_fixed_fd_install() returns error value, it will ensure
* fput() is called correspondingly.
@@ -115,27 +134,44 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
struct file *file, unsigned int file_slot)
{
- bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC;
struct io_ring_ctx *ctx = req->ctx;
int ret;
io_ring_submit_lock(ctx, issue_flags);
-
- if (alloc_slot) {
- ret = io_file_bitmap_get(ctx);
- if (unlikely(ret < 0))
- goto err;
- file_slot = ret;
- } else {
- file_slot--;
- }
-
- ret = io_install_fixed_file(req, file, issue_flags, file_slot);
- if (!ret && alloc_slot)
- ret = file_slot;
-err:
+ ret = __io_fixed_fd_install(ctx, file, file_slot);
io_ring_submit_unlock(ctx, issue_flags);
+
if (unlikely(ret < 0))
fput(file);
return ret;
}
+
+int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
+{
+ struct io_fixed_file *file_slot;
+ struct file *file;
+ int ret;
+
+ if (unlikely(!ctx->file_data))
+ return -ENXIO;
+ if (offset >= ctx->nr_user_files)
+ return -EINVAL;
+ ret = io_rsrc_node_switch_start(ctx);
+ if (ret)
+ return ret;
+
+ offset = array_index_nospec(offset, ctx->nr_user_files);
+ file_slot = io_fixed_file_slot(&ctx->file_table, offset);
+ if (!file_slot->file_ptr)
+ return -EBADF;
+
+ file = (struct file *)(file_slot->file_ptr & FFS_MASK);
+ ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
+ if (ret)
+ return ret;
+
+ file_slot->file_ptr = 0;
+ io_file_bitmap_clear(&ctx->file_table, offset);
+ io_rsrc_node_switch(ctx, ctx->file_data);
+ return 0;
+}
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index fb5a274c08ff..79eb50c1980e 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -29,6 +29,9 @@ void io_free_file_tables(struct io_file_table *table);
int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
struct file *file, unsigned int file_slot);
+int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file,
+ unsigned int file_slot);
+int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset);
unsigned int io_file_get_flags(struct file *file);
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index 099a5ec84dfd..d1818ec9169b 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -173,42 +173,15 @@ void io_open_cleanup(struct io_kiocb *req)
putname(open->filename);
}
-int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags,
+int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags,
unsigned int offset)
{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_fixed_file *file_slot;
- struct file *file;
int ret;
io_ring_submit_lock(ctx, issue_flags);
- ret = -ENXIO;
- if (unlikely(!ctx->file_data))
- goto out;
- ret = -EINVAL;
- if (offset >= ctx->nr_user_files)
- goto out;
- ret = io_rsrc_node_switch_start(ctx);
- if (ret)
- goto out;
-
- offset = array_index_nospec(offset, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(&ctx->file_table, offset);
- ret = -EBADF;
- if (!file_slot->file_ptr)
- goto out;
-
- file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
- if (ret)
- goto out;
-
- file_slot->file_ptr = 0;
- io_file_bitmap_clear(&ctx->file_table, offset);
- io_rsrc_node_switch(ctx, ctx->file_data);
- ret = 0;
-out:
+ ret = io_fixed_fd_remove(ctx, offset);
io_ring_submit_unlock(ctx, issue_flags);
+
return ret;
}
@@ -216,7 +189,7 @@ static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_close *close = io_kiocb_to_cmd(req);
- return __io_close_fixed(req, issue_flags, close->file_slot - 1);
+ return __io_close_fixed(req->ctx, issue_flags, close->file_slot - 1);
}
int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
diff --git a/io_uring/openclose.h b/io_uring/openclose.h
index 9f578f3fad87..4b1c28d3a66c 100644
--- a/io_uring/openclose.h
+++ b/io_uring/openclose.h
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
-int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags,
+int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags,
unsigned int offset);
int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 3a2a5ef263f0..c49217f9cfc6 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -700,7 +700,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req,
if (ret < 0)
break;
if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
- __io_close_fixed(req, issue_flags, ret);
+ __io_close_fixed(req->ctx, issue_flags, ret);
ret = -EFAULT;
break;
}
--
2.35.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH 2/2] io_uring: add support for passing fixed file descriptors
2022-06-22 23:16 [PATCHSET v3] Add direct descriptor ring passing Jens Axboe
2022-06-22 23:16 ` [PATCH 1/2] io_uring: split out fixed file installation and removal Jens Axboe
@ 2022-06-22 23:16 ` Jens Axboe
1 sibling, 0 replies; 12+ messages in thread
From: Jens Axboe @ 2022-06-22 23:16 UTC (permalink / raw)
To: io-uring; +Cc: asml.silence, carter.li, hao.xu, Jens Axboe
With IORING_OP_MSG_RING, one ring can send a message to another ring.
Extend that support to also allow sending a fixed file descriptor to
that ring, enabling one ring to pass a registered descriptor to another
one.
Arguments are extended to pass in:
sqe->addr3 fixed file slot in source ring
sqe->file_index fixed file slot in destination ring
IORING_OP_MSG_RING is extended to take a command argument in sqe->addr.
If set to zero (or IORING_MSG_DATA), it sends just a message like before.
If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according
to the above arguments.
Two common use cases for this are:
1) Server needs to be shut down or restarted, pass file descriptors to
another one
2) Backend is split, and one accepts connections, while others then get
the fd passed and handle the actual connection.
Both of those are classic SCM_RIGHTS use cases, and it's not possible to
support them with direct descriptors today.
By default, this will post a CQE to the target ring, similarly to how
IORING_MSG_DATA does it. If IORING_MSG_RING_CQE_SKIP is set, no message
is posted to the target ring. The issuer is expected to notify the
receiver side separately.
Signed-off-by: Jens Axboe <[email protected]>
---
include/uapi/linux/io_uring.h | 17 +++++
io_uring/msg_ring.c | 130 ++++++++++++++++++++++++++++++++--
2 files changed, 140 insertions(+), 7 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 8715f0942ec2..15e54e633ee2 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -47,6 +47,7 @@ struct io_uring_sqe {
__u32 unlink_flags;
__u32 hardlink_flags;
__u32 xattr_flags;
+ __u32 msg_ring_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
@@ -264,6 +265,22 @@ enum io_uring_op {
*/
#define IORING_ACCEPT_MULTISHOT (1U << 0)
+/*
+ * IORING_OP_MSG_RING command types, stored in sqe->addr
+ */
+enum {
+ IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */
+ IORING_MSG_SEND_FD, /* send a registered fd to another ring */
+};
+
+/*
+ * IORING_OP_MSG_RING flags (sqe->msg_ring_flags)
+ *
+ * IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. Not
+ * applicable for IORING_MSG_DATA, obviously.
+ */
+#define IORING_MSG_RING_CQE_SKIP (1U << 0)
+
/*
* IO completion data structure (Completion Queue Entry)
*/
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index b02be2349652..939205b30c8b 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -3,46 +3,162 @@
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
+#include <linux/nospec.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
+#include "rsrc.h"
+#include "filetable.h"
#include "msg_ring.h"
struct io_msg {
struct file *file;
u64 user_data;
u32 len;
+ u32 cmd;
+ u32 src_fd;
+ u32 dst_fd;
+ u32 flags;
};
+static int io_msg_ring_data(struct io_kiocb *req)
+{
+ struct io_ring_ctx *target_ctx = req->file->private_data;
+ struct io_msg *msg = io_kiocb_to_cmd(req);
+
+ if (msg->src_fd || msg->dst_fd || msg->flags)
+ return -EINVAL;
+
+ if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+ return 0;
+
+ return -EOVERFLOW;
+}
+
+static void io_double_unlock_ctx(struct io_ring_ctx *ctx,
+ struct io_ring_ctx *octx,
+ unsigned int issue_flags)
+{
+ if (issue_flags & IO_URING_F_UNLOCKED)
+ mutex_unlock(&ctx->uring_lock);
+ mutex_unlock(&octx->uring_lock);
+}
+
+static int io_double_lock_ctx(struct io_ring_ctx *ctx,
+ struct io_ring_ctx *octx,
+ unsigned int issue_flags)
+{
+ /*
+ * To ensure proper ordering between the two ctxs, we can only
+ * attempt a trylock on the target. If that fails and we already have
+ * the source ctx lock, punt to io-wq.
+ */
+ if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+ if (!mutex_trylock(&octx->uring_lock))
+ return -EAGAIN;
+ return 0;
+ }
+
+ /* Always grab smallest value ctx first. We know ctx != octx. */
+ if (ctx < octx) {
+ mutex_lock(&ctx->uring_lock);
+ mutex_lock(&octx->uring_lock);
+ } else {
+ mutex_lock(&octx->uring_lock);
+ mutex_lock(&ctx->uring_lock);
+ }
+
+ return 0;
+}
+
+static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_ring_ctx *target_ctx = req->file->private_data;
+ struct io_msg *msg = io_kiocb_to_cmd(req);
+ struct io_ring_ctx *ctx = req->ctx;
+ unsigned long file_ptr;
+ struct file *src_file;
+ int ret;
+
+ if (target_ctx == ctx)
+ return -EINVAL;
+
+ ret = io_double_lock_ctx(ctx, target_ctx, issue_flags);
+ if (unlikely(ret))
+ return ret;
+
+ ret = -EBADF;
+ if (unlikely(msg->src_fd >= ctx->nr_user_files))
+ goto out_unlock;
+
+ msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files);
+ file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr;
+ src_file = (struct file *) (file_ptr & FFS_MASK);
+ get_file(src_file);
+
+ ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
+ if (ret < 0) {
+ fput(src_file);
+ goto out_unlock;
+ }
+
+ if (msg->flags & IORING_MSG_RING_CQE_SKIP)
+ goto out_unlock;
+
+ /*
+ * If this fails, the target still received the file descriptor but
+ * wasn't notified of the fact. This means that if this request
+ * completes with -EOVERFLOW, then the sender must ensure that a
+ * later IORING_OP_MSG_RING delivers the message.
+ */
+ if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+ ret = -EOVERFLOW;
+out_unlock:
+ io_double_unlock_ctx(ctx, target_ctx, issue_flags);
+ return ret;
+}
+
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_msg *msg = io_kiocb_to_cmd(req);
- if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in ||
- sqe->buf_index || sqe->personality))
+ if (unlikely(sqe->buf_index || sqe->personality))
return -EINVAL;
msg->user_data = READ_ONCE(sqe->off);
msg->len = READ_ONCE(sqe->len);
+ msg->cmd = READ_ONCE(sqe->addr);
+ msg->src_fd = READ_ONCE(sqe->addr3);
+ msg->dst_fd = READ_ONCE(sqe->file_index);
+ msg->flags = READ_ONCE(sqe->msg_ring_flags);
+ if (msg->flags & ~IORING_MSG_RING_CQE_SKIP)
+ return -EINVAL;
+
return 0;
}
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_msg *msg = io_kiocb_to_cmd(req);
- struct io_ring_ctx *target_ctx;
int ret;
ret = -EBADFD;
if (!io_is_uring_fops(req->file))
goto done;
- ret = -EOVERFLOW;
- target_ctx = req->file->private_data;
- if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
- ret = 0;
+ switch (msg->cmd) {
+ case IORING_MSG_DATA:
+ ret = io_msg_ring_data(req);
+ break;
+ case IORING_MSG_SEND_FD:
+ ret = io_msg_send_fd(req, issue_flags);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
done:
if (ret < 0)
--
2.35.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
end of thread, other threads:[~2022-06-22 23:16 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2022-06-22 23:16 [PATCHSET v3] Add direct descriptor ring passing Jens Axboe
2022-06-22 23:16 ` [PATCH 1/2] io_uring: split out fixed file installation and removal Jens Axboe
2022-06-22 23:16 ` [PATCH 2/2] io_uring: add support for passing fixed file descriptors Jens Axboe
-- strict thread matches above, loose matches on Subject: below --
2022-06-19 1:59 [PATCHSET v2 for-next 0/2] Add direct descriptor ring passing Jens Axboe
2022-06-19 1:59 ` [PATCH 2/2] io_uring: add support for passing fixed file descriptors Jens Axboe
2022-06-17 13:45 [PATCHSET RFC for-next 0/2] Add direct descriptor ring passing Jens Axboe
2022-06-17 13:45 ` [PATCH 2/2] io_uring: add support for passing fixed file descriptors Jens Axboe
2022-06-18 11:02 ` Hao Xu
2022-06-18 11:34 ` Jens Axboe
2022-06-18 12:47 ` Hao Xu
2022-06-18 12:50 ` Jens Axboe
2022-06-18 13:09 ` Hao Xu
2022-06-18 13:16 ` Jens Axboe
2022-06-18 13:27 ` Hao Xu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox