public inbox for [email protected]
 help / color / mirror / Atom feed
* RE: [PATCH 0/5] add initial io_uring_cmd support for sockets
@ 2023-05-02  9:21 Adrien Delorme
  2023-05-02 13:03 ` Pavel Begunkov
  0 siblings, 1 reply; 36+ messages in thread
From: Adrien Delorme @ 2023-05-02  9:21 UTC (permalink / raw)
  To: david.laight
  Cc: asml.silence, axboe, davem, dccp, dsahern, edumazet, io-uring,
	kuba, leit, leitao, linux-kernel, marcelo.leitner,
	matthieu.baerts, mptcp, netdev, pabeni, willemb,
	willemdebruijn.kernel

From Adrien Delorme

> From: David Ahern 
> Sent: 12 April 2023 7:39 
> > Sent: 11 April 2023 16:28
> ....
> > Christoph's patch set a few years back that removed set_fs broke the
> > ability to do in-kernel ioctl and {s,g}setsockopt calls. I did not
> > follow that change; was it a deliberate intent to not allow these
> > in-kernel calls vs wanting to remove the set_fs? e.g., can we add a
> > kioctl variant for in-kernel use of the APIs?
>
> I think that was a side effect, and with no in-tree in-kernel
> users (apart from limited calls in bpf) it was deemed acceptable.
> (It is a PITA for any code trying to use SCTP in kernel.)
>
> One problem is that not all sockopt calls pass the correct length.
> And some of them can have very long buffers.
> Not to mention the ones that are read-modify-write.
>
> A plausible solution is to pass a 'fat pointer' that contains
> some, or all, of:
>       - A userspace buffer pointer.
>       - A kernel buffer pointer.
>       - The length supplied by the user.
>       - The length of the kernel buffer.
>       - The number of bytes to copy on completion.
> For simple user requests the syscall entry/exit code
> would copy the data to a short on-stack buffer.
> Kernel users just pass the kernel address.
> Odd requests can just use the user pointer.
>
> Probably needs accessors that add in an offset.
>
> It might also be that some of the problematic sockopt
> were in decnet - now removed.

Hello everyone,

I'm currently working on an implementation of {get,set}sockopt.
Since this thread is already discussing it, I hope that I am replying in the correct place.

My implementation is rather simple: it uses a struct to pass the necessary info through sqe->cmd.

Here is my implementation, based on kernel version 6.3:

Signed-off-by: Adrien Delorme <[email protected]>

diff -uprN a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
--- a/include/uapi/linux/io_uring.h     2023-04-23 15:02:52.000000000 -0400
+++ b/include/uapi/linux/io_uring.h     2023-04-24 07:55:21.406981696 -0400
@@ -235,6 +235,25 @@ enum io_uring_op {
  */
#define IORING_URING_CMD_FIXED (1U << 0)

+/* struct io_uring_cmd->cmd_op flags for socket operations */
+#define IO_URING_CMD_OP_GETSOCKOPT 0x0
+#define IO_URING_CMD_OP_SETSOCKOPT 0x1
+
+/* Struct to pass args for IO_URING_CMD_OP_GETSOCKOPT and IO_URING_CMD_OP_SETSOCKOPT operations */
+struct args_setsockopt_uring{
+       int                             level;
+       int                     optname;
+       char __user *   user_optval;
+       int                     optlen;
+};
+
+struct args_getsockopt_uring{
+       int                             level;
+       int                     optname;
+       char __user *   user_optval;
+       int      __user *       optlen;
+};
+

/*
  * sqe->fsync_flags
diff -uprN a/net/socket.c b/net/socket.c
--- a/net/socket.c      2023-04-23 15:02:52.000000000 -0400
+++ b/net/socket.c      2023-04-24 08:06:44.800981696 -0400
@@ -108,6 +108,11 @@
#include <linux/ptp_clock_kernel.h>
#include <trace/events/sock.h>

+#ifdef CONFIG_IO_URING
+#include <uapi/linux/io_uring.h>
+#include <linux/io_uring.h>
+#endif
+
#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int sysctl_net_busy_read __read_mostly;
unsigned int sysctl_net_busy_poll __read_mostly;
@@ -132,6 +137,11 @@ static ssize_t sock_splice_read(struct f
                                struct pipe_inode_info *pipe, size_t len,
                                unsigned int flags);

+
+#ifdef CONFIG_IO_URING
+int socket_uring_cmd_handler(struct io_uring_cmd *cmd, unsigned int flags);
+#endif
+
#ifdef CONFIG_PROC_FS
static void sock_show_fdinfo(struct seq_file *m, struct file *f)
{
@@ -166,6 +176,9 @@ static const struct file_operations sock
        .splice_write = generic_splice_sendpage,
        .splice_read =  sock_splice_read,
        .show_fdinfo =  sock_show_fdinfo,
+#ifdef CONFIG_IO_URING
+       .uring_cmd = socket_uring_cmd_handler,
+#endif
};

static const char * const pf_family_names[] = {
@@ -2330,6 +2343,126 @@ SYSCALL_DEFINE5(getsockopt, int, fd, int
        return __sys_getsockopt(fd, level, optname, optval, optlen);
}

+#ifdef CONFIG_IO_URING
+
+/*
+ * Make getsockopt operation with io_uring.
+ * This function is based on __sys_getsockopt without sockfd_lookup_light
+ * since io_uring retrieves it for us.
+ */
+int uring_cmd_getsockopt(struct socket *sock, int level, int optname, char __user *optval,
+               int __user *optlen)
+{
+       int err;
+       int max_optlen;
+
+       err = security_socket_getsockopt(sock, level, optname);
+       if (err)
+               goto out_put;
+
+       if (!in_compat_syscall())
+               max_optlen = BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen);
+
+       if (level == SOL_SOCKET)
+               err = sock_getsockopt(sock, level, optname, optval, optlen);
+       else if (unlikely(!sock->ops->getsockopt))
+               err = -EOPNOTSUPP;
+       else
+               err = sock->ops->getsockopt(sock, level, optname, optval,
+                                           optlen);
+
+       if (!in_compat_syscall())
+               err = BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock->sk, level, optname,
+                                                    optval, optlen, max_optlen,
+                                                    err);
+out_put:
+       return err;
+}
+
+/*
+ * Make setsockopt operation with io_uring.
+ * This function is based on __sys_setsockopt without sockfd_lookup_light
+ * since io_uring retrieves it for us.
+ */
+int uring_cmd_setsockopt(struct socket *sock, int level, int optname, char *user_optval,
+               int optlen)
+{
+       sockptr_t optval = USER_SOCKPTR(user_optval);
+       char *kernel_optval = NULL;
+       int err;
+
+       if (optlen < 0)
+               return -EINVAL;
+
+       err = security_socket_setsockopt(sock, level, optname);
+       if (err)
+               goto out_put;
+
+       if (!in_compat_syscall())
+               err = BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock->sk, &level, &optname,
+                                                    user_optval, &optlen,
+                                                    &kernel_optval);
+       if (err < 0)
+               goto out_put;
+       if (err > 0) {
+               err = 0;
+               goto out_put;
+       }
+
+       if (kernel_optval)
+               optval = KERNEL_SOCKPTR(kernel_optval);
+       if (level == SOL_SOCKET && !sock_use_custom_sol_socket(sock))
+               err = sock_setsockopt(sock, level, optname, optval, optlen);
+       else if (unlikely(!sock->ops->setsockopt))
+               err = -EOPNOTSUPP;
+       else
+               err = sock->ops->setsockopt(sock, level, optname, optval,
+                                           optlen);
+       kfree(kernel_optval);
+out_put:
+       return err;
+}
+
+/*
+ * Handler uring_cmd socket file_operations.
+ *
+ * Operation code and struct are defined in /include/uapi/linux/io_uring.h
+ * The io_uring ring needs to be set with the flags : IORING_SETUP_SQE128 and IORING_SETUP_CQE32
+ *
+ */
+int socket_uring_cmd_handler(struct io_uring_cmd *cmd, unsigned int flags){
+
+       /* Retrieve socket */
+       struct socket *sock = sock_from_file(cmd->file);
+
+       if (!sock)
+               return -EINVAL;
+
+       /* Does the requested operation */
+       switch (cmd->cmd_op) {
+               case IO_URING_CMD_OP_GETSOCKOPT:
+                       struct args_getsockopt_uring *values_get = (struct args_getsockopt_uring *) cmd->cmd;
+                       return uring_cmd_getsockopt(sock,
+                                                                               values_get->level,
+                                                                               values_get->optname,
+                                                                               values_get->user_optval,
+                                                                               values_get->optlen);
+
+               case IO_URING_CMD_OP_SETSOCKOPT:
+                       struct args_setsockopt_uring *values_set = (struct args_setsockopt_uring *) cmd->cmd;
+                       return uring_cmd_setsockopt(sock,
+                                                                               values_set->level,
+                                                                               values_set->optname,
+                                                                               values_set->user_optval,
+                                                                               values_set->optlen);
+               default:
+                       break;
+
+       }
+       return -EINVAL;
+}
+#endif
+
/*
  *     Shutdown a socket.
  */

I would appreciate any feedback or advice you may have on this work. Hopefully it will be of some kind of help. Thank you for your time and consideration.

Adrien

^ permalink raw reply	[flat|nested] 36+ messages in thread
* [PATCH 0/5] add initial io_uring_cmd support for sockets
@ 2023-04-06 14:43 Breno Leitao
  2023-04-06 15:34 ` Willem de Bruijn
  2023-04-06 16:41 ` Keith Busch
  0 siblings, 2 replies; 36+ messages in thread
From: Breno Leitao @ 2023-04-06 14:43 UTC (permalink / raw)
  To: io-uring, netdev, kuba, asml.silence, axboe
  Cc: leit, edumazet, pabeni, davem, dccp, mptcp, linux-kernel,
	dsahern, willemdebruijn.kernel, matthieu.baerts, marcelo.leitner

From: Breno Leitao <[email protected]>

This patchset creates the initial plumbing for an io_uring command for
sockets.

For now, create two uring commands for sockets, SOCKET_URING_OP_SIOCOUTQ
and SOCKET_URING_OP_SIOCINQ. They are similar to ioctl operations
SIOCOUTQ and SIOCINQ. In fact, the code on the protocol side itself is
heavily based on the ioctl operations.

In order to test this code, I created a liburing test, which is
currently located at [1], and I will create a pull request once we are
good with this patch.

I've also run test/io_uring_passthrough to make sure the first patch
didn't regress the NVMe passthrough path.

This patchset is an RFC for two different reasons:
  * It slightly changes how io_uring commands operate, i.e., we are
    now passing the whole SQE to the io_uring_cmd callback (instead of
    an opaque buffer). This seems to be more palatable instead of
    creating some custom structure just to fit small parameters, as in
    SOCKET_URING_OP_SIOC{IN,OUT}Q. Is this OK?

  * Pavel has some ideas about the SQE->cmd_op field, so, we can start
    discussing it here.

This work is heavily inspired by Jens Axboe's initial implementation.

[1] https://github.com/leitao/liburing/blob/master/test/socket-io-cmd.c

Breno Leitao (4):
  net: wire up support for file_operations->uring_cmd()
  net: add uring_cmd callback to UDP
  net: add uring_cmd callback to TCP
  net: add uring_cmd callback to raw "protocol"

 include/linux/net.h      |  2 ++
 include/net/raw.h        |  3 +++
 include/net/sock.h       |  6 ++++++
 include/net/tcp.h        |  2 ++
 include/net/udp.h        |  2 ++
 include/uapi/linux/net.h |  5 +++++
 net/core/sock.c          | 17 +++++++++++++++--
 net/dccp/ipv4.c          |  1 +
 net/ipv4/af_inet.c       |  3 +++
 net/ipv4/raw.c           | 26 ++++++++++++++++++++++++++
 net/ipv4/tcp.c           | 34 ++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c      |  1 +
 net/ipv4/udp.c           | 18 ++++++++++++++++++
 net/l2tp/l2tp_ip.c       |  1 +
 net/mptcp/protocol.c     |  1 +
 net/sctp/protocol.c      |  1 +
 net/socket.c             | 13 +++++++++++++
 17 files changed, 134 insertions(+), 2 deletions(-)

-- 
2.34.1




^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2023-05-03 13:27 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-02  9:21 [PATCH 0/5] add initial io_uring_cmd support for sockets Adrien Delorme
2023-05-02 13:03 ` Pavel Begunkov
2023-05-03 13:11   ` Adrien Delorme
2023-05-03 13:27     ` David Laight
  -- strict thread matches above, loose matches on Subject: below --
2023-04-06 14:43 Breno Leitao
2023-04-06 15:34 ` Willem de Bruijn
2023-04-06 15:59   ` Breno Leitao
2023-04-06 18:16     ` Willem de Bruijn
2023-04-07  2:46       ` David Ahern
2023-04-11 12:00         ` Breno Leitao
2023-04-11 14:36           ` David Ahern
2023-04-11 14:41             ` Jens Axboe
2023-04-11 14:51               ` Willem de Bruijn
2023-04-11 14:54                 ` Jens Axboe
2023-04-11 15:00                   ` Willem de Bruijn
2023-04-11 15:06                     ` Jens Axboe
2023-04-11 15:24                       ` Willem de Bruijn
2023-04-11 15:28                         ` Jens Axboe
2023-04-12 13:53                           ` Breno Leitao
2023-04-12 14:28                             ` Willem de Bruijn
2023-04-13  0:02                               ` Breno Leitao
2023-04-13 14:24                                 ` Willem de Bruijn
2023-04-13 14:45                                   ` Jakub Kicinski
2023-04-13 14:57                                   ` David Laight
2023-04-18 13:23                                   ` Breno Leitao
2023-04-18 19:41                                     ` Willem de Bruijn
2023-04-20 14:43                                       ` Breno Leitao
2023-04-20 16:48                                         ` Willem de Bruijn
2023-04-11 15:10               ` David Ahern
2023-04-11 15:17                 ` Jens Axboe
2023-04-11 15:27                   ` David Ahern
2023-04-11 15:29                     ` Jens Axboe
2023-04-12  7:39                     ` David Laight
2023-04-06 16:41 ` Keith Busch
2023-04-06 16:49   ` Jens Axboe
2023-04-06 16:58   ` Breno Leitao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox