From: lizetao <[email protected]>
To: David Wei <[email protected]>,
	"[email protected]" <[email protected]>,
	"[email protected]" <[email protected]>
Cc: Jens Axboe <[email protected]>,
	Pavel Begunkov <[email protected]>,
	Jakub Kicinski <[email protected]>, Paolo Abeni <[email protected]>,
	"David S. Miller" <[email protected]>,
	Eric Dumazet <[email protected]>,
	"Jesper Dangaard Brouer" <[email protected]>,
	David Ahern <[email protected]>,
	"Mina Almasry" <[email protected]>,
	Stanislav Fomichev <[email protected]>,
	Joe Damato <[email protected]>,
	Pedro Tammela <[email protected]>
Subject: RE: [PATCH net-next v11 12/21] io_uring/zcrx: add io_zcrx_area
Date: Mon, 27 Jan 2025 02:55:56 +0000
Message-ID: <[email protected]>
In-Reply-To: <[email protected]>

Hi,

> -----Original Message-----
> From: David Wei <[email protected]>
> Sent: Friday, January 17, 2025 7:17 AM
> To: [email protected]; [email protected]
> Cc: Jens Axboe <[email protected]>; Pavel Begunkov <[email protected]>;
> Jakub Kicinski <[email protected]>; Paolo Abeni <[email protected]>; David S.
> Miller <[email protected]>; Eric Dumazet <[email protected]>;
> Jesper Dangaard Brouer <[email protected]>; David Ahern
> <[email protected]>; Mina Almasry <[email protected]>; Stanislav
> Fomichev <[email protected]>; Joe Damato <[email protected]>;
> Pedro Tammela <[email protected]>
> Subject: [PATCH net-next v11 12/21] io_uring/zcrx: add io_zcrx_area
> 
> Add io_zcrx_area that represents a region of userspace memory that is used for
> zero copy. During ifq registration, userspace passes in the uaddr and len of
> userspace memory, which is then pinned by the kernel.
> Each net_iov is mapped to one of these pages.
> 
> The freelist is a spinlock-protected list that keeps track of all the net_iovs/pages
> that aren't in use.
> 
> For now, there is only one area per ifq and area registration happens implicitly
> as part of ifq registration. There is no API for adding/removing areas yet. The
> struct for area registration is there for future extensibility once we support
> multiple areas and TCP devmem.
> 
> Reviewed-by: Jens Axboe <[email protected]>
> Signed-off-by: Pavel Begunkov <[email protected]>
> Signed-off-by: David Wei <[email protected]>
> ---
>  include/uapi/linux/io_uring.h |  9 ++++
>  io_uring/rsrc.c               |  2 +-
>  io_uring/rsrc.h               |  1 +
>  io_uring/zcrx.c               | 89 ++++++++++++++++++++++++++++++++++-
>  io_uring/zcrx.h               | 16 +++++++
>  5 files changed, 114 insertions(+), 3 deletions(-)
> 
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 3af8b7a19824..e251f28507ce 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -980,6 +980,15 @@ struct io_uring_zcrx_offsets {
>  	__u64	__resv[2];
>  };
> 
> +struct io_uring_zcrx_area_reg {
> +	__u64	addr;
> +	__u64	len;
> +	__u64	rq_area_token;
> +	__u32	flags;
> +	__u32	__resv1;
> +	__u64	__resv2[2];
> +};
> +
>  /*
>   * Argument for IORING_REGISTER_ZCRX_IFQ
>   */
> diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
> index f2ff108485c8..d0f11b5aec0d 100644
> --- a/io_uring/rsrc.c
> +++ b/io_uring/rsrc.c
> @@ -77,7 +77,7 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
>  	return 0;
>  }
> 
> -static int io_buffer_validate(struct iovec *iov)
> +int io_buffer_validate(struct iovec *iov)
>  {
>  	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
> 
> diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
> index c8b093584461..0ae54ddeb1fd 100644
> --- a/io_uring/rsrc.h
> +++ b/io_uring/rsrc.h
> @@ -66,6 +66,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
>  			    unsigned size, unsigned type);
>  int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
>  			unsigned int size, unsigned int type);
> +int io_buffer_validate(struct iovec *iov);
> 
>  bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
>  			      struct io_imu_folio_data *data);
> diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
> index f3ace7e8264d..04883a3ae80c 100644
> --- a/io_uring/zcrx.c
> +++ b/io_uring/zcrx.c
> @@ -10,6 +10,7 @@
>  #include "kbuf.h"
>  #include "memmap.h"
>  #include "zcrx.h"
> +#include "rsrc.h"
> 
>  #define IO_RQ_MAX_ENTRIES		32768
> 
> @@ -44,6 +45,79 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
>  	ifq->rqes = NULL;
>  }
> 
> +static void io_zcrx_free_area(struct io_zcrx_area *area)
> +{
> +	kvfree(area->freelist);
> +	kvfree(area->nia.niovs);
> +	if (area->pages) {
> +		unpin_user_pages(area->pages, area->nia.num_niovs);
> +		kvfree(area->pages);
> +	}
> +	kfree(area);
> +}
> +
> +static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
> +			       struct io_zcrx_area **res,
> +			       struct io_uring_zcrx_area_reg *area_reg)
> +{
> +	struct io_zcrx_area *area;
> +	int i, ret, nr_pages;
> +	struct iovec iov;
> +
> +	if (area_reg->flags || area_reg->rq_area_token)
> +		return -EINVAL;
> +	if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
> +		return -EINVAL;
> +	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
> +		return -EINVAL;
> +
> +	iov.iov_base = u64_to_user_ptr(area_reg->addr);
> +	iov.iov_len = area_reg->len;
> +	ret = io_buffer_validate(&iov);
> +	if (ret)
> +		return ret;
> +
> +	ret = -ENOMEM;
> +	area = kzalloc(sizeof(*area), GFP_KERNEL);
> +	if (!area)
> +		goto err;
> +
> +	area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
> +				   &nr_pages);
> +	if (IS_ERR(area->pages)) {
> +		ret = PTR_ERR(area->pages);
> +		area->pages = NULL;
> +		goto err;
> +	}
> +	area->nia.num_niovs = nr_pages;
> +
> +	area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]),
> +					 GFP_KERNEL | __GFP_ZERO);
> +	if (!area->nia.niovs)
> +		goto err;
> +
> +	area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]),
> +					GFP_KERNEL | __GFP_ZERO);
> +	if (!area->freelist)
> +		goto err;
> +
> +	for (i = 0; i < nr_pages; i++)
> +		area->freelist[i] = i;

This initialization is redundant, as patch 14 will reinitialize the freelist.
> +
> +	area->free_count = nr_pages;
> +	area->ifq = ifq;
> +	/* we're only supporting one area per ifq for now */
> +	area->area_id = 0;
> +	area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT;
> +	spin_lock_init(&area->freelist_lock);
> +	*res = area;
> +	return 0;
> +err:
> +	if (area)
> +		io_zcrx_free_area(area);
> +	return ret;
> +}
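Not an issue with the patch, just a note for anyone wiring up a test against
this: the checks at the top of io_zcrx_create_area() are easy to mirror in
userspace before issuing the registration. A possible pre-check is sketched
below; the helper name is mine, it needs <unistd.h> for sysconf(), and it only
restates the conditions above (zero flags/token/reserved fields, page-aligned
addr and len).

	/* mirrors the validation done by io_zcrx_create_area() */
	static int zcrx_area_reg_valid(const struct io_uring_zcrx_area_reg *r)
	{
		__u64 page = (__u64)sysconf(_SC_PAGESIZE);

		if (r->flags || r->rq_area_token)
			return 0;
		if (r->__resv1 || r->__resv2[0] || r->__resv2[1])
			return 0;
		return !(r->addr & (page - 1)) && !(r->len & (page - 1));
	}

Also, since area_id is always 0 for now, the rq_area_token written back to
userspace is simply 0; I assume later patches in the series combine it with
in-area offsets.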
> +
>  static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
>  {
>  	struct io_zcrx_ifq *ifq;
> @@ -59,6 +133,9 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
> 
>  static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
>  {
> +	if (ifq->area)
> +		io_zcrx_free_area(ifq->area);
> +
>  	io_free_rbuf_ring(ifq);
>  	kfree(ifq);
>  }
> @@ -66,6 +143,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
>  int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
>  			  struct io_uring_zcrx_ifq_reg __user *arg)
>  {
> +	struct io_uring_zcrx_area_reg area;
>  	struct io_uring_zcrx_ifq_reg reg;
>  	struct io_uring_region_desc rd;
>  	struct io_zcrx_ifq *ifq;
> @@ -99,7 +177,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
>  	}
>  	reg.rq_entries = roundup_pow_of_two(reg.rq_entries);
> 
> -	if (!reg.area_ptr)
> +	if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area)))
>  		return -EFAULT;
> 
>  	ifq = io_zcrx_ifq_alloc(ctx);
> @@ -110,6 +188,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
>  	if (ret)
>  		goto err;
> 
> +	ret = io_zcrx_create_area(ifq, &ifq->area, &area);
> +	if (ret)
> +		goto err;
> +
>  	ifq->rq_entries = reg.rq_entries;
>  	ifq->if_rxq = reg.if_rxq;
> 
> @@ -122,7 +204,10 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
>  		ret = -EFAULT;
>  		goto err;
>  	}
> -
> +	if (copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) {
> +		ret = -EFAULT;
> +		goto err;
> +	}
>  	ctx->ifq = ifq;
>  	return 0;
>  err:
> diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
> index 58e4ab6c6083..53fd94b65b38 100644
> --- a/io_uring/zcrx.h
> +++ b/io_uring/zcrx.h
> @@ -3,9 +3,25 @@
>  #define IOU_ZC_RX_H
> 
>  #include <linux/io_uring_types.h>
> +#include <net/page_pool/types.h>
> +
> +struct io_zcrx_area {
> +	struct net_iov_area	nia;
> +	struct io_zcrx_ifq	*ifq;
> +
> +	u16			area_id;
> +	struct page		**pages;
> +
> +	/* freelist */
> +	spinlock_t		freelist_lock ____cacheline_aligned_in_smp;
> +	u32			free_count;
> +	u32			*freelist;
> +};
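Tying this back to the cover text about the freelist: as I read it, the
freelist entries are indices into nia.niovs and get popped LIFO under
freelist_lock. Purely to check my understanding (patch 14 presumably adds the
real helper, and whether a plain or _bh lock is needed depends on the calling
context there), a pop would look something like:

	static struct net_iov *zcrx_pop_free_niov(struct io_zcrx_area *area)
	{
		struct net_iov *niov = NULL;

		spin_lock(&area->freelist_lock);
		if (area->free_count)
			niov = &area->nia.niovs[area->freelist[--area->free_count]];
		spin_unlock(&area->freelist_lock);
		return niov;
	}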
> 
>  struct io_zcrx_ifq {
>  	struct io_ring_ctx		*ctx;
> +	struct io_zcrx_area		*area;
> +
>  	struct io_uring			*rq_ring;
>  	struct io_uring_zcrx_rqe	*rqes;
>  	u32				rq_entries;
> --
> 2.43.5
> 
> 

--
Li Zetao

