From: Pavel Begunkov <[email protected]>
To: [email protected], [email protected],
[email protected]
Cc: "David S . Miller" <[email protected]>,
Jakub Kicinski <[email protected]>,
Jonathan Lemon <[email protected]>,
Willem de Bruijn <[email protected]>,
Jens Axboe <[email protected]>,
[email protected], Pavel Begunkov <[email protected]>
Subject: [RFC net-next v3 15/29] io_uring: add zc notification infrastructure
Date: Tue, 28 Jun 2022 19:56:37 +0100 [thread overview]
Message-ID: <4b2a76541e91194a146788bcd401f438f5b4b45d.1653992701.git.asml.silence@gmail.com> (raw)
In-Reply-To: <[email protected]>
Add internal part of send zerocopy notifications. There are two main
structures, the first one is struct io_notif, which carries inside
struct ubuf_info and maps 1:1 to it. io_uring will be binding a number
of zerocopy send requests to it and ask to complete (aka flush) it. When
flushed and all attached requests and skbs complete, it'll generate one
and only one CQE. They are intended to be passed into the network layer
as struct msghdr::msg_ubuf.
The second concept is notification slots. The userspace will be able to
register an array of slots and subsequently address them by the index
in the array. Slots are independent of each other. Each slot can have
only one notifier at a time (called active notifier) but many notifiers
during its lifetime. When active, a notifier is not going to post any
completion but the userspace can attach requests to it by specifying
the corresponding slot while issuing send zc requests. Eventually, the
userspace will want to "flush" the notifier, losing any way to attach
new requests to it; however, it can use the next automatically added
notifier of this slot or of any other slot.
When the network layer is done with all enqueued skbs attached to a
notifier and no longer needs the user data specified in them, the flushed
notifier will post a CQE.
Signed-off-by: Pavel Begunkov <[email protected]>
---
fs/io_uring.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 156 insertions(+)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e47629adf3f7..7d058deb5f73 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -371,6 +371,43 @@ struct io_ev_fd {
struct rcu_head rcu;
};
+/*
+ * NOTE(review): presumably the upper bound on the number of notification
+ * slots a ring may register; nothing in this patch references it yet, so
+ * confirm against the slot registration patch.
+ */
+#define IO_NOTIF_MAX_SLOTS (1U << 10)
+
+/*
+ * A single zerocopy notifier. It wraps a struct ubuf_info and carries what
+ * is needed to post its completion (see __io_notif_complete_tw()).
+ */
+struct io_notif {
+ /* embedded so container_of() can recover the notifier from a ubuf_info */
+ struct ubuf_info uarg;
+ /* ring the CQE is posted to; a ctx->refs reference is held while alive */
+ struct io_ring_ctx *ctx;
+
+ /* cqe->user_data, io_notif_slot::tag if not overridden */
+ u64 tag;
+ /* see struct io_notif_slot::seq */
+ u32 seq;
+
+ /*
+  * Deferred-completion hooks sharing storage. In this patch only
+  * commit_work is ever queued; task_work is used as the handle passed
+  * to __io_notif_complete_tw().
+  */
+ union {
+ struct callback_head task_work;
+ struct work_struct commit_work;
+ };
+};
+
+/*
+ * One entry of the ctx->notif_slots array: holds the currently active
+ * notifier of the slot plus the defaults copied into each new notifier.
+ */
+struct io_notif_slot {
+ /*
+ * Current/active notifier. A slot holds only one active notifier at a
+ * time and keeps one reference to it. Flush releases the reference and
+ * lazily replaces it with a new notifier.
+ */
+ struct io_notif *notif;
+
+ /*
+ * Default ->user_data for the CQEs of this slot's notifiers
+ */
+ u64 tag;
+ /*
+ * Notifiers of a slot live in generations, we create a new notifier
+ * only after flushing the previous one. Track the sequential number
+ * for all notifiers and copy it into the notifier's cqe->cflags
+ */
+ u32 seq;
+};
+
#define BGID_ARRAY 64
struct io_ring_ctx {
@@ -423,6 +460,8 @@ struct io_ring_ctx {
unsigned nr_user_files;
unsigned nr_user_bufs;
struct io_mapped_ubuf **user_bufs;
+ struct io_notif_slot *notif_slots;
+ unsigned nr_notif_slots;
struct io_submit_state submit_state;
@@ -2749,6 +2788,121 @@ static __cold void io_free_req(struct io_kiocb *req)
spin_unlock(&ctx->completion_lock);
}
+/*
+ * Post the CQE for a finished notifier and free it.
+ *
+ * @cb: the notifier's embedded task_work head; the notifier itself is
+ *      recovered from it with container_of().
+ */
+static void __io_notif_complete_tw(struct callback_head *cb)
+{
+ struct io_notif *notif = container_of(cb, struct io_notif, task_work);
+ struct io_ring_ctx *ctx = notif->ctx;
+
+ spin_lock(&ctx->completion_lock);
+ /* user_data = notif->tag, res = 0, cflags = notif->seq */
+ io_fill_cqe_aux(ctx, notif->tag, 0, notif->seq);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ io_cqring_ev_posted(ctx);
+
+ /* drop the ctx reference taken in io_alloc_notif() */
+ percpu_ref_put(&ctx->refs);
+ kfree(notif);
+}
+
+/*
+ * Complete a notifier synchronously: posts its CQE and frees it by calling
+ * __io_notif_complete_tw() on the embedded task_work head. @notif must not
+ * be touched after this returns.
+ */
+static inline void io_notif_complete(struct io_notif *notif)
+{
+ __io_notif_complete_tw(&notif->task_work);
+}
+
+/*
+ * Workqueue entry point: recover the notifier from its embedded commit_work
+ * and complete it (posts the CQE and frees the notifier).
+ */
+static void io_notif_complete_wq(struct work_struct *work)
+{
+ struct io_notif *notif = container_of(work, struct io_notif, commit_work);
+
+ io_notif_complete(notif);
+}
+
+/*
+ * ubuf_info callback installed by io_alloc_notif(). Each invocation drops
+ * one reference on @uarg; the invocation that drops the last one schedules
+ * completion of the owning notifier on system_unbound_wq.
+ *
+ * @skb and @success are not used here.
+ *
+ * NOTE(review): completion is punted to a workqueue rather than run inline,
+ * presumably because this callback can be invoked from a context where
+ * posting a CQE directly is not safe - confirm.
+ */
+static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
+ struct ubuf_info *uarg,
+ bool success)
+{
+ struct io_notif *notif = container_of(uarg, struct io_notif, uarg);
+
+ /* other references are still outstanding, nothing to do yet */
+ if (!refcount_dec_and_test(&uarg->refcnt))
+ return;
+ INIT_WORK(&notif->commit_work, io_notif_complete_wq);
+ queue_work(system_unbound_wq, &notif->commit_work);
+}
+
+/*
+ * Allocate and initialise a new notifier for @slot.
+ *
+ * The notifier inherits the slot's tag and takes the slot's next sequence
+ * number (slot->seq is post-incremented). It starts with a refcount of one,
+ * which belongs to the slot, and pins @ctx via ctx->refs until the notifier
+ * is completed.
+ *
+ * Returns the new notifier, or NULL if the allocation fails; on failure
+ * neither slot->seq nor ctx->refs is modified.
+ */
+static struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx,
+ struct io_notif_slot *slot)
+ __must_hold(&ctx->uring_lock)
+{
+ struct io_notif *notif;
+
+ notif = kzalloc(sizeof(*notif), GFP_ATOMIC | __GFP_ACCOUNT);
+ if (!notif)
+ return NULL;
+
+ notif->seq = slot->seq++;
+ notif->tag = slot->tag;
+ notif->ctx = ctx;
+ notif->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+ notif->uarg.callback = io_uring_tx_zerocopy_callback;
+ /* master ref owned by io_notif_slot, will be dropped on flush */
+ refcount_set(&notif->uarg.refcnt, 1);
+ /* released in __io_notif_complete_tw() */
+ percpu_ref_get(&ctx->refs);
+ return notif;
+}
+
+/*
+ * Return the active notifier of @slot, allocating one first if the slot has
+ * none. Returns NULL if that allocation fails (slot->notif stays NULL).
+ * Marked unused because nothing in this patch calls it yet.
+ */
+__attribute__((unused))
+static inline struct io_notif *io_get_notif(struct io_ring_ctx *ctx,
+ struct io_notif_slot *slot)
+{
+ if (!slot->notif)
+ slot->notif = io_alloc_notif(ctx, slot);
+ return slot->notif;
+}
+
+/*
+ * Look up notification slot @idx of @ctx.
+ *
+ * Returns a pointer into ctx->notif_slots, or NULL if @idx is out of range.
+ * Marked unused because nothing in this patch calls it yet.
+ */
+__attribute__((unused))
+static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx,
+ int idx)
+ __must_hold(&ctx->uring_lock)
+{
+ /*
+  * nr_notif_slots is unsigned, so idx is converted to unsigned for the
+  * comparison and negative values are rejected here as well.
+  */
+ if (idx >= ctx->nr_notif_slots)
+ return NULL;
+ /* clamp the index under speculation before using it to address memory */
+ idx = array_index_nospec(idx, ctx->nr_notif_slots);
+ return &ctx->notif_slots[idx];
+}
+
+/*
+ * Flush the active notifier of @slot: detach it from the slot and drop the
+ * slot's reference. If that was the last reference the notifier is completed
+ * right here; otherwise it completes later from
+ * io_uring_tx_zerocopy_callback() once the remaining references are gone.
+ *
+ * The caller must ensure slot->notif is non-NULL; it is dereferenced
+ * unconditionally.
+ */
+static void io_notif_slot_flush(struct io_notif_slot *slot)
+ __must_hold(&ctx->uring_lock)
+{
+ struct io_notif *notif = slot->notif;
+
+ slot->notif = NULL;
+
+ /*
+  * NOTE(review): on this early return the slot's reference is never
+  * dropped although slot->notif was already cleared, so the notifier
+  * would not complete - acceptable only if this path is truly
+  * unreachable; confirm.
+  */
+ if (WARN_ON_ONCE(in_interrupt()))
+ return;
+ /* drop slot's master ref */
+ if (refcount_dec_and_test(&notif->uarg.refcnt))
+ io_notif_complete(notif);
+}
+
+/*
+ * Tear down the notification slot array of @ctx: flush every slot that
+ * still has an active notifier, then free the array and reset the count.
+ *
+ * Returns 0 on success, or -ENXIO if no slots are registered.
+ */
+static __cold int io_notif_unregister(struct io_ring_ctx *ctx)
+ __must_hold(&ctx->uring_lock)
+{
+ int i;
+
+ if (!ctx->notif_slots)
+ return -ENXIO;
+
+ for (i = 0; i < ctx->nr_notif_slots; i++) {
+ struct io_notif_slot *slot = &ctx->notif_slots[i];
+
+ /* flush dereferences slot->notif, so skip empty slots */
+ if (slot->notif)
+ io_notif_slot_flush(slot);
+ }
+
+ kvfree(ctx->notif_slots);
+ ctx->notif_slots = NULL;
+ ctx->nr_notif_slots = 0;
+ return 0;
+}
+
static inline void io_remove_next_linked(struct io_kiocb *req)
{
struct io_kiocb *nxt = req->link;
@@ -11174,6 +11328,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
}
#endif
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
+ WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots);
io_mem_free(ctx->rings);
io_mem_free(ctx->sq_sqes);
@@ -11368,6 +11523,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
__io_cqring_overflow_flush(ctx, true);
xa_for_each(&ctx->personalities, index, creds)
io_unregister_personality(ctx, index);
+ io_notif_unregister(ctx);
mutex_unlock(&ctx->uring_lock);
/* failed during ring init, it couldn't have issued any requests */
--
2.36.1
next prev parent reply other threads:[~2022-06-28 19:01 UTC|newest]
Thread overview: 47+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-06-28 18:56 [RFC net-next v3 00/29] io_uring zerocopy send Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 01/29] ipv4: avoid partial copy for zc Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 02/29] ipv6: " Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 03/29] skbuff: add SKBFL_DONT_ORPHAN flag Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 04/29] skbuff: carry external ubuf_info in msghdr Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 05/29] net: bvec specific path in zerocopy_sg_from_iter Pavel Begunkov
2022-06-28 20:06 ` Al Viro
2022-06-28 21:33 ` Pavel Begunkov
2022-06-28 22:52 ` David Ahern
2022-07-04 13:31 ` Pavel Begunkov
2022-07-05 2:28 ` David Ahern
2022-07-05 14:03 ` Pavel Begunkov
2022-07-05 22:09 ` Pavel Begunkov
2022-07-06 15:11 ` David Ahern
2022-06-28 18:56 ` [RFC net-next v3 06/29] net: optimise bvec-based zc page referencing Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 07/29] net: don't track pfmemalloc for managed frags Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 08/29] skbuff: don't mix ubuf_info of different types Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 09/29] ipv4/udp: support zc with managed data Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 10/29] ipv6/udp: " Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 11/29] tcp: " Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 12/29] tcp: kill extra io_uring's uarg refcounting Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 13/29] net: let callers provide extra ubuf_info refs Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 14/29] io_uring: opcode independent fixed buf import Pavel Begunkov
2022-06-28 18:56 ` Pavel Begunkov [this message]
2022-06-28 18:56 ` [RFC net-next v3 16/29] io_uring: cache struct io_notif Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 17/29] io_uring: complete notifiers in tw Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 18/29] io_uring: add notification slot registration Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 19/29] io_uring: rename IORING_OP_FILES_UPDATE Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 20/29] io_uring: add zc notification flush requests Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 21/29] io_uring: wire send zc request type Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 22/29] io_uring: account locked pages for non-fixed zc Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 23/29] io_uring: allow to pass addr into sendzc Pavel Begunkov
2022-06-29 7:42 ` Stefan Metzmacher
2022-06-29 9:53 ` Pavel Begunkov
2022-08-13 8:45 ` Stefan Metzmacher
2022-08-15 9:46 ` Pavel Begunkov
2022-08-15 11:40 ` Stefan Metzmacher
2022-08-15 12:19 ` Pavel Begunkov
2022-08-15 13:30 ` Stefan Metzmacher
2022-08-15 14:09 ` Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 24/29] io_uring: add rsrc referencing for notifiers Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 25/29] io_uring: sendzc with fixed buffers Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 26/29] io_uring: flush notifiers after sendzc Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 27/29] io_uring: allow to override zc tag on flush Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 28/29] io_uring: batch submission notif referencing Pavel Begunkov
2022-06-28 18:56 ` [RFC net-next v3 29/29] selftests/io_uring: test zerocopy send Pavel Begunkov
2022-06-28 19:03 ` [RFC net-next v3 00/29] io_uring " Pavel Begunkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4b2a76541e91194a146788bcd401f438f5b4b45d.1653992701.git.asml.silence@gmail.com \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox