From: Pavel Begunkov <[email protected]>
To: [email protected], [email protected],
[email protected]
Cc: Jakub Kicinski <[email protected]>,
Jonathan Lemon <[email protected]>,
"David S . Miller" <[email protected]>,
Willem de Bruijn <[email protected]>,
Eric Dumazet <[email protected]>,
Hideaki YOSHIFUJI <[email protected]>,
David Ahern <[email protected]>, Jens Axboe <[email protected]>,
Pavel Begunkov <[email protected]>
Subject: [RFC 05/12] net: optimise page get/free for bvec zc
Date: Tue, 30 Nov 2021 15:18:53 +0000 [thread overview]
Message-ID: <72608c13553a1372e7f6f7a32eb53d5d4b23a1fc.1638282789.git.asml.silence@gmail.com> (raw)
In-Reply-To: <[email protected]>
get_page() in __zerocopy_sg_from_bvec() and matching put_page()s are
expensive. However, we can avoid it if the caller can guarantee that
pages stay alive until the corresponding ubuf_info is not released.
In particular, it targets io_uring with fixed buffers following the
described contract.
Assuming that nobody yet uses bvec together with zerocopy, make all
calls with bvec iterators follow this model.
Signed-off-by: Pavel Begunkov <[email protected]>
---
include/linux/skbuff.h | 10 +++++++++-
net/core/datagram.c | 9 +++++++--
net/core/skbuff.c | 16 +++++++++++++---
net/ipv4/ip_output.c | 4 ++++
4 files changed, 33 insertions(+), 6 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 750b7518d6e2..ebb12a7d386d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -461,6 +461,11 @@ enum {
SKBFL_PURE_ZEROCOPY = BIT(2),
SKBFL_DONT_ORPHAN = BIT(3),
+
+ /* page references are managed by the ubuf_info, so it's safe to
+ * use frags only up until ubuf_info is released
+ */
+ SKBFL_MANAGED_FRAGS = BIT(4),
};
#define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
@@ -3154,7 +3159,10 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
*/
static inline void skb_frag_unref(struct sk_buff *skb, int f)
{
- __skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ if (!(shinfo->flags & SKBFL_MANAGED_FRAGS))
+ __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
}
/**
diff --git a/net/core/datagram.c b/net/core/datagram.c
index e00f7e0a7a0a..5cf0672039d6 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -642,7 +642,6 @@ static int __zerocopy_sg_from_bvec(struct sock *sk, struct sk_buff *skb,
v = mp_bvec_iter_bvec(from->bvec, bi);
copied += v.bv_len;
truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
- get_page(v.bv_page);
skb_fill_page_desc(skb, frag++, v.bv_page, v.bv_offset, v.bv_len);
bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
}
@@ -671,9 +670,15 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
struct iov_iter *from, size_t length)
{
int frag = skb_shinfo(skb)->nr_frags;
+ bool managed = skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAGS;
- if (iov_iter_is_bvec(from))
+ if (iov_iter_is_bvec(from) && (managed || frag == 0)) {
+ skb_shinfo(skb)->flags |= SKBFL_MANAGED_FRAGS;
return __zerocopy_sg_from_bvec(sk, skb, from, length);
+ }
+
+ if (managed)
+ return -EFAULT;
while (length && iov_iter_count(from)) {
struct page *pages[MAX_SKB_FRAGS];
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b23db60ea6f9..b7b087815539 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -666,10 +666,14 @@ static void skb_release_data(struct sk_buff *skb)
&shinfo->dataref))
goto exit;
- skb_zcopy_clear(skb, true);
+ if (!(shinfo->flags & SKBFL_MANAGED_FRAGS)) {
+ for (i = 0; i < shinfo->nr_frags; i++)
+ __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+ } else {
+ shinfo->flags &= ~SKBFL_MANAGED_FRAGS;
+ }
- for (i = 0; i < shinfo->nr_frags; i++)
- __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+ skb_zcopy_clear(skb, true);
if (shinfo->frag_list)
kfree_skb_list(shinfo->frag_list);
@@ -1471,6 +1475,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
/* skb frags release userspace buffers */
for (i = 0; i < num_frags; i++)
skb_frag_unref(skb, i);
+ skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAGS;
/* skb frags point to kernel buffers */
for (i = 0; i < new_frags - 1; i++) {
@@ -1597,6 +1602,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
skb_copy_header(n, skb);
+ skb_shinfo(n)->flags &= ~SKBFL_MANAGED_FRAGS;
return n;
}
EXPORT_SYMBOL(skb_copy);
@@ -1653,6 +1659,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
skb_frag_ref(skb, i);
}
skb_shinfo(n)->nr_frags = i;
+ skb_shinfo(n)->flags &= ~SKBFL_MANAGED_FRAGS;
}
if (skb_has_frag_list(skb)) {
@@ -1725,6 +1732,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
refcount_inc(&skb_uarg(skb)->refcnt);
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_frag_ref(skb, i);
+ skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAGS;
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
@@ -3788,6 +3796,8 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
if (skb_can_coalesce(skb, i, page, offset)) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
} else if (i < MAX_SKB_FRAGS) {
+ if (skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAGS)
+ return -EMSGSIZE;
get_page(page);
skb_fill_page_desc(skb, i, page, offset, size);
} else {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index f9aab355d283..e6adf96e5530 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1194,6 +1194,10 @@ static int __ip_append_data(struct sock *sk,
err = -ENOMEM;
if (!sk_page_frag_refill(sk, pfrag))
goto error;
+ if (skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAGS) {
+ err = -EMSGSIZE;
+ goto error;
+ }
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
--
2.34.0
next prev parent reply other threads:[~2021-11-30 15:20 UTC|newest]
Thread overview: 40+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-11-30 15:18 [RFC 00/12] io_uring zerocopy send Pavel Begunkov
2021-11-30 15:18 ` [RFC 01/12] skbuff: add SKBFL_DONT_ORPHAN flag Pavel Begunkov
2021-11-30 15:18 ` [RFC 02/12] skbuff: pass a struct ubuf_info in msghdr Pavel Begunkov
2021-11-30 15:18 ` [RFC 03/12] net/udp: add support msgdr::msg_ubuf Pavel Begunkov
2021-11-30 15:18 ` [RFC 04/12] net: add zerocopy_sg_from_iter for bvec Pavel Begunkov
2021-11-30 15:18 ` Pavel Begunkov [this message]
2021-12-01 19:20 ` [RFC 05/12] net: optimise page get/free for bvec zc Jonathan Lemon
2021-12-01 20:17 ` Pavel Begunkov
2021-11-30 15:18 ` [RFC 06/12] io_uring: add send notifiers registration Pavel Begunkov
2021-11-30 15:18 ` [RFC 07/12] io_uring: infrastructure for send zc notifications Pavel Begunkov
2021-11-30 15:18 ` [RFC 08/12] io_uring: wire send zc request type Pavel Begunkov
2021-11-30 15:18 ` [RFC 09/12] io_uring: add an option to flush zc notifications Pavel Begunkov
2021-11-30 15:18 ` [RFC 10/12] io_uring: opcode independent fixed buf import Pavel Begunkov
2021-11-30 15:18 ` [RFC 11/12] io_uring: sendzc with fixed buffers Pavel Begunkov
2021-11-30 15:19 ` [RFC 12/12] io_uring: cache struct ubuf_info Pavel Begunkov
2021-12-01 3:10 ` [RFC 00/12] io_uring zerocopy send David Ahern
2021-12-01 15:32 ` Pavel Begunkov
2021-12-01 17:57 ` David Ahern
[not found] ` <[email protected]>
2021-12-01 19:20 ` David Ahern
2021-12-01 20:15 ` Pavel Begunkov
2021-12-01 21:51 ` Martin KaFai Lau
2021-12-01 22:35 ` David Ahern
2021-12-01 23:07 ` Martin KaFai Lau
2021-12-01 23:18 ` Pavel Begunkov
2021-12-02 15:48 ` Pavel Begunkov
2021-12-02 17:40 ` Martin KaFai Lau
2021-12-01 20:42 ` Pavel Begunkov
2021-12-01 14:31 ` Pavel Begunkov
2021-12-01 17:49 ` David Ahern
2021-12-01 19:59 ` Pavel Begunkov
2021-12-01 18:10 ` Willem de Bruijn
2021-12-01 19:59 ` Pavel Begunkov
2021-12-01 20:29 ` Pavel Begunkov
2021-12-02 0:36 ` Willem de Bruijn
2021-12-02 16:25 ` Pavel Begunkov
2021-12-02 0:32 ` Willem de Bruijn
2021-12-02 16:45 ` Pavel Begunkov
2021-12-02 21:25 ` Willem de Bruijn
2021-12-03 16:19 ` Pavel Begunkov
2021-12-03 16:30 ` Willem de Bruijn
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=72608c13553a1372e7f6f7a32eb53d5d4b23a1fc.1638282789.git.asml.silence@gmail.com \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox