From: Keith Busch <[email protected]>
To: <[email protected]>, <[email protected]>,
<[email protected]>, <[email protected]>
Cc: <[email protected]>, <[email protected]>,
Alexander Viro <[email protected]>,
Keith Busch <[email protected]>
Subject: [PATCH 4/5] io_uring: add support for dma pre-mapping
Date: Tue, 26 Jul 2022 10:38:13 -0700 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
From: Keith Busch <[email protected]>
Provide new register operations that ask the driver behind a given file
descriptor to pre-map (and later unmap) the bvecs of already-registered
fixed buffers for DMA, using that driver's specific implementation. If the
mapping succeeds, io_uring will use the returned dma tag for future fixed
buffer requests to the same file.
Signed-off-by: Keith Busch <[email protected]>
---
include/uapi/linux/io_uring.h | 12 ++++
io_uring/io_uring.c | 129 ++++++++++++++++++++++++++++++++++
io_uring/net.c | 2 +-
io_uring/rsrc.c | 13 +++-
io_uring/rsrc.h | 16 ++++-
io_uring/rw.c | 2 +-
6 files changed, 166 insertions(+), 8 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1463cfecb56b..daacbe899d1d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -485,6 +485,10 @@ enum {
IORING_REGISTER_NOTIFIERS = 26,
IORING_UNREGISTER_NOTIFIERS = 27,
+ /* dma map registered buffers */
+ IORING_REGISTER_MAP_BUFFERS = 28,
+ IORING_REGISTER_UNMAP_BUFFERS = 29,
+
/* this goes last */
IORING_REGISTER_LAST
};
@@ -661,4 +665,12 @@ struct io_uring_recvmsg_out {
__u32 flags;
};
+/*
+ * Argument for IORING_REGISTER_MAP_BUFFERS and IORING_REGISTER_UNMAP_BUFFERS.
+ * Selects the registered buffers with index in [buf_start, buf_end).
+ */
+struct io_uring_map_buffers {
+ __s32 fd; /* file whose block device the buffers are mapped for */
+ __s32 buf_start; /* first buffer index; must be >= 0 and below the registered count */
+ __s32 buf_end; /* one past the last index; values above the registered count are clamped */
+ __u32 flags; /* must be zero */
+ __u64 rsvd[2]; /* must be zero */
+};
+
#endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 1d600a63643b..12f7354e0423 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3704,6 +3704,123 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
return ret;
}
+#ifdef CONFIG_BLOCK
+/*
+ * Copy a struct io_uring_map_buffers from userspace into @map and validate it
+ * against the ring's registered buffer table.
+ *
+ * Returns the number of buffers in [buf_start, buf_end) on success (always
+ * greater than zero), -EFAULT if the copy from @arg fails, or -EINVAL if
+ * flags or reserved fields are set or the range is empty or out of bounds.
+ * A buf_end larger than ctx->nr_user_bufs is clamped to it in @map.
+ *
+ * NOTE(review): buf_end is signed; if nr_user_bufs is an unsigned type, a
+ * negative buf_end is converted for the comparison and ends up clamped
+ * rather than rejected - confirm that is intended.
+ */
+static int get_map_range(struct io_ring_ctx *ctx,
+			 struct io_uring_map_buffers *map, void __user *arg)
+{
+	int nr_bufs;
+
+	if (copy_from_user(map, arg, sizeof(*map)))
+		return -EFAULT;
+
+	/* No flags are accepted and the reserved words must stay zero. */
+	if (map->flags || map->rsvd[0] || map->rsvd[1])
+		return -EINVAL;
+
+	if (map->buf_start < 0 || map->buf_start >= ctx->nr_user_bufs)
+		return -EINVAL;
+
+	if (map->buf_end > ctx->nr_user_bufs)
+		map->buf_end = ctx->nr_user_bufs;
+
+	nr_bufs = map->buf_end - map->buf_start;
+	return nr_bufs > 0 ? nr_bufs : -EINVAL;
+}
+
+/*
+ * Release the DMA mapping attached to a registered buffer, if it has one.
+ *
+ * The tag and its bookkeeping are cleared after the unmap. Otherwise the
+ * buffer would still look mapped: io_import_fixed() would keep handing out
+ * the released tag, a repeated call for the same buffer (explicit unmap
+ * followed by buffer teardown) would pass the stale tag to block_dma_unmap()
+ * a second time, and a later map request would fail with -EBUSY.
+ */
+void io_dma_unmap(struct io_mapped_ubuf *imu)
+{
+	if (!imu->dma_tag)
+		return;
+
+	block_dma_unmap(imu->bdev, imu->dma_tag);
+	imu->dma_tag = NULL;
+	imu->dma_file = NULL;
+	imu->bdev = NULL;
+}
+
+/*
+ * IORING_REGISTER_UNMAP_BUFFERS: drop the DMA mappings of the registered
+ * buffers selected by the struct io_uring_map_buffers at @arg.
+ *
+ * Returns 0 on success, -EPERM if the caller lacks CAP_SYS_ADMIN, or the
+ * error reported by get_map_range().
+ */
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_map_buffers map;
+	int nr, idx;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	nr = get_map_range(ctx, &map, arg);
+	if (nr < 0)
+		return nr;
+
+	/* Buffers without a mapping are skipped by io_dma_unmap() itself. */
+	for (idx = map.buf_start; idx < map.buf_end; idx++)
+		io_dma_unmap(ctx->user_bufs[idx]);
+
+	return 0;
+}
+
+/*
+ * IORING_REGISTER_MAP_BUFFERS: pre-map a range of registered buffers for DMA
+ * on the block device behind map.fd.
+ *
+ * For a block device file the device itself is used; for a regular file the
+ * block device of its superblock is used. Each buffer in the range gets the
+ * tag returned by block_dma_map() recorded together with the file and device.
+ *
+ * Returns 0 on success, or:
+ *   -EPERM       caller lacks CAP_SYS_ADMIN
+ *   -EBADF       map.fd does not refer to an open file
+ *   -EOPNOTSUPP  the file is neither a block device nor a regular file, or
+ *                no block device is associated with it
+ *   -EBUSY       a buffer in the range already has a DMA mapping
+ *   or the error from get_map_range() / block_dma_map().
+ * On failure, every mapping created by this call is undone via
+ * io_dma_unmap().
+ */
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_map_buffers map;
+	struct block_device *bdev;
+	struct file *file;
+	int ret, i;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = get_map_range(ctx, &map, arg);
+	if (ret < 0)
+		return ret;
+
+	file = fget(map.fd);
+	if (!file)
+		return -EBADF;
+
+	if (S_ISBLK(file_inode(file)->i_mode))
+		bdev = I_BDEV(file->f_mapping->host);
+	else if (S_ISREG(file_inode(file)->i_mode))
+		bdev = file->f_inode->i_sb->s_bdev;
+	else
+		bdev = NULL;
+
+	/*
+	 * Unsupported file type, or a regular file on a filesystem that has
+	 * no backing block device (s_bdev is NULL there). Leave through the
+	 * common exit so the reference taken by fget() is dropped.
+	 */
+	if (!bdev) {
+		ret = -EOPNOTSUPP;
+		goto put_file;
+	}
+
+	for (i = map.buf_start; i < map.buf_end; i++) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+		void *tag;
+
+		/* Refuse to replace an existing mapping. */
+		if (imu->dma_tag) {
+			ret = -EBUSY;
+			goto err;
+		}
+
+		tag = block_dma_map(bdev, imu->bvec, imu->nr_bvecs);
+		if (IS_ERR(tag)) {
+			ret = PTR_ERR(tag);
+			goto err;
+		}
+
+		/*
+		 * NOTE(review): dma_file is recorded without holding a
+		 * reference (the fget() reference is dropped below), so it
+		 * only serves as a pointer to compare against in
+		 * io_import_fixed(). Confirm a closed and reused struct file
+		 * cannot match it.
+		 */
+		imu->dma_tag = tag;
+		imu->dma_file = file;
+		imu->bdev = bdev;
+	}
+
+	fput(file);
+	return 0;
+err:
+	/* Undo only the buffers mapped by this call: [buf_start, i). */
+	while (--i >= map.buf_start)
+		io_dma_unmap(ctx->user_bufs[i]);
+put_file:
+	fput(file);
+	return ret;
+}
+#else /* CONFIG_BLOCK */
+/* Stub used when CONFIG_BLOCK is not set: mapping is reported as unsupported. */
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+/* Stub used when CONFIG_BLOCK is not set: unmapping is reported as unsupported. */
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BLOCK */
+
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -3870,6 +3987,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_notif_unregister(ctx);
break;
+ case IORING_REGISTER_MAP_BUFFERS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_map_buffers(ctx, arg);
+ break;
+ case IORING_REGISTER_UNMAP_BUFFERS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_unmap_buffers(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/io_uring/net.c b/io_uring/net.c
index 8276b9537194..68a996318959 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -977,7 +977,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu,
- (u64)(uintptr_t)zc->buf, zc->len);
+ (u64)(uintptr_t)zc->buf, zc->len, NULL);
if (unlikely(ret))
return ret;
} else {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 59704b9ac537..1a7a8dedbbd5 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -148,6 +148,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages)
io_unaccount_mem(ctx, imu->acct_pages);
+ io_dma_unmap(imu);
kvfree(imu);
}
*slot = NULL;
@@ -1285,6 +1286,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
imu->ubuf = (unsigned long) iov->iov_base;
imu->ubuf_end = imu->ubuf + iov->iov_len;
imu->nr_bvecs = nr_pages;
+ imu->dma_tag = NULL;
*pimu = imu;
ret = 0;
done:
@@ -1359,9 +1361,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
-int io_import_fixed(int ddir, struct iov_iter *iter,
- struct io_mapped_ubuf *imu,
- u64 buf_addr, size_t len)
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+ u64 buf_addr, size_t len, struct file *file)
{
u64 buf_end;
size_t offset;
@@ -1379,6 +1380,12 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
* and advance us to the beginning.
*/
offset = buf_addr - imu->ubuf;
+ if (imu->dma_tag && file == imu->dma_file) {
+ unsigned long nr_segs = (buf_addr & (PAGE_SIZE - 1)) +
+ (len >> PAGE_SHIFT);
+ iov_iter_dma_tag(iter, ddir, imu->dma_tag, offset, nr_segs, len);
+ return 0;
+ }
iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
if (offset) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index f3a9a177941f..6e63b7a57b34 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -50,6 +50,11 @@ struct io_mapped_ubuf {
u64 ubuf_end;
unsigned int nr_bvecs;
unsigned long acct_pages;
+ void *dma_tag;
+ struct file *dma_file;
+#ifdef CONFIG_BLOCK
+ struct block_device *bdev;
+#endif
struct bio_vec bvec[];
};
@@ -64,9 +69,14 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill);
-int io_import_fixed(int ddir, struct iov_iter *iter,
- struct io_mapped_ubuf *imu,
- u64 buf_addr, size_t len);
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+ u64 buf_addr, size_t len, struct file *file);
+
+#ifdef CONFIG_BLOCK
+/* Drops imu's DMA mapping if it has one; defined in io_uring.c. */
+void io_dma_unmap(struct io_mapped_ubuf *imu);
+#else
+/* No-op when CONFIG_BLOCK is not set. */
+static inline void io_dma_unmap(struct io_mapped_ubuf *imu) {}
+#endif
void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 2b784795103c..9e2164d09adb 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -359,7 +359,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
ssize_t ret;
if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
- ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
+ ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len, req->file);
if (ret)
return ERR_PTR(ret);
return NULL;
--
2.30.2
next prev parent reply other threads:[~2022-07-26 17:38 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-07-26 17:38 [PATCH 0/5] dma mapping optimisations Keith Busch
2022-07-26 17:38 ` [PATCH 1/5] blk-mq: add ops to dma map bvec Keith Busch
2022-07-26 17:38 ` [PATCH 2/5] iov_iter: introduce type for preregistered dma tags Keith Busch
2022-07-26 23:10 ` Al Viro
2022-07-27 13:52 ` Keith Busch
2022-07-26 17:38 ` [PATCH 3/5] block: add dma tag bio type Keith Busch
2022-07-26 17:38 ` Keith Busch [this message]
2022-07-26 23:12 ` [PATCH 4/5] io_uring: add support for dma pre-mapping Al Viro
2022-07-27 13:58 ` Keith Busch
2022-07-27 14:04 ` Al Viro
2022-07-27 15:04 ` Keith Busch
2022-07-27 22:32 ` Dave Chinner
2022-07-27 23:00 ` Keith Busch
2022-07-28 2:35 ` Dave Chinner
2022-07-28 13:25 ` Keith Busch
2022-07-27 14:11 ` Al Viro
2022-07-27 14:48 ` Keith Busch
2022-07-27 15:26 ` Al Viro
2022-07-26 17:38 ` [PATCH 5/5] nvme-pci: implement dma_map support Keith Busch
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox