public inbox for io-uring@vger.kernel.org
 help / color / mirror / Atom feed
From: Gabriel Krisman Bertazi <krisman@suse.de>
To: axboe@kernel.dk
Cc: io-uring@vger.kernel.org,
	Gabriel Krisman Bertazi <krisman@suse.de>,
	Andrew Morton <akpm@linux-foundation.org>,
	David Hildenbrand <david@kernel.org>,
	Lorenzo Stoakes <lorenzo.stoakes@oracle.com>,
	Vlastimil Babka <vbabka@suse.cz>,
	"Liam R. Howlett" <Liam.Howlett@oracle.com>,
	Mike Rapoport <rppt@kernel.org>,
	Suren Baghdasaryan <surenb@google.com>,
	Michal Hocko <mhocko@suse.com>,
	linux-mm@kvack.org
Subject: [PATCH 2/2] io_uring: introduce IORING_OP_MMAP
Date: Thu, 29 Jan 2026 17:11:38 -0500	[thread overview]
Message-ID: <20260129221138.897715-3-krisman@suse.de> (raw)
In-Reply-To: <20260129221138.897715-1-krisman@suse.de>

This enables mmap(2) over io_uring.  The interesting part is allowing
the mapping of multiple regions with different parameters in a single
operation. This is not explored in this patch, but coalescing multiple
operations can enable batching deeper in the MM layer.

The SQE points to an array of memory descriptors to be mapped. Mappings are
backed by fd, or by anonymous memory if fd == -1. All descriptors are mapped
against the same file, but protections and flags can vary.

The API also tries to be very clear about what failed in case of an
error. The number of maps that succeeded is returned on the CQE, and the
error code of the first failed map is passed back via the descriptor
structure (which must live until completion).

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@kernel.org>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
---
 include/uapi/linux/io_uring.h |  10 +++
 io_uring/Makefile             |   2 +-
 io_uring/mmap.c               | 147 ++++++++++++++++++++++++++++++++++
 io_uring/mmap.h               |   4 +
 io_uring/opdef.c              |   9 +++
 5 files changed, 171 insertions(+), 1 deletion(-)
 create mode 100644 io_uring/mmap.c
 create mode 100644 io_uring/mmap.h

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index b5b23c0d5283..e24fe3b00059 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -74,6 +74,7 @@ struct io_uring_sqe {
 		__u32		install_fd_flags;
 		__u32		nop_flags;
 		__u32		pipe_flags;
+		__u32		mmap_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	/* pack this to avoid bogus arm OABI complaints */
@@ -303,6 +304,7 @@ enum io_uring_op {
 	IORING_OP_PIPE,
 	IORING_OP_NOP128,
 	IORING_OP_URING_CMD128,
+	IORING_OP_MMAP,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
@@ -1113,6 +1115,14 @@ struct zcrx_ctrl {
 	};
 };
 
+struct io_uring_mmap_desc {	/* one mapping request in an IORING_OP_MMAP descriptor array */
+	void __user *addr;	/* in: address passed to vm_mmap_pgoff(); out: its result or an error value */
+	unsigned long len;	/* length of the mapping */
+	unsigned long pgoff;	/* passed to vm_mmap_pgoff() as pgoff */
+	unsigned int prot;	/* protection bits for this mapping */
+	unsigned int flags;	/* OR-ed with the SQE's mmap_flags */
+};	/* NOTE(review): pointer/long field sizes depend on the ABI; confirm compat handling */
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/io_uring/Makefile b/io_uring/Makefile
index bc4e4a3fa0a5..be0fa605f87d 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_IO_URING)		+= io_uring.o opdef.o kbuf.o rsrc.o notif.o \
 					sync.o msg_ring.o advise.o openclose.o \
 					statx.o timeout.o cancel.o \
 					waitid.o register.o truncate.o \
-					memmap.o alloc_cache.o query.o
+					memmap.o mmap.o alloc_cache.o query.o
 obj-$(CONFIG_IO_URING_ZCRX)	+= zcrx.o
 obj-$(CONFIG_IO_WQ)		+= io-wq.o
 obj-$(CONFIG_FUTEX)		+= futex.o
diff --git a/io_uring/mmap.c b/io_uring/mmap.c
new file mode 100644
index 000000000000..14b960707bb2
--- /dev/null
+++ b/io_uring/mmap.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/io_uring.h>
+#include <linux/hugetlb.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/shm.h>
+#include <linux/mman.h>
+#include <linux/audit.h>
+#include "../mm/internal.h"
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "mmap.h"
+#include "rsrc.h"
+
+struct io_mmap_data {	/* per-request IORING_OP_MMAP state, filled in by io_mmap_prep() */
+	struct file *file;	/* not referenced in this file; presumably overlays req->file -- confirm */
+	unsigned long flags;	/* copy of sqe->mmap_flags */
+	struct io_uring_mmap_desc __user *uaddr;	/* user descriptor array, from sqe->addr */
+};
+struct io_mmap_async {	/* kernel copy of the descriptors, stored in req->async_data */
+	int nr_maps;	/* number of entries in maps[], taken from sqe->len */
+	struct io_uring_mmap_desc maps[] __counted_by(nr_maps);
+};
+
+#define MMAP_MAX_BATCH 1024	/* largest sqe->len accepted by io_mmap_prep() */
+
+/* Prep for IORING_OP_MMAP: read addr, mmap_flags and len from the SQE, validate
+ * them and allocate a zeroed kernel array able to hold that many descriptors.
+ */
+int io_mmap_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_mmap_data *md = io_kiocb_to_cmd(req, struct io_mmap_data);
+	struct io_mmap_async *async;
+	int count;
+
+	md->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	md->flags = READ_ONCE(sqe->mmap_flags);
+	count = READ_ONCE(sqe->len);
+	/* MAP_ANONYMOUS is only accepted together with fd == -1. */
+	if ((md->flags & MAP_ANONYMOUS) && req->cqe.fd != -1)
+		return -EINVAL;
+	if (count < 0 || count > MMAP_MAX_BATCH)
+		return -EINVAL;
+	if (!access_ok(md->uaddr, count * sizeof(*md->uaddr)))
+		return -EFAULT;
+	async = kzalloc(struct_size(async, maps, count), GFP_KERNEL);
+	if (!async)
+		return -ENOMEM;
+	async->nr_maps = count;
+	req->async_data = async;
+	req->flags |= REQ_F_ASYNC_DATA;
+	return 0;
+}
+
+/* Align *len to the huge page size; with no file, set *filp to a new anonymous
+ * hugetlb file.  Returns 0 or a negative errno (*len is left alone on error).
+ */
+static int io_prep_mmap_hugetlb(struct file **filp, unsigned long *len, int flags)
+{
+	int size_log = (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK;
+	unsigned long aligned;
+	struct hstate *hs;
+
+	if (*filp) {
+		*len = ALIGN(*len, huge_page_size(hstate_file(*filp)));
+		return 0;
+	}
+	hs = hstate_sizelog(size_log);
+	if (!hs)
+		return -EINVAL;
+	aligned = ALIGN(*len, huge_page_size(hs));
+	*filp = hugetlb_file_setup(HUGETLB_ANON_FILE, aligned, VM_NORESERVE,
+				   HUGETLB_ANONHUGE_INODE, size_log);
+	if (IS_ERR(*filp))
+		return PTR_ERR(*filp);
+	*len = aligned;
+	return 0;
+}
+
+/* Issue IORING_OP_MMAP: map the descriptors in order, stopping at the first
+ * failure, whose error is stored in its ->addr.  Every descriptor processed is
+ * copied back to userspace.  The CQE result is the number of successful maps,
+ * or a negative errno if the descriptor array could not be read or written.
+ */
+int io_mmap(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_mmap_data *mmap = io_kiocb_to_cmd(req, struct io_mmap_data);
+	struct io_mmap_async *data = req->async_data;
+	int mapped = 0, seen = 0, ret;
+
+	if (unlikely((mmap->flags & MAP_HUGETLB) && req->file &&
+		     !is_file_hugepages(req->file))) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (copy_from_user(data->maps, mmap->uaddr,
+			   data->nr_maps * sizeof(*data->maps))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	while (seen < data->nr_maps) {
+		struct io_uring_mmap_desc *desc = &data->maps[seen++];
+		unsigned long flags = mmap->flags | desc->flags;
+		unsigned long len = desc->len;
+		struct file *file = req->file;
+
+		/* MAP_ANONYMOUS and MAP_HUGETLB must be passed on the SQE. */
+		if (unlikely(desc->flags & (MAP_ANONYMOUS | MAP_HUGETLB))) {
+			desc->addr = ERR_PTR(-EINVAL);
+			break;
+		}
+		if (!(flags & MAP_ANONYMOUS))
+			audit_mmap_fd(req->cqe.fd, flags);
+		if (unlikely(flags & MAP_HUGETLB)) {
+			ret = io_prep_mmap_hugetlb(&file, &len, flags);
+			if (ret) {
+				/* ret is already a negative errno */
+				desc->addr = ERR_PTR(ret);
+				break;
+			}
+		}
+
+		desc->addr = (void *) vm_mmap_pgoff(file, (unsigned long) desc->addr,
+					   len, desc->prot, flags, desc->pgoff);
+		/* Drop our reference on a hugetlb file created for this map. */
+		if (file != req->file)
+			fput(file);
+		/* Failure is an error value; a zero address is not an error. */
+		if (IS_ERR(desc->addr))
+			break;
+		mapped++;
+	}
+
+	ret = mapped;
+	if (copy_to_user(mmap->uaddr, data->maps, seen * sizeof(*data->maps)))
+		ret = -EFAULT;
+out:
+	if (ret < 0)
+		req_set_fail(req);
+	io_req_set_res(req, ret, 0);
+	return IOU_COMPLETE;
+}
diff --git a/io_uring/mmap.h b/io_uring/mmap.h
new file mode 100644
index 000000000000..acddf6db76e7
--- /dev/null
+++ b/io_uring/mmap.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+int io_mmap_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);	/* IORING_OP_MMAP .prep handler */
+int io_mmap(struct io_kiocb *req, unsigned int issue_flags);	/* IORING_OP_MMAP .issue handler */
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index df52d760240e..679e413d2395 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -29,6 +29,7 @@
 #include "epoll.h"
 #include "statx.h"
 #include "net.h"
+#include "mmap.h"
 #include "msg_ring.h"
 #include "timeout.h"
 #include "poll.h"
@@ -593,6 +594,11 @@ const struct io_issue_def io_issue_defs[] = {
 		.prep			= io_uring_cmd_prep,
 		.issue			= io_uring_cmd,
 	},
+	[IORING_OP_MMAP] = {
+		.prep			= io_mmap_prep,
+		.issue			= io_mmap,
+		.opt_file		= 1,
+	}
 };
 
 const struct io_cold_def io_cold_defs[] = {
@@ -851,6 +857,9 @@ const struct io_cold_def io_cold_defs[] = {
 		.sqe_copy		= io_uring_cmd_sqe_copy,
 		.cleanup		= io_uring_cmd_cleanup,
 	},
+	[IORING_OP_MMAP] = {
+		.name			= "MMAP",
+	},
 };
 
 const char *io_uring_get_opcode(u8 opcode)
-- 
2.52.0


  parent reply	other threads:[~2026-01-29 22:11 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-29 22:11 [PATCH 0/2] Introduce IORING_OP_MMAP Gabriel Krisman Bertazi
2026-01-29 22:11 ` [PATCH 1/2] io_uring: Support commands with optional file descriptors Gabriel Krisman Bertazi
2026-01-29 22:11 ` Gabriel Krisman Bertazi [this message]
2026-01-30  6:03   ` [PATCH 2/2] io_uring: introduce IORING_OP_MMAP kernel test robot
2026-01-30 15:47     ` Gabriel Krisman Bertazi
2026-01-30 15:55   ` Jens Axboe
2026-02-01 17:46 ` [PATCH 0/2] Introduce IORING_OP_MMAP David Hildenbrand (arm)
2026-02-01 18:16   ` Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260129221138.897715-3-krisman@suse.de \
    --to=krisman@suse.de \
    --cc=Liam.Howlett@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=axboe@kernel.dk \
    --cc=david@kernel.org \
    --cc=io-uring@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lorenzo.stoakes@oracle.com \
    --cc=mhocko@suse.com \
    --cc=rppt@kernel.org \
    --cc=surenb@google.com \
    --cc=vbabka@suse.cz \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox