From: Jens Axboe <axboe@kernel.dk>
To: io-uring@vger.kernel.org
Cc: Jens Axboe <axboe@kernel.dk>
Subject: [PATCH 2/3] io_uring: add support for BPF filtering for opcode restrictions
Date: Thu, 15 Jan 2026 09:36:33 -0700
Message-ID: <20260115165244.1037465-3-axboe@kernel.dk>
In-Reply-To: <20260115165244.1037465-1-axboe@kernel.dk>
This adds support for loading BPF programs with io_uring, which can
restrict the opcodes performed. Unlike IORING_REGISTER_RESTRICTIONS,
using BPF programs allows fine-grained control over both the opcode
in question and other data associated with the request.

Initially only IORING_OP_SOCKET is supported.
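
A rough sketch of how this could be used, for illustration only - the
SEC() name and the userspace plumbing are assumptions, only the uapi
additions in this patch (struct io_uring_bpf_ctx, struct io_uring_bpf,
IORING_REGISTER_BPF_FILTER) are definitive. A filter program returns
non-zero to allow a request and 0 to deny it:

	/* filter_socket.bpf.c - only allow IPv4/IPv6 stream sockets */
	#include <linux/bpf.h>
	#include <linux/io_uring.h>
	#include <bpf/bpf_helpers.h>

	#define AF_INET		2
	#define AF_INET6	10
	#define SOCK_STREAM	1

	SEC("io_uring")	/* hypothetical section name */
	int filter_socket(struct io_uring_bpf_ctx *ctx)
	{
		/* ctx->socket is only populated for IORING_OP_SOCKET */
		if (ctx->socket.family != AF_INET &&
		    ctx->socket.family != AF_INET6)
			return 0;
		return ctx->socket.type == SOCK_STREAM;
	}

	char LICENSE[] SEC("license") = "GPL";

After loading the program with BPF_PROG_TYPE_IO_URING, it would be
attached to a ring for a given opcode with io_uring_register(2),
e.g. via the raw syscall (ring_fd and prog_fd set up elsewhere, and
<unistd.h>, <sys/syscall.h>, <linux/io_uring.h> included):

	struct io_uring_bpf reg = {
		.cmd_type = IO_URING_BPF_CMD_FILTER,
		.filter = {
			.opcode	 = IORING_OP_SOCKET,
			.prog_fd = prog_fd,
		},
	};

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_BPF_FILTER, &reg, 1) < 0)
		perror("IORING_REGISTER_BPF_FILTER");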
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
include/linux/bpf.h | 1 +
include/linux/bpf_types.h | 4 +
include/linux/io_uring_types.h | 16 +++
include/uapi/linux/bpf.h | 1 +
include/uapi/linux/io_uring.h | 37 ++++++
io_uring/Makefile | 1 +
io_uring/bpf_filter.c | 212 +++++++++++++++++++++++++++++++++
io_uring/bpf_filter.h | 41 +++++++
io_uring/io_uring.c | 7 ++
io_uring/net.c | 9 ++
io_uring/net.h | 5 +
io_uring/register.c | 33 ++++-
kernel/bpf/syscall.c | 9 ++
13 files changed, 375 insertions(+), 1 deletion(-)
create mode 100644 io_uring/bpf_filter.c
create mode 100644 io_uring/bpf_filter.h
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e5be698256d1..9b4435452458 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -31,6 +31,7 @@
#include <linux/static_call.h>
#include <linux/memcontrol.h>
#include <linux/cfi.h>
+#include <linux/io_uring_types.h>
#include <asm/rqspinlock.h>
struct bpf_verifier_env;
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index b13de31e163f..c5d58806a1cf 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -83,6 +83,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter,
struct bpf_nf_ctx, struct bpf_nf_ctx)
#endif
+#ifdef CONFIG_IO_URING
+BPF_PROG_TYPE(BPF_PROG_TYPE_IO_URING, io_uring_filter,
+ struct io_uring_bpf_ctx, struct io_uring_bpf_ctx)
+#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index c664c84247f1..4b18dfc63764 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -219,6 +219,17 @@ struct io_rings {
struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
+#ifdef CONFIG_BPF
+extern const struct bpf_prog_ops io_uring_filter_prog_ops;
+extern const struct bpf_verifier_ops io_uring_filter_verifier_ops;
+#endif
+
+struct io_bpf_filter;
+struct io_bpf_filters {
+ spinlock_t lock;
+ struct io_bpf_filter __rcu **bpf_filters;
+};
+
struct io_restriction {
refcount_t refs;
DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
@@ -229,6 +240,10 @@ struct io_restriction {
bool op_registered;
/* IORING_REGISTER_* restrictions exist */
bool reg_registered;
+ /* BPF filter restrictions exist */
+ bool bpf_registered;
+ struct io_bpf_filters filters;
+ struct rcu_head rcu_head;
};
struct io_submit_link {
@@ -265,6 +280,7 @@ struct io_ring_ctx {
unsigned int drain_next: 1;
unsigned int op_restricted: 1;
unsigned int reg_restricted: 1;
+ unsigned int bpf_restricted: 1;
unsigned int off_timeout_used: 1;
unsigned int drain_active: 1;
unsigned int has_evfd: 1;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f8d8513eda27..4d43ec003887 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1072,6 +1072,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SK_LOOKUP,
BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
BPF_PROG_TYPE_NETFILTER,
+ BPF_PROG_TYPE_IO_URING,
__MAX_BPF_PROG_TYPE
};
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index b5b23c0d5283..0e1b0871fe5e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -700,6 +700,9 @@ enum io_uring_register_op {
/* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
IORING_REGISTER_ZCRX_CTRL = 36,
+ /* register bpf filtering programs */
+ IORING_REGISTER_BPF_FILTER = 37,
+
/* this goes last */
IORING_REGISTER_LAST,
@@ -1113,6 +1116,40 @@ struct zcrx_ctrl {
};
};
+struct io_uring_bpf_ctx {
+ __u8 opcode;
+ __u8 sqe_flags;
+ __u8 pad[6];
+ __u64 user_data;
+ union {
+ struct {
+ __u32 family;
+ __u32 type;
+ __u32 protocol;
+ } socket;
+ };
+};
+
+struct io_uring_bpf_filter {
+ __u32 opcode; /* io_uring opcode to filter */
+ __u32 flags;
+ __s32 prog_fd; /* BPF program fd */
+ __u32 reserved[3];
+};
+
+enum {
+ IO_URING_BPF_CMD_FILTER = 1,
+};
+
+struct io_uring_bpf {
+ __u16 cmd_type; /* IO_URING_BPF_* values */
+ __u16 cmd_flags; /* none so far */
+ __u32 resv;
+ union {
+ struct io_uring_bpf_filter filter;
+ };
+};
+
#ifdef __cplusplus
}
#endif
diff --git a/io_uring/Makefile b/io_uring/Makefile
index bc4e4a3fa0a5..d89bd0cf6363 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -22,3 +22,4 @@ obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
obj-$(CONFIG_NET) += net.o cmd_net.o
obj-$(CONFIG_PROC_FS) += fdinfo.o
obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o
+obj-$(CONFIG_BPF) += bpf_filter.o
diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c
new file mode 100644
index 000000000000..d31bff1984b7
--- /dev/null
+++ b/io_uring/bpf_filter.c
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BPF filter support for io_uring. Supports filtering of SQE opcodes for now.
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/io_uring.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "bpf_filter.h"
+#include "net.h"
+
+struct io_bpf_filter {
+ struct bpf_prog *prog;
+ struct io_bpf_filter *next;
+};
+
+static bool io_uring_filter_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (type != BPF_READ)
+ return false;
+ if (off < 0 || off >= sizeof(struct io_uring_bpf_ctx))
+ return false;
+ if (off % size != 0)
+ return false;
+
+ return true;
+}
+
+/* Convert context field access if needed */
+static u32 io_uring_filter_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ /* Direct access is fine - context is read-only and passed directly */
+ switch (si->off) {
+ case offsetof(struct io_uring_bpf_ctx, opcode):
+ case offsetof(struct io_uring_bpf_ctx, sqe_flags):
+ case offsetof(struct io_uring_bpf_ctx, user_data):
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, si->off);
+ break;
+ default:
+ /* Union fields - also direct access */
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, si->off);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+/* BTF ID for the context type */
+BTF_ID_LIST_SINGLE(io_uring_filter_btf_ids, struct, io_uring_bpf_ctx)
+
+/* Program operations */
+const struct bpf_prog_ops io_uring_filter_prog_ops = { };
+
+/* Verifier operations */
+const struct bpf_verifier_ops io_uring_filter_verifier_ops = {
+ .get_func_proto = bpf_base_func_proto,
+ .is_valid_access = io_uring_filter_is_valid_access,
+ .convert_ctx_access = io_uring_filter_convert_ctx_access,
+};
+
+/* Populate BPF context from SQE */
+static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx,
+ struct io_kiocb *req)
+{
+ memset(bctx, 0, sizeof(*bctx));
+ bctx->opcode = req->opcode;
+ bctx->sqe_flags = req->flags & SQE_VALID_FLAGS;
+ bctx->user_data = req->cqe.user_data;
+
+ switch (req->opcode) {
+ case IORING_OP_SOCKET:
+ io_socket_bpf_populate(bctx, req);
+ break;
+ }
+}
+
+/*
+ * Run registered filters for a given opcode. Return of 0 means that the
+ * request should be allowed.
+ */
+int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req)
+{
+ struct io_bpf_filter *filter;
+ struct io_uring_bpf_ctx bpf_ctx;
+ int ret;
+
+ rcu_read_lock();
+ filter = rcu_dereference(res->filters.bpf_filters[req->opcode]);
+ if (!filter || !filter->prog) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ io_uring_populate_bpf_ctx(&bpf_ctx, req);
+
+ do {
+ ret = bpf_prog_run(filter->prog, &bpf_ctx);
+ if (!ret)
+ break;
+ filter = filter->next;
+ } while (filter);
+
+ rcu_read_unlock();
+ return ret ? 0 : -EACCES;
+}
+
+int io_register_bpf_filter(struct io_restriction *res,
+ struct io_uring_bpf_filter __user *arg)
+{
+ struct io_bpf_filter *filter, *old_filter;
+ struct io_bpf_filter **filters;
+ struct io_uring_bpf reg;
+ struct bpf_prog *prog;
+
+ if (copy_from_user(&reg, arg, sizeof(reg)))
+ return -EFAULT;
+ if (reg.cmd_type != IO_URING_BPF_CMD_FILTER)
+ return -EINVAL;
+ if (reg.cmd_flags || reg.resv)
+ return -EINVAL;
+
+ if (reg.filter.opcode >= IORING_OP_LAST)
+ return -EINVAL;
+ if (reg.filter.flags ||
+ !mem_is_zero(reg.filter.reserved, sizeof(reg.filter.reserved)))
+ return -EINVAL;
+ if (reg.filter.prog_fd < 0)
+ return -EBADF;
+
+ /*
+ * No existing filters, allocate set.
+ */
+ filters = res->filters.bpf_filters;
+ if (!filters) {
+ filters = kcalloc(IORING_OP_LAST, sizeof(struct io_bpf_filter *), GFP_KERNEL_ACCOUNT);
+ if (!filters)
+ return -ENOMEM;
+ }
+
+ prog = bpf_prog_get_type(reg.filter.prog_fd, BPF_PROG_TYPE_IO_URING);
+ if (IS_ERR(prog)) {
+ if (filters != res->filters.bpf_filters)
+ kfree(filters);
+ return PTR_ERR(prog);
+ }
+
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT);
+ if (!filter) {
+ if (filters != res->filters.bpf_filters)
+ kfree(filters);
+ bpf_prog_put(prog);
+ return -ENOMEM;
+ }
+ filter->prog = prog;
+ res->filters.bpf_filters = filters;
+
+ /*
+ * Insert filter - if the current opcode already has a filter
+ * attached, add to the set.
+ */
+ spin_lock(&res->filters.lock);
+ old_filter = rcu_dereference(filters[reg.filter.opcode]);
+ if (old_filter)
+ filter->next = old_filter;
+ rcu_assign_pointer(filters[reg.filter.opcode], filter);
+ spin_unlock(&res->filters.lock);
+ res->bpf_registered = 1;
+ return 0;
+}
+
+void io_uring_put_bpf_filters(struct io_restriction *res)
+{
+ struct io_bpf_filters *filters = &res->filters;
+ int i;
+
+ if (!filters->bpf_filters)
+ return;
+ if (!res->bpf_registered)
+ return;
+
+ res->bpf_registered = 0;
+ for (i = 0; i < IORING_OP_LAST; i++) {
+ struct io_bpf_filter *filter;
+
+ filter = rcu_dereference(filters->bpf_filters[i]);
+ while (filter) {
+ struct io_bpf_filter *next = filter->next;
+
+ if (filter->prog)
+ bpf_prog_put(filter->prog);
+ kfree(filter);
+ filter = next;
+ }
+ }
+ kfree(filters->bpf_filters);
+ filters->bpf_filters = NULL;
+}
diff --git a/io_uring/bpf_filter.h b/io_uring/bpf_filter.h
new file mode 100644
index 000000000000..3cc53e0a3789
--- /dev/null
+++ b/io_uring/bpf_filter.h
@@ -0,0 +1,41 @@
+#ifndef IO_URING_BPF_FILTER_H
+#define IO_URING_BPF_FILTER_H
+
+#ifdef CONFIG_BPF
+
+void io_uring_put_bpf_filters(struct io_restriction *res);
+
+int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req);
+
+int io_register_bpf_filter(struct io_restriction *res,
+ struct io_uring_bpf_filter __user *arg);
+
+static inline int io_uring_run_bpf_filters(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ struct io_restriction *res = ctx->restrictions;
+
+ if (res && res->filters.bpf_filters)
+ return __io_uring_run_bpf_filters(res, req);
+
+ return 0;
+}
+
+#else
+
+static inline int io_register_bpf_filter(struct io_restriction *res,
+ struct io_uring_bpf_filter __user *arg)
+{
+ return -EINVAL;
+}
+static inline int io_uring_run_bpf_filters(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ return 0;
+}
+static inline void io_uring_put_bpf_filters(struct io_restriction *res)
+{
+}
+#endif /* CONFIG_BPF */
+
+#endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index eec8da38a596..80aeb498ec8a 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -93,6 +93,7 @@
#include "rw.h"
#include "alloc_cache.h"
#include "eventfd.h"
+#include "bpf_filter.h"
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@@ -2261,6 +2262,12 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (unlikely(ret))
return io_submit_fail_init(sqe, req, ret);
+ if (unlikely(ctx->bpf_restricted)) {
+ ret = io_uring_run_bpf_filters(ctx, req);
+ if (ret)
+ return io_submit_fail_init(sqe, req, ret);
+ }
+
trace_io_uring_submit_req(req);
/*
diff --git a/io_uring/net.c b/io_uring/net.c
index 519ea055b761..4fcba36bd0bb 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1699,6 +1699,15 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags)
return IOU_COMPLETE;
}
+void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req)
+{
+ struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
+
+ bctx->socket.family = sock->domain;
+ bctx->socket.type = sock->type;
+ bctx->socket.protocol = sock->protocol;
+}
+
int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
diff --git a/io_uring/net.h b/io_uring/net.h
index 43e5ce5416b7..eef6b4272d01 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -44,6 +44,7 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags);
int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_socket(struct io_kiocb *req, unsigned int issue_flags);
+void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx, struct io_kiocb *req);
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_connect(struct io_kiocb *req, unsigned int issue_flags);
@@ -64,4 +65,8 @@ void io_netmsg_cache_free(const void *entry);
static inline void io_netmsg_cache_free(const void *entry)
{
}
+static inline void io_socket_bpf_populate(struct io_uring_bpf_ctx *bctx,
+ struct io_kiocb *req)
+{
+}
#endif
diff --git a/io_uring/register.c b/io_uring/register.c
index 6c99b441d886..cb006d53a146 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -33,6 +33,7 @@
#include "memmap.h"
#include "zcrx.h"
#include "query.h"
+#include "bpf_filter.h"
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -163,10 +164,19 @@ static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
return ret;
}
+static void io_free_restrictions(struct rcu_head *head)
+{
+ struct io_restriction *res;
+
+ res = container_of(head, struct io_restriction, rcu_head);
+ io_uring_put_bpf_filters(res);
+ kfree(res);
+}
+
void io_put_restrictions(struct io_restriction *res)
{
if (refcount_dec_and_test(&res->refs))
- kfree(res);
+ call_rcu(&res->rcu_head, io_free_restrictions);
}
static struct io_restriction *io_alloc_restrictions(void)
@@ -178,6 +188,7 @@ static struct io_restriction *io_alloc_restrictions(void)
return ERR_PTR(-ENOMEM);
refcount_set(&res->refs, 1);
+ spin_lock_init(&res->filters.lock);
return res;
}
@@ -853,6 +864,26 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
case IORING_REGISTER_ZCRX_CTRL:
ret = io_zcrx_ctrl(ctx, arg, nr_args);
break;
+ case IORING_REGISTER_BPF_FILTER:
+ ret = -EINVAL;
+ if (nr_args != 1)
+ break;
+#ifdef CONFIG_BPF
+ if (!ctx->restrictions) {
+ struct io_restriction *res;
+
+ res = io_alloc_restrictions();
+ if (IS_ERR(res)) {
+ ret = PTR_ERR(res);
+ break;
+ }
+ ctx->restrictions = res;
+ }
+ ret = io_register_bpf_filter(ctx->restrictions, arg);
+ if (ctx->restrictions->bpf_registered)
+ ctx->bpf_restricted = 1;
+#endif
+ break;
default:
ret = -EINVAL;
break;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4ff82144f885..d12537d918f7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2752,6 +2752,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
if (expected_attach_type == BPF_NETFILTER)
return 0;
return -EINVAL;
+ case BPF_PROG_TYPE_IO_URING:
+ if (expected_attach_type)
+ return -EINVAL;
+ return 0;
case BPF_PROG_TYPE_SYSCALL:
case BPF_PROG_TYPE_EXT:
if (expected_attach_type)
@@ -2934,6 +2938,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
}
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
+ type != BPF_PROG_TYPE_IO_URING &&
!bpf_cap)
goto put_token;
@@ -4403,6 +4408,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
if (attach_type != BPF_NETFILTER)
return -EINVAL;
return 0;
+ case BPF_PROG_TYPE_IO_URING:
+ if (attach_type != 0)
+ return -EINVAL;
+ return 0;
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_TRACEPOINT:
if (attach_type != BPF_PERF_EVENT)
--
2.51.0