From: Jens Axboe <axboe@kernel.dk>
To: io-uring@vger.kernel.org
Cc: Jens Axboe <axboe@kernel.dk>
Subject: [PATCH 5/5] io_uring: allow registration of per-task restrictions
Date: Fri, 16 Jan 2026 15:38:42 -0700
Message-ID: <20260116224356.399361-6-axboe@kernel.dk>
In-Reply-To: <20260116224356.399361-1-axboe@kernel.dk>

Currently io_uring supports restricting operations on a per-ring basis.
To use them, the ring must be set up in a disabled state by setting
IORING_SETUP_R_DISABLED. Restrictions can then be set for the ring,
after which the ring can be enabled.

This commit adds support for IORING_REGISTER_RESTRICTIONS with ring_fd
== -1, like the other "blind" register opcodes that operate on the task
rather than on a specific ring. This allows registering the same kinds
of restrictions as can be done on a specific ring, but for the task
itself. Once done, any ring the task creates will inherit these
restrictions.
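
As a rough usage sketch (not part of this patch), registering a
per-task restriction set could look like the below. It assumes the raw
io_uring_register(2) syscall, uapi headers that include the struct
io_uring_task_restriction added by this patch, and a made-up helper
name:

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch: restrict any ring created by this task to IORING_OP_NOP SQEs */
static int restrict_task_to_nop(void)
{
	union {
		struct io_uring_task_restriction tr;
		unsigned char buf[sizeof(struct io_uring_task_restriction) +
				  sizeof(struct io_uring_restriction)];
	} u;

	/* flags and resv must be zero, or -EINVAL is returned */
	memset(&u, 0, sizeof(u));
	u.tr.nr_res = 1;
	u.tr.restrictions[0].opcode = IORING_RESTRICTION_SQE_OP;
	u.tr.restrictions[0].sqe_op = IORING_OP_NOP;

	/* ring_fd == -1 selects the per-task ("blind") form, nr_args must be 1 */
	return syscall(__NR_io_uring_register, -1, IORING_REGISTER_RESTRICTIONS,
		       &u.tr, 1);
}
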
If a restriction filter is registered with a task, it is inherited by
its children on fork. Children may only further restrict operations,
not extend them.

Inherited restrictions include both the classic
IORING_REGISTER_RESTRICTIONS based restrictions and any BPF filters
that have been registered with the task via IORING_REGISTER_BPF_FILTER.
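
Again purely as an illustrative sketch (using liburing for setup and
submission, with a made-up helper name and file path): a ring created
after the registration above, whether by this task or by a forked
child, is expected to complete the allowed NOP normally and fail the
non-allowed opcode through the existing restriction path (typically
-EACCES in the CQE result):

#include <liburing.h>
#include <fcntl.h>
#include <stdio.h>

/* Sketch: check that a freshly created ring inherited the task restrictions */
static void check_inherited_restrictions(void)
{
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	int i;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return;

	/* IORING_OP_NOP was allowed above, this one should succeed */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);

	/* IORING_OP_OPENAT was not allowed, expect a failed CQE */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_openat(sqe, AT_FDCWD, "/etc/hostname", O_RDONLY, 0);

	io_uring_submit(&ring);
	for (i = 0; i < 2; i++) {
		if (io_uring_wait_cqe(&ring, &cqe))
			break;
		printf("cqe res %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
}
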
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
include/linux/io_uring_types.h | 2 +
include/uapi/linux/io_uring.h | 7 ++++
io_uring/bpf_filter.c | 70 ++++++++++++++++++++++++++++++++++
io_uring/bpf_filter.h | 6 +++
io_uring/io_uring.c | 19 +++++++++
io_uring/io_uring.h | 1 +
io_uring/register.c | 65 +++++++++++++++++++++++++++++++
io_uring/tctx.c | 35 ++++++++++++-----
8 files changed, 196 insertions(+), 9 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 1e91fa7ecbaf..f4a55c104825 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -236,6 +236,8 @@ struct io_restriction {
DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
struct io_bpf_filters *bpf_filters;
+ /* ->bpf_filters needs COW on modification */
+ bool bpf_filters_cow;
u8 sqe_flags_allowed;
u8 sqe_flags_required;
/* IORING_OP_* restrictions exist */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 27839318c43e..419bdfb48b9c 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -808,6 +808,13 @@ struct io_uring_restriction {
__u32 resv2[3];
};
+struct io_uring_task_restriction {
+ __u16 flags;
+ __u16 nr_res;
+ __u32 resv[3];
+ __DECLARE_FLEX_ARRAY(struct io_uring_restriction, restrictions);
+};
+
struct io_uring_clock_register {
__u32 clockid;
__u32 __resv[3];
diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c
index 8ed5b913005a..30a9d7355cd7 100644
--- a/io_uring/bpf_filter.c
+++ b/io_uring/bpf_filter.c
@@ -215,6 +215,70 @@ static struct io_bpf_filters *io_new_bpf_filters(void)
return filters;
}
+void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src)
+{
+ if (!src->bpf_filters)
+ return;
+
+ rcu_read_lock();
+ /*
+ * If the src filter is going away, just ignore it.
+ */
+ if (refcount_inc_not_zero(&src->bpf_filters->refs)) {
+ dst->bpf_filters = src->bpf_filters;
+ dst->bpf_filters_cow = true;
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Allocate a new struct io_bpf_filters. Used when a filter is cloned and
+ * modifications need to be made.
+ */
+static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src)
+{
+ struct io_bpf_filters *filters;
+ struct io_bpf_filter *srcf;
+ int i;
+
+ filters = io_new_bpf_filters();
+ if (IS_ERR(filters))
+ return filters;
+
+ /*
+ * Iterate filters from src and assign them to the destination.
+ * Grabbing a reference is enough, we don't need to duplicate the
+ * memory. This is safe because filters are only ever added at the
+ * front of the list, hence the only memory ever touched inside
+ * a filter is the refcount.
+ */
+ rcu_read_lock();
+ for (i = 0; i < IORING_OP_LAST; i++) {
+ srcf = rcu_dereference(src->bpf_filters->filters[i]);
+ if (!srcf) {
+ continue;
+ } else if (srcf == &dummy_filter) {
+ rcu_assign_pointer(filters->filters[i], &dummy_filter);
+ continue;
+ }
+
+ /*
+ * Getting a ref on the first node is enough; when the filter is
+ * put and its nodes are iterated for freeing, the walk stops at
+ * the first node whose refcount doesn't drop to zero.
+ */
+ if (!refcount_inc_not_zero(&srcf->refs))
+ goto err;
+ rcu_assign_pointer(filters->filters[i], srcf);
+ }
+ rcu_read_unlock();
+ return filters;
+err:
+ rcu_read_unlock();
+ __io_put_bpf_filters(filters);
+ return ERR_PTR(-EBUSY);
+}
+
int io_register_bpf_filter(struct io_restriction *res,
struct io_uring_bpf __user *arg)
{
@@ -247,6 +311,12 @@ int io_register_bpf_filter(struct io_restriction *res,
filters = io_new_bpf_filters();
if (IS_ERR(filters))
return PTR_ERR(filters);
+ } else if (res->bpf_filters_cow) {
+ filters = io_bpf_filter_cow(res);
+ if (IS_ERR(filters))
+ return PTR_ERR(filters);
+ __io_put_bpf_filters(res->bpf_filters);
+ res->bpf_filters_cow = false;
}
prog = bpf_prog_get_type(reg.filter.prog_fd, BPF_PROG_TYPE_IO_URING);
diff --git a/io_uring/bpf_filter.h b/io_uring/bpf_filter.h
index a131953ce950..3f117a4c8752 100644
--- a/io_uring/bpf_filter.h
+++ b/io_uring/bpf_filter.h
@@ -11,6 +11,8 @@ int io_register_bpf_filter(struct io_restriction *res,
void io_put_bpf_filters(struct io_restriction *res);
+void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src);
+
static inline int io_uring_run_bpf_filters(struct io_restriction *res,
struct io_kiocb *req)
{
@@ -35,6 +37,10 @@ static inline int io_uring_run_bpf_filters(struct io_restriction *res,
static inline void io_put_bpf_filters(struct io_restriction *res)
{
}
+static inline void io_bpf_filter_clone(struct io_restriction *dst,
+ struct io_restriction *src)
+{
+}
#endif /* CONFIG_IO_URING */
#endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 67533e494836..8e9d300b8604 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3562,6 +3562,18 @@ int io_prepare_config(struct io_ctx_config *config)
return 0;
}
+void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src)
+{
+ memcpy(&dst->register_op, &src->register_op, sizeof(dst->register_op));
+ memcpy(&dst->sqe_op, &src->sqe_op, sizeof(dst->sqe_op));
+ dst->sqe_flags_allowed = src->sqe_flags_allowed;
+ dst->sqe_flags_required = src->sqe_flags_required;
+ dst->op_registered = src->op_registered;
+ dst->reg_registered = src->reg_registered;
+
+ io_bpf_filter_clone(dst, src);
+}
+
static __cold int io_uring_create(struct io_ctx_config *config)
{
struct io_uring_params *p = &config->p;
@@ -3622,6 +3634,13 @@ static __cold int io_uring_create(struct io_ctx_config *config)
else
ctx->notify_method = TWA_SIGNAL;
+ /*
+ * If the current task has restrictions enabled, then copy them to
+ * our newly created ring and mark it as registered.
+ */
+ if (current->io_uring_restrict)
+ io_restriction_clone(&ctx->restrictions, current->io_uring_restrict);
+
/*
* This is just grabbed for accounting purposes. When a process exits,
* the mm is exited and dropped before the files, hence we need to hang
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index c5bbb43b5842..feb9f76761e9 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -195,6 +195,7 @@ void io_task_refs_refill(struct io_uring_task *tctx);
bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
void io_activate_pollwq(struct io_ring_ctx *ctx);
+void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src);
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
diff --git a/io_uring/register.c b/io_uring/register.c
index 30957c2cb5eb..12164b4e03aa 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -190,6 +190,67 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
return 0;
}
+static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
+{
+ struct io_uring_task_restriction __user *ures = arg;
+ struct io_uring_task_restriction tres;
+ struct io_restriction *res;
+ int ret;
+
+ /* Disallow if task already has registered restrictions */
+ if (current->io_uring_restrict)
+ return -EPERM;
+ if (nr_args != 1)
+ return -EINVAL;
+
+ if (copy_from_user(&tres, arg, sizeof(tres)))
+ return -EFAULT;
+
+ if (tres.flags)
+ return -EINVAL;
+ if (!mem_is_zero(tres.resv, sizeof(tres.resv)))
+ return -EINVAL;
+
+ res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
+ if (!res)
+ return -ENOMEM;
+
+ ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res);
+ if (ret < 0) {
+ kfree(res);
+ return ret;
+ }
+ current->io_uring_restrict = res;
+ return 0;
+}
+
+static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
+{
+ struct io_restriction *res;
+ int ret;
+
+ if (nr_args != 1)
+ return -EINVAL;
+
+ /* If no task restrictions exist, set up a new set */
+ res = current->io_uring_restrict;
+ if (!res) {
+ res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
+ if (!res)
+ return -ENOMEM;
+ }
+
+ ret = io_register_bpf_filter(res, arg);
+ if (ret) {
+ if (res != current->io_uring_restrict)
+ kfree(res);
+ return ret;
+ }
+ if (!current->io_uring_restrict)
+ current->io_uring_restrict = res;
+ return 0;
+}
+
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_R_DISABLED))
@@ -909,6 +970,10 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg,
return io_uring_register_send_msg_ring(arg, nr_args);
case IORING_REGISTER_QUERY:
return io_query(arg, nr_args);
+ case IORING_REGISTER_RESTRICTIONS:
+ return io_register_restrictions_task(arg, nr_args);
+ case IORING_REGISTER_BPF_FILTER:
+ return io_register_bpf_filter_task(arg, nr_args);
}
return -EINVAL;
}
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index cca13d291cfd..2c05e8f66172 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -11,6 +11,8 @@
#include "io_uring.h"
#include "tctx.h"
+#include "register.h"
+#include "bpf_filter.h"
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
struct task_struct *task)
@@ -54,16 +56,23 @@ void __io_uring_free(struct task_struct *tsk)
* node is stored in the xarray. Until that gets sorted out, attempt
* an iteration here and warn if any entries are found.
*/
- xa_for_each(&tctx->xa, index, node) {
- WARN_ON_ONCE(1);
- break;
- }
- WARN_ON_ONCE(tctx->io_wq);
- WARN_ON_ONCE(tctx->cached_refs);
+ if (tctx) {
+ xa_for_each(&tctx->xa, index, node) {
+ WARN_ON_ONCE(1);
+ break;
+ }
+ WARN_ON_ONCE(tctx->io_wq);
+ WARN_ON_ONCE(tctx->cached_refs);
- percpu_counter_destroy(&tctx->inflight);
- kfree(tctx);
- tsk->io_uring = NULL;
+ percpu_counter_destroy(&tctx->inflight);
+ kfree(tctx);
+ tsk->io_uring = NULL;
+ }
+ if (tsk->io_uring_restrict) {
+ io_put_bpf_filters(tsk->io_uring_restrict);
+ kfree(tsk->io_uring_restrict);
+ tsk->io_uring_restrict = NULL;
+ }
}
__cold int io_uring_alloc_task_context(struct task_struct *task,
@@ -354,5 +363,13 @@ int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
int __io_uring_fork(struct task_struct *tsk)
{
+ struct io_restriction *res, *src = tsk->io_uring_restrict;
+
+ res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
+ if (!res)
+ return -ENOMEM;
+
+ tsk->io_uring_restrict = res;
+ io_restriction_clone(res, src);
return 0;
}
--
2.51.0
Thread overview: 6+ messages
2026-01-16 22:38 [PATCHSET RFC v4] Inherited restrictions and BPF filtering Jens Axboe
2026-01-16 22:38 ` [PATCH 1/5] io_uring: add support for BPF filtering for opcode restrictions Jens Axboe
2026-01-16 22:38 ` [PATCH 2/5] io_uring/net: allow filtering on IORING_OP_SOCKET data Jens Axboe
2026-01-16 22:38 ` [PATCH 3/5] io_uring/bpf_filter: add ref counts to struct io_bpf_filter Jens Axboe
2026-01-16 22:38 ` [PATCH 4/5] io_uring: add task fork hook Jens Axboe
2026-01-16 22:38 ` [PATCH 5/5] io_uring: allow registration of per-task restrictions Jens Axboe [this message]