public inbox for io-uring@vger.kernel.org
 help / color / mirror / Atom feed
From: Jens Axboe <axboe@kernel.dk>
To: Christian Brauner <brauner@kernel.org>
Cc: io-uring@vger.kernel.org, jannh@google.com, kees@kernel.org,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH 1/7] io_uring: add support for BPF filtering for opcode restrictions
Date: Tue, 27 Jan 2026 09:41:00 -0700	[thread overview]
Message-ID: <269f6e6a-362f-4ccf-926f-2e2e0c1ea014@kernel.dk> (raw)
In-Reply-To: <20260127-lobgesang-lautsprecher-3805340711c8@brauner>

On 1/27/26 3:06 AM, Christian Brauner wrote:
>> +int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req)
>> +{
>> +	struct io_bpf_filter *filter;
>> +	struct io_uring_bpf_ctx bpf_ctx;
>> +	int ret;
>> +
>> +	/* Fast check for existence of filters outside of RCU */
>> +	if (!rcu_access_pointer(res->bpf_filters->filters[req->opcode]))
>> +		return 0;
>> +
>> +	/*
>> +	 * req->opcode has already been validated to be within the range
>> +	 * of what we expect, io_init_req() does this.
>> +	 */
>> +	rcu_read_lock();
>> +	filter = rcu_dereference(res->bpf_filters->filters[req->opcode]);
>> +	if (!filter) {
>> +		ret = 1;
>> +		goto out;
>> +	} else if (filter == &dummy_filter) {
>> +		ret = 0;
>> +		goto out;
>> +	}
>> +
>> +	io_uring_populate_bpf_ctx(&bpf_ctx, req);
>> +
>> +	/*
>> +	 * Iterate registered filters. The opcode is allowed IFF all filters
>> +	 * return 1. If any filter returns denied, opcode will be denied.
>> +	 */
>> +	do {
>> +		if (filter == &dummy_filter)
>> +			ret = 0;
>> +		else
>> +			ret = bpf_prog_run(filter->prog, &bpf_ctx);
>> +		if (!ret)
>> +			break;
>> +		filter = filter->next;
>> +	} while (filter);
>> +out:
>> +	rcu_read_unlock();
>> +	return ret ? 0 : -EACCES;
>> +}
> 
> Maybe we can write this a little nicer?:
> 
> int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req)
> {
> 	struct io_bpf_filter *filter;
> 	struct io_uring_bpf_ctx bpf_ctx;
> 
> 	/* Fast check for existence of filters outside of RCU */
> 	if (!rcu_access_pointer(res->bpf_filters->filters[req->opcode]))
> 		return 0;
> 
> 	/*
> 	 * req->opcode has already been validated to be within the range
> 	 * of what we expect, io_init_req() does this.
> 	 */
> 	guard(rcu)();
> 	filter = rcu_dereference(res->bpf_filters->filters[req->opcode]);
> 	if (!filter)
> 		return 0;
> 
> 	if (filter == &dummy_filter)
> 		return -EACCES;
> 
> 	io_uring_populate_bpf_ctx(&bpf_ctx, req);
> 
> 	/*
> 	 * Iterate registered filters. The opcode is allowed IFF all filters
> 	 * return 1. If any filter returns denied, opcode will be denied.
> 	 */
> 	for (; filter ; filter = filter->next) {
> 		int ret;
> 
> 		if (filter == &dummy_filter)
> 			return -EACCES;
> 
> 		ret = bpf_prog_run(filter->prog, &bpf_ctx);
> 		if (!ret)
> 			return -EACCES;
> 	}
> 
> 	return 0;
> }

I did a variant based on this; I agree it looks nicer with guard() for this
one.

>> +static void io_free_bpf_filters(struct rcu_head *head)
>> +{
>> +	struct io_bpf_filter __rcu **filter;
>> +	struct io_bpf_filters *filters;
>> +	int i;
>> +
>> +	filters = container_of(head, struct io_bpf_filters, rcu_head);
>> +	spin_lock(&filters->lock);
>> +	filter = filters->filters;
>> +	if (!filter) {
>> +		spin_unlock(&filters->lock);
>> +		return;
>> +	}
>> +	spin_unlock(&filters->lock);
> 
> This is minor but I prefer:
> 
> 	scoped_guard(spinlock)(&filters->lock) {
> 		filters = container_of(head, struct io_bpf_filters, rcu_head);
> 		filter = filters->filters;
> 		if (!filter)
> 			return;
> 	}

The reason I tend to never do that is that I always have to grep for the
syntax... And case in point, you need to do:

	scoped_guard(spinlock, &filters->lock) {

so I guess I'm not the only one :-). But it does read better that way,
made this change too.

>> +static struct io_bpf_filters *io_new_bpf_filters(void)
>> +{
>> +	struct io_bpf_filters *filters;
>> +
>> +	filters = kzalloc(sizeof(*filters), GFP_KERNEL_ACCOUNT);
>> +	if (!filters)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	filters->filters = kcalloc(IORING_OP_LAST,
>> +				   sizeof(struct io_bpf_filter *),
>> +				   GFP_KERNEL_ACCOUNT);
>> +	if (!filters->filters) {
>> +		kfree(filters);
>> +		return ERR_PTR(-ENOMEM);
>> +	}
>> +
>> +	refcount_set(&filters->refs, 1);
>> +	spin_lock_init(&filters->lock);
>> +	return filters;
>> +}
> 
> static struct io_bpf_filters *io_new_bpf_filters(void)
> {
> 	struct io_bpf_filters *filters __free(kfree) = NULL;
> 
> 	filters = kzalloc(sizeof(*filters), GFP_KERNEL_ACCOUNT);
> 	if (!filters)
> 		return ERR_PTR(-ENOMEM);
> 
> 	filters->filters = kcalloc(IORING_OP_LAST,
> 				   sizeof(struct io_bpf_filter *),
> 				   GFP_KERNEL_ACCOUNT);
> 	if (!filters->filters)
> 		return ERR_PTR(-ENOMEM);
> 
> 	refcount_set(&filters->refs, 1);
> 	spin_lock_init(&filters->lock);
> 	return no_free_ptr(filters);
> }

Adopted as well, thanks.

>> +/*
>> + * Validate classic BPF filter instructions. Only allow a safe subset of
>> + * operations - no packet data access, just context field loads and basic
>> + * ALU/jump operations.
>> + */
>> +static int io_uring_check_cbpf_filter(struct sock_filter *filter,
>> +				      unsigned int flen)
>> +{
>> +	int pc;
> 
> Seems fine to me but I can't meaningfully review this.

Yeah seriously... It's just the seccomp filter modified for this use
case, so it supposedly should be vetted already.

>> +int io_register_bpf_filter(struct io_restriction *res,
>> +			   struct io_uring_bpf __user *arg)
>> +{
>> +	struct io_bpf_filter *filter, *old_filter;
>> +	struct io_bpf_filters *filters;
>> +	struct io_uring_bpf reg;
>> +	struct bpf_prog *prog;
>> +	struct sock_fprog fprog;
>> +	int ret;
>> +
>> +	if (copy_from_user(&reg, arg, sizeof(reg)))
>> +		return -EFAULT;
>> +	if (reg.cmd_type != IO_URING_BPF_CMD_FILTER)
>> +		return -EINVAL;
>> +	if (reg.cmd_flags || reg.resv)
>> +		return -EINVAL;
>> +
>> +	if (reg.filter.opcode >= IORING_OP_LAST)
>> +		return -EINVAL;
> 
> So you only support per-op-code filtering with cbpf. I assume that you
> would argue that people can use the existing io_uring restrictions. But
> that's not inherited, right? So then this forces users to have a bpf
> program for all opcodes that io_uring on their system supports.

The existing restrictions ARE inherited now, and can be set on a
per-task basis as well. That's in the last patch. And since the classic
"deny this opcode" filtering is way cheaper than running a BPF prog, I
do think that's the right approach.

> I think that this is a bit unfortunate and wasteful for both userspace
> and io_uring. Can't we do a combined thing where we also allow filters
> to attach to all op-codes. Then userspace could start with an allow-list
> or deny-list filter and then attach further per-op-code bpf programs to
> the op-codes they want to manage specifically. Then you also get
> inheritance of the restrictions per-task.
> 
> That would be nicer imho.

I'm considering this one moot given the above note on using
IORING_REGISTER_RESTRICTIONS with fd == -1 to set per-task restrictions
using the classic non-BPF filtering, which are inherited as well. You
only need the cBPF filters if you want to deny an opcode based on aux
data for that opcode.

-- 
Jens Axboe

  reply	other threads:[~2026-01-27 16:41 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-19 23:54 [PATCHSET v6] Inherited restrictions and BPF filtering for io_uring Jens Axboe
2026-01-19 23:54 ` [PATCH 1/7] io_uring: add support for BPF filtering for opcode restrictions Jens Axboe
2026-01-27 10:06   ` Christian Brauner
2026-01-27 16:41     ` Jens Axboe [this message]
2026-01-19 23:54 ` [PATCH 2/7] io_uring/net: allow filtering on IORING_OP_SOCKET data Jens Axboe
2026-01-19 23:54 ` [PATCH 3/7] io_uring/bpf_filter: allow filtering on contents of struct open_how Jens Axboe
2026-01-27  9:33   ` Christian Brauner
2026-01-19 23:54 ` [PATCH 4/7] io_uring/bpf_filter: cache lookup table in ctx->bpf_filters Jens Axboe
2026-01-27  9:33   ` Christian Brauner
2026-01-19 23:54 ` [PATCH 5/7] io_uring/bpf_filter: add ref counts to struct io_bpf_filter Jens Axboe
2026-01-27  9:34   ` Christian Brauner
2026-01-19 23:54 ` [PATCH 6/7] io_uring: add task fork hook Jens Axboe
2026-01-27 10:07   ` Christian Brauner
2026-01-19 23:54 ` [PATCH 7/7] io_uring: allow registration of per-task restrictions Jens Axboe
2026-01-22  3:37 ` [PATCHSET v6] Inherited restrictions and BPF filtering for io_uring Jens Axboe
  -- strict thread matches above, loose matches on Subject: below --
2026-01-27 18:29 [PATCHSET v7] " Jens Axboe
2026-01-27 18:29 ` [PATCH 1/7] io_uring: add support for BPF filtering for opcode restrictions Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=269f6e6a-362f-4ccf-926f-2e2e0c1ea014@kernel.dk \
    --to=axboe@kernel.dk \
    --cc=brauner@kernel.org \
    --cc=io-uring@vger.kernel.org \
    --cc=jannh@google.com \
    --cc=kees@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox