public inbox for [email protected]
 help / color / mirror / Atom feed
From: Xiaoguang Wang <[email protected]>
To: [email protected], [email protected],
	[email protected]
Cc: [email protected], [email protected], [email protected],
	[email protected]
Subject: [RFC 3/3] ublk_drv: add ebpf support
Date: Wed, 15 Feb 2023 08:41:22 +0800	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

Currently only one eBPF helper, bpf_ublk_queue_sqe(), is added. A ublksrv
target can use this helper in an eBPF prog to support zero copy between the
ublk kernel driver and userspace; please see the ublksrv test code for more info.

Signed-off-by: Xiaoguang Wang <[email protected]>
---
 drivers/block/ublk_drv.c       | 207 ++++++++++++++++++++++++++++++++-
 include/uapi/linux/bpf.h       |   1 +
 include/uapi/linux/ublk_cmd.h  |  11 ++
 scripts/bpf_doc.py             |   4 +
 tools/include/uapi/linux/bpf.h |   8 ++
 5 files changed, 229 insertions(+), 2 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index b628e9eaefa6..44c289b72864 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -61,6 +61,7 @@
 struct ublk_rq_data {
 	struct llist_node node;
 	struct callback_head work;
+	struct io_mapped_kbuf *kbuf;	/* allocated and filled by ublk_init_uring_kbuf() */
 };
 
 struct ublk_uring_cmd_pdu {
@@ -163,6 +164,9 @@ struct ublk_device {
 	unsigned int		nr_queues_ready;
 	atomic_t		nr_aborted_queues;
 
+	struct bpf_prog		*io_prep_prog;
+	struct bpf_prog		*io_submit_prog;
+
 	/*
 	 * Our ubq->daemon may be killed without any notification, so
 	 * monitor each queue's daemon periodically
@@ -189,10 +193,46 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
 
 static struct miscdevice ublk_misc;
 
+/*
+ * Per-request context passed to the ublk bpf programs. It is allocated in
+ * ublk_run_bpf_prog() and freed by ublk_bpf_io_submit_fn().
+ */
+struct ublk_io_bpf_ctx {
+	struct ublk_bpf_ctx ctx;	/* filled from the request in ublk_run_bpf_prog() */
+	struct ublk_device *ub;		/* device whose tag_set bpf_ublk_queue_sqe() searches */
+	struct callback_head work;	/* task work that runs ublk_bpf_io_submit_fn() */
+};
+
+/*
+ * bpf helper: submit @sqe on behalf of the ublk request identified by
+ * @bpf_ctx.
+ *
+ * The request is looked up from the q_id/tag pair stored in @bpf_ctx, and the
+ * kbuf attached to its ublk_rq_data is passed to io_uring_submit_sqe()
+ * together with @fd, @sqe and @sqe_len.
+ *
+ * Returns 0 once the sqe has been handed to io_uring_submit_sqe(), or -EINVAL
+ * when q_id/tag do not resolve to a request.
+ */
+BPF_CALL_4(bpf_ublk_queue_sqe, struct ublk_io_bpf_ctx *, bpf_ctx,
+	   struct io_uring_sqe *, sqe, u32, sqe_len, u32, fd)
+{
+	struct ublk_device *ub = bpf_ctx->ub;
+	struct ublk_rq_data *data;
+	struct request *rq;
+	u16 q_id = bpf_ctx->ctx.q_id;
+	u16 tag = bpf_ctx->ctx.tag;
+
+	/* q_id indexes tag_set.tags[], so it must name an existing hw queue. */
+	if (q_id >= ub->tag_set.nr_hw_queues)
+		return -EINVAL;
+
+	/* blk_mq_tag_to_rq() yields NULL when no request sits behind the tag. */
+	rq = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
+	if (!rq)
+		return -EINVAL;
+
+	data = blk_mq_rq_to_pdu(rq);
+	/*
+	 * NOTE(review): the result of io_uring_submit_sqe() is not propagated;
+	 * confirm whether it can fail and should be returned to the program.
+	 */
+	io_uring_submit_sqe(fd, sqe, sqe_len, data->kbuf);
+	return 0;
+}
+
+/*
+ * Verifier description of bpf_ublk_queue_sqe().
+ *
+ * The helper dereferences its first two arguments, so they are typed as the
+ * program context and as a memory region whose size is given by the third
+ * argument; the fourth argument (fd) is a plain scalar. All four arguments
+ * of the BPF_CALL_4 helper are described.
+ */
+const struct bpf_func_proto ublk_bpf_queue_sqe_proto = {
+	.func = bpf_ublk_queue_sqe,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_PTR_TO_MEM,
+	.arg3_type = ARG_CONST_SIZE,
+	.arg4_type = ARG_ANYTHING,
+};
+
+/*
+ * Helper lookup for ublk bpf programs: BPF_FUNC_ublk_queue_sqe resolves to
+ * ublk_bpf_queue_sqe_proto, every other id is passed on to
+ * bpf_base_func_proto().
+ */
 static const struct bpf_func_proto *
 ublk_bpf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-	return bpf_base_func_proto(func_id);
+	switch (func_id) {
+	case BPF_FUNC_ublk_queue_sqe:
+		return &ublk_bpf_queue_sqe_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
 }
 
 static bool ublk_bpf_is_valid_access(int off, int size,
@@ -200,6 +240,23 @@ static bool ublk_bpf_is_valid_access(int off, int size,
 			const struct bpf_prog *prog,
 			struct bpf_insn_access_aux *info)
 {
+	if (off < 0 || off >= sizeof(struct ublk_bpf_ctx))
+		return false;
+	if (off % size != 0)
+		return false;
+
+	switch (off) {
+	case offsetof(struct ublk_bpf_ctx, q_id):
+		return size == sizeof_field(struct ublk_bpf_ctx, q_id);
+	case offsetof(struct ublk_bpf_ctx, tag):
+		return size == sizeof_field(struct ublk_bpf_ctx, tag);
+	case offsetof(struct ublk_bpf_ctx, op):
+		return size == sizeof_field(struct ublk_bpf_ctx, op);
+	case offsetof(struct ublk_bpf_ctx, nr_sectors):
+		return size == sizeof_field(struct ublk_bpf_ctx, nr_sectors);
+	case offsetof(struct ublk_bpf_ctx, start_sector):
+		return size == sizeof_field(struct ublk_bpf_ctx, start_sector);
+	}
 	return false;
 }
 
@@ -324,7 +381,7 @@ static void ublk_put_device(struct ublk_device *ub)
 static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
 		int qid)
 {
-       return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
+	return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
 }
 
 static inline bool ublk_rq_has_data(const struct request *rq)
@@ -492,12 +549,16 @@ static inline int ublk_copy_user_pages(struct ublk_map_data *data,
 static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
 		struct ublk_io *io)
 {
+	struct ublk_device *ub = ubq->dev;
 	const unsigned int rq_bytes = blk_rq_bytes(req);
 	/*
 	 * no zero copy, we delay copy WRITE request data into ublksrv
 	 * context and the big benefit is that pinning pages in current
 	 * context is pretty fast, see ublk_pin_user_pages
 	 */
+	if ((req_op(req) == REQ_OP_WRITE) && ub->io_prep_prog)
+		return rq_bytes;
+
 	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
 		return rq_bytes;
 
@@ -860,6 +921,89 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
 	}
 }
 
+/*
+ * Task-work callback queued by ublk_run_bpf_prog(): runs the device's
+ * io_submit program, if one is set, on the saved context and then frees the
+ * context.
+ */
+static void ublk_bpf_io_submit_fn(struct callback_head *work)
+{
+	struct ublk_io_bpf_ctx *bpf_ctx = container_of(work,
+			struct ublk_io_bpf_ctx, work);
+	/*
+	 * Load the pointer once: ublk_ctrl_unreg_bpf_prog() may clear it
+	 * between a NULL check and a second read.
+	 * NOTE(review): this does not pin the program; its lifetime against a
+	 * concurrent unregister still needs to be confirmed.
+	 */
+	struct bpf_prog *prog = READ_ONCE(bpf_ctx->ub->io_submit_prog);
+
+	if (prog)
+		bpf_prog_run_pin_on_cpu(prog, bpf_ctx);
+	kfree(bpf_ctx);
+}
+
+/*
+ * Build a fresh io_mapped_kbuf describing @rq and store it in the request's
+ * ublk_rq_data, releasing whatever kbuf was attached before.
+ *
+ * Every bvec of the request is copied into a newly allocated array; count is
+ * set to blk_rq_bytes() and nr_bvecs to the number of copied entries.
+ *
+ * Returns 0 on success or -EIO if an allocation fails, in which case the
+ * request is left without a kbuf.
+ *
+ * NOTE(review): the caller runs this for every request; confirm that
+ * rq_for_each_bvec() is safe for requests whose bios carry no data pages.
+ */
+static int ublk_init_uring_kbuf(struct request *rq)
+{
+	struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(rq);
+	struct io_mapped_kbuf *new_kbuf;
+	struct bio_vec *vecs, *slot;
+	struct req_iterator iter;
+	struct bio_vec bv;
+	int nr = 0;
+
+	/* Release the buffer left over from an earlier use of this request. */
+	if (pdu->kbuf) {
+		kfree(pdu->kbuf->bvec);
+		kfree(pdu->kbuf);
+		pdu->kbuf = NULL;
+	}
+
+	new_kbuf = kmalloc(sizeof(*new_kbuf), GFP_NOIO);
+	if (!new_kbuf)
+		return -EIO;
+
+	/* First pass: size the array. */
+	rq_for_each_bvec(bv, rq, iter)
+		nr++;
+
+	vecs = kmalloc_array(nr, sizeof(*vecs), GFP_NOIO);
+	if (!vecs) {
+		kfree(new_kbuf);
+		return -EIO;
+	}
+
+	/* Second pass: copy the entries. */
+	slot = vecs;
+	rq_for_each_bvec(bv, rq, iter)
+		*slot++ = bv;
+
+	new_kbuf->bvec = vecs;
+	new_kbuf->nr_bvecs = nr;
+	new_kbuf->count = blk_rq_bytes(rq);
+	pdu->kbuf = new_kbuf;
+	return 0;
+}
+
+/*
+ * Run the device's io_prep program for @rq, then queue task work on the
+ * queue's daemon task that runs ublk_bpf_io_submit_fn() on the same context.
+ *
+ * Returns 0 when no io_prep program is registered or after the program ran,
+ * and -EIO when the context or the request's kbuf cannot be allocated.
+ */
+static int ublk_run_bpf_prog(struct ublk_queue *ubq, struct request *rq)
+{
+	struct ublk_device *ub = ubq->dev;
+	struct bpf_prog *prog = ub->io_prep_prog;
+	struct ublk_io_bpf_ctx *bpf_ctx;
+
+	if (!prog)
+		return 0;
+
+	/* Zeroed so that t_val and struct padding never carry stale heap data. */
+	bpf_ctx = kzalloc(sizeof(*bpf_ctx), GFP_NOIO);
+	if (!bpf_ctx)
+		return -EIO;
+
+	if (ublk_init_uring_kbuf(rq) < 0) {
+		kfree(bpf_ctx);
+		return -EIO;
+	}
+
+	bpf_ctx->ub = ub;
+	bpf_ctx->ctx.q_id = ubq->q_id;
+	bpf_ctx->ctx.tag = rq->tag;
+	bpf_ctx->ctx.op = req_op(rq);
+	bpf_ctx->ctx.nr_sectors = blk_rq_sectors(rq);
+	bpf_ctx->ctx.start_sector = blk_rq_pos(rq);
+	bpf_prog_run_pin_on_cpu(prog, bpf_ctx);
+
+	/* The context is freed by the task work, or here if it cannot be queued. */
+	init_task_work(&bpf_ctx->work, ublk_bpf_io_submit_fn);
+	if (task_work_add(ubq->ubq_daemon, &bpf_ctx->work, TWA_SIGNAL_NO_IPI))
+		kfree(bpf_ctx);
+	return 0;
+}
+
 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
 {
@@ -872,6 +1016,9 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (unlikely(res != BLK_STS_OK))
 		return BLK_STS_IOERR;
 
+	/* Currently just for test. */
+	ublk_run_bpf_prog(ubq, rq);
+
 	/* With recovery feature enabled, force_abort is set in
 	 * ublk_stop_dev() before calling del_gendisk(). We have to
 	 * abort all requeued and new rqs here to let del_gendisk()
@@ -2009,6 +2156,56 @@ static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd)
 	return ret;
 }
 
+/*
+ * UBLK_CMD_REG_BPF_PROG: install the io_prep and io_submit programs on the
+ * device named by header->dev_id.
+ *
+ * header->data[0] and header->data[1] are looked up with bpf_prog_get_type()
+ * as BPF_PROG_TYPE_UBLK programs (io_prep and io_submit respectively).
+ * Both lookups must succeed before anything is installed, so a failure
+ * never leaves the device with only one program. Programs that were
+ * installed before are released.
+ *
+ * Returns 0 on success, -EINVAL if the device does not exist, or the error
+ * from bpf_prog_get_type().
+ */
+static int ublk_ctrl_reg_bpf_prog(struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct bpf_prog *prep_prog, *submit_prog;
+	struct bpf_prog *old_prep, *old_submit;
+	struct ublk_device *ub;
+	int ret = 0;
+
+	ub = ublk_get_device_from_id(header->dev_id);
+	if (!ub)
+		return -EINVAL;
+
+	prep_prog = bpf_prog_get_type(header->data[0], BPF_PROG_TYPE_UBLK);
+	if (IS_ERR(prep_prog)) {
+		ret = PTR_ERR(prep_prog);
+		goto out_put_device;
+	}
+
+	submit_prog = bpf_prog_get_type(header->data[1], BPF_PROG_TYPE_UBLK);
+	if (IS_ERR(submit_prog)) {
+		ret = PTR_ERR(submit_prog);
+		/* Drop the reference already taken on the first program. */
+		bpf_prog_put(prep_prog);
+		goto out_put_device;
+	}
+
+	mutex_lock(&ub->mutex);
+	old_prep = ub->io_prep_prog;
+	old_submit = ub->io_submit_prog;
+	ub->io_prep_prog = prep_prog;
+	ub->io_submit_prog = submit_prog;
+	mutex_unlock(&ub->mutex);
+
+	/* Release the references held by a previous registration, if any. */
+	if (old_prep)
+		bpf_prog_put(old_prep);
+	if (old_submit)
+		bpf_prog_put(old_submit);
+
+out_put_device:
+	ublk_put_device(ub);
+	return ret;
+}
+
+/*
+ * UBLK_CMD_UNREG_BPF_PROG: remove the io_prep and io_submit programs from the
+ * device named by header->dev_id and drop the references held on them.
+ *
+ * Safe to call when no program is registered.
+ *
+ * Returns 0 on success or -EINVAL if the device does not exist.
+ */
+static int ublk_ctrl_unreg_bpf_prog(struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct bpf_prog *prep_prog, *submit_prog;
+	struct ublk_device *ub;
+
+	ub = ublk_get_device_from_id(header->dev_id);
+	if (!ub)
+		return -EINVAL;
+
+	/* Detach first so the device never points at a released program. */
+	mutex_lock(&ub->mutex);
+	prep_prog = ub->io_prep_prog;
+	submit_prog = ub->io_submit_prog;
+	ub->io_prep_prog = NULL;
+	ub->io_submit_prog = NULL;
+	mutex_unlock(&ub->mutex);
+
+	/* bpf_prog_put() dereferences its argument, so skip unset slots. */
+	if (prep_prog)
+		bpf_prog_put(prep_prog);
+	if (submit_prog)
+		bpf_prog_put(submit_prog);
+
+	ublk_put_device(ub);
+	return 0;
+}
 static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 		unsigned int issue_flags)
 {
@@ -2059,6 +2256,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
 	case UBLK_CMD_END_USER_RECOVERY:
 		ret = ublk_ctrl_end_recovery(cmd);
 		break;
+	case UBLK_CMD_REG_BPF_PROG:
+		ret = ublk_ctrl_reg_bpf_prog(cmd);
+		break;
+	case UBLK_CMD_UNREG_BPF_PROG:
+		ret = ublk_ctrl_unreg_bpf_prog(cmd);
+		break;
 	default:
 		break;
 	}
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 515b7b995b3a..578d65e9f30e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5699,6 +5699,7 @@ union bpf_attr {
 	FN(user_ringbuf_drain, 209, ##ctx)		\
 	FN(cgrp_storage_get, 210, ##ctx)		\
 	FN(cgrp_storage_delete, 211, ##ctx)		\
+	FN(ublk_queue_sqe, 212, ##ctx)			\
 	/* */
 
 /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 8f88e3a29998..a43b1864de51 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -17,6 +17,8 @@
 #define	UBLK_CMD_STOP_DEV	0x07
 #define	UBLK_CMD_SET_PARAMS	0x08
 #define	UBLK_CMD_GET_PARAMS	0x09
+#define UBLK_CMD_REG_BPF_PROG		0x0a
+#define UBLK_CMD_UNREG_BPF_PROG		0x0b
 #define	UBLK_CMD_START_USER_RECOVERY	0x10
 #define	UBLK_CMD_END_USER_RECOVERY	0x11
 /*
@@ -230,4 +232,13 @@ struct ublk_params {
 	struct ublk_param_discard	discard;
 };
 
+/*
+ * Context visible to ublk bpf programs; the driver fills it from the block
+ * request before running a program. The hole after @op is spelled out so
+ * the uapi layout has no implicit padding (offsets are unchanged).
+ */
+struct ublk_bpf_ctx {
+	__u32	t_val;		/* NOTE(review): neither set nor read by the driver in this patch; purpose to be confirmed */
+	__u16	q_id;		/* ublk queue id */
+	__u16	tag;		/* request tag */
+	__u8	op;		/* request operation, from req_op() */
+	__u8	reserved[3];	/* explicit padding; keeps nr_sectors at offset 12 */
+	__u32	nr_sectors;	/* from blk_rq_sectors() */
+	__u64	start_sector;	/* from blk_rq_pos() */
+};
+
 #endif
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index e8d90829f23e..f8672294e145 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -700,6 +700,8 @@ class PrinterHelpers(Printer):
             'struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
+            'struct ublk_io_bpf_ctx',
+            'struct io_uring_sqe',
     ]
     known_types = {
             '...',
@@ -755,6 +757,8 @@ class PrinterHelpers(Printer):
             'const struct bpf_dynptr',
             'struct iphdr',
             'struct ipv6hdr',
+            'struct ublk_io_bpf_ctx',
+            'struct io_uring_sqe',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 515b7b995b3a..530094246e2a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5485,6 +5485,13 @@ union bpf_attr {
  *		0 on success.
  *
  *		**-ENOENT** if the bpf_local_storage cannot be found.
+ *
+ * u64 bpf_ublk_queue_sqe(struct ublk_io_bpf_ctx *ctx, struct io_uring_sqe *sqe, u32 sqe_len, u32 fd)
+ *	Description
+ *		Submit ublk io requests.
+ *	Return
+ *		0 on success.
+ *
  */
 #define ___BPF_FUNC_MAPPER(FN, ctx...)			\
 	FN(unspec, 0, ##ctx)				\
@@ -5699,6 +5706,7 @@ union bpf_attr {
 	FN(user_ringbuf_drain, 209, ##ctx)		\
 	FN(cgrp_storage_get, 210, ##ctx)		\
 	FN(cgrp_storage_delete, 211, ##ctx)		\
+	FN(ublk_queue_sqe, 212, ##ctx)			\
 	/* */
 
 /* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
-- 
2.31.1


  parent reply	other threads:[~2023-02-15  0:41 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-02-15  0:41 [RFC 0/3] Add io_uring & ebpf based methods to implement zero-copy for ublk Xiaoguang Wang
2023-02-15  0:41 ` [RFC 1/3] bpf: add UBLK program type Xiaoguang Wang
2023-02-15  0:41 ` [RFC 2/3] io_uring: enable io_uring to submit sqes located in kernel Xiaoguang Wang
2023-02-15  0:41 ` Xiaoguang Wang [this message]
2023-02-16  8:11   ` [RFC 3/3] ublk_drv: add ebpf support Ming Lei
2023-02-16 12:12     ` Xiaoguang Wang
2023-02-17  3:02       ` Ming Lei
2023-02-17 10:46         ` Ming Lei
2023-02-22 14:13         ` Xiaoguang Wang
2023-02-15  0:46 ` [UBLKSRV] Add " Xiaoguang Wang
2023-02-16  8:28   ` Ming Lei
2023-02-16  9:17     ` Xiaoguang Wang
2023-02-15  8:40 ` [RFC 0/3] Add io_uring & ebpf based methods to implement zero-copy for ublk Ziyang Zhang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230215004122.28917-4-xiaoguang.wang@linux.alibaba.com \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox