From: Kanchan Joshi <[email protected]>
To: [email protected], [email protected], [email protected], [email protected]
Cc: [email protected], [email protected],
	[email protected], [email protected],
	[email protected], [email protected],
	Kanchan Joshi <[email protected]>,
	Anuj Gupta <[email protected]>
Subject: [RFC PATCH 12/12] pci: implement submission/completion for rawq commands
Date: Sat, 29 Apr 2023 15:09:25 +0530	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

Implement ->queue_uring_cmd and ->poll_uring_cmd for leaner submission
and completion. Both operate on "struct io_uring_cmd" directly, without
having to jump through the hoops of allocating a request and a bio.

This is currently enabled only for small (single-segment), premapped
(fixedbufs) IOs.

Signed-off-by: Kanchan Joshi <[email protected]>
Signed-off-by: Anuj Gupta <[email protected]>
---
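Note (illustration only, not part of the patch): the sketch below shows the
kind of IO this fast path targets - a single-segment, fixed-buffer (fixedbufs)
NVMe read issued from userspace as an io_uring passthrough command. It uses
only existing uAPI (liburing and struct nvme_uring_cmd from
<linux/nvme_ioctl.h>); how the ring gets attached to a raw queue follows the
registration interface added earlier in this series and is not shown here.
The device path, nsid, 512-byte LBA size, and the availability of a polled
queue for IORING_SETUP_IOPOLL are assumptions.

#include <liburing.h>
#include <linux/nvme_ioctl.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct nvme_uring_cmd *cmd;
	struct iovec iov;
	void *buf;
	int fd, ret;

	/* NVMe passthrough needs big SQEs/CQEs; IOPOLL reaps completions by polling */
	ret = io_uring_queue_init(8, &ring, IORING_SETUP_SQE128 |
				  IORING_SETUP_CQE32 | IORING_SETUP_IOPOLL);
	if (ret)
		return 1;

	/* premap the buffer so per-IO pin/unpin can be skipped (fixedbufs) */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	iov.iov_base = buf;
	iov.iov_len = 4096;
	if (io_uring_register_buffers(&ring, &iov, 1))
		return 1;

	fd = open("/dev/ng0n1", O_RDONLY);	/* NVMe generic char device */
	if (fd < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = fd;
	sqe->cmd_op = NVME_URING_CMD_IO;
	sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;	/* use the registered buffer */
	sqe->buf_index = 0;

	/* NVMe read of 8 x 512-byte LBAs (4096 bytes) from LBA 0 of namespace 1 */
	cmd = (struct nvme_uring_cmd *)sqe->cmd;
	memset(cmd, 0, sizeof(*cmd));
	cmd->opcode = 0x02;			/* nvme_cmd_read */
	cmd->nsid = 1;
	cmd->addr = (__u64)(uintptr_t)buf;
	cmd->data_len = 4096;
	cmd->cdw10 = 0;				/* starting LBA, lower 32 bits */
	cmd->cdw12 = 7;				/* number of LBAs, zero-based */

	io_uring_submit(&ring);
	ret = io_uring_wait_cqe(&ring, &cqe);	/* polled under IORING_SETUP_IOPOLL */
	io_uring_cqe_seen(&ring, cqe);

	io_uring_unregister_buffers(&ring);
	io_uring_queue_exit(&ring);
	return ret;
}
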
 drivers/nvme/host/pci.c | 194 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 192 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 30d7a1a6eaab..ca0580c4e977 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -28,6 +28,8 @@
 #include <linux/io-64-nonatomic-hi-lo.h>
 #include <linux/sed-opal.h>
 #include <linux/pci-p2pdma.h>
+#include <linux/nvme_ioctl.h>
+#include <linux/io_uring.h>
 
 #include "trace.h"
 #include "nvme.h"
@@ -210,6 +212,7 @@ struct nvme_queue {
 	unsigned long *cmdid_bmp;
 	spinlock_t cmdid_lock;
 	struct nvme_iod *iod;
+	u8	reg_id;
 	 /* only used for poll queues: */
 	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
 	struct nvme_completion *cqes;
@@ -249,7 +252,11 @@ union nvme_descriptor {
  * to the actual struct scatterlist.
  */
 struct nvme_iod {
-	struct nvme_request req;
+	union {
+		struct nvme_request req;
+		/* for raw-queue */
+		struct io_uring_cmd *ioucmd;
+	};
 	struct nvme_command cmd;
 	bool aborted;
 	s8 nr_allocations;	/* PRP list pool allocations. 0 means small
@@ -1025,6 +1032,13 @@ static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
 		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
 }
 
+static void nvme_pci_put_cmdid(struct nvme_queue *nvmeq, int id)
+{
+	spin_lock(&nvmeq->cmdid_lock);
+	clear_bit(id, nvmeq->cmdid_bmp);
+	spin_unlock(&nvmeq->cmdid_lock);
+}
+
 static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 {
 	if (!nvmeq->qid)
@@ -1032,6 +1046,37 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
+static inline struct nvme_uring_direct_pdu *nvme_uring_cmd_direct_pdu(
+		struct io_uring_cmd *ioucmd)
+{
+	return (struct nvme_uring_direct_pdu *)&ioucmd->pdu;
+}
+
+static inline void nvme_handle_cqe_rawq(struct nvme_queue *nvmeq,
+					struct nvme_completion *cqe,
+					__u16 command_id)
+{
+	struct nvme_dev *dev = nvmeq->dev;
+	u32 status = le16_to_cpu(cqe->status) >> 1;
+	u64 result = cqe->result.u64;
+	struct nvme_iod *iod;
+	int id, reg_id;
+
+	reg_id = nvme_genctr_from_cid(command_id);
+	/* we should not encounter completions from a past registration */
+	WARN_ON_ONCE(nvmeq->reg_id != reg_id);
+
+	id = nvme_tag_from_cid(command_id);
+	iod = &nvmeq->iod[id];
+
+	if (iod->dma_len)
+		dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
+		nvme_is_write(&iod->cmd) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+
+	nvme_pci_put_cmdid(nvmeq, id);
+	io_uring_cmd_done(iod->ioucmd, status, result, IO_URING_F_UNLOCKED);
+}
+
 static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 				   struct io_comp_batch *iob, u16 idx)
 {
@@ -1039,6 +1084,9 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 	__u16 command_id = READ_ONCE(cqe->command_id);
 	struct request *req;
 
+	if (test_bit(NVMEQ_RAW, &nvmeq->flags))
+		return nvme_handle_cqe_rawq(nvmeq, cqe, command_id);
+
 	/*
 	 * AEN requests are special as they don't time out and can
 	 * survive any kind of queue freeze and often don't respond to
@@ -1151,6 +1199,25 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
 	return found;
 }
 
+static int nvme_poll_uring_cmd(struct io_uring_cmd *ioucmd, int qid,
+			       struct io_comp_batch *iob)
+
+{
+	struct nvme_uring_direct_pdu *pdu = nvme_uring_cmd_direct_pdu(ioucmd);
+	struct nvme_dev *dev = to_nvme_dev(pdu->ns->ctrl);
+	struct nvme_queue *nvmeq = &dev->queues[qid];
+	bool found;
+
+	if (!nvme_cqe_pending(nvmeq))
+		return 0;
+
+	spin_lock(&nvmeq->cq_poll_lock);
+	found = nvme_poll_cq(nvmeq, iob);
+	spin_unlock(&nvmeq->cq_poll_lock);
+
+	return found;
+}
+
 static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
 {
 	struct nvme_dev *dev = to_nvme_dev(ctrl);
@@ -1762,6 +1829,22 @@ static int nvme_pci_alloc_cmdid_bmp(struct nvme_queue *nvmeq)
 	return 0;
 }
 
+static int nvme_pci_get_cmdid(struct nvme_queue *nvmeq)
+{
+	int id = 0, size = nvmeq->q_depth - 1;
+
+	spin_lock(&nvmeq->cmdid_lock);
+	id = find_first_zero_bit(nvmeq->cmdid_bmp, size);
+	if (id >= size) {
+		id = -EBUSY;
+		goto unlock;
+	}
+	set_bit(id, nvmeq->cmdid_bmp);
+unlock:
+	spin_unlock(&nvmeq->cmdid_lock);
+	return id;
+}
+
 static int nvme_pci_alloc_iod_array(struct nvme_queue *nvmeq)
 {
 	if (!test_bit(NVMEQ_RAW, &nvmeq->flags))
@@ -1793,13 +1876,17 @@ static int nvme_pci_register_queue(void *data)
 	struct nvme_ns *ns = (struct nvme_ns *) data;
 	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
 	int qid, ret;
+	struct nvme_queue *nvmeq;
 
 	qid = nvme_pci_get_rawq(dev);
 	if (qid > 0) {
 		/* setup command-id bitmap and iod array */
-		ret = nvme_pci_setup_rawq(&dev->queues[qid]);
+		nvmeq = &dev->queues[qid];
+		ret = nvme_pci_setup_rawq(nvmeq);
 		if (ret < 0)
 			qid = ret;
+		else
+			nvmeq->reg_id++;
 	}
 	return qid;
 }
@@ -1812,6 +1899,107 @@ static int nvme_pci_unregister_queue(void *data, int qid)
 	return nvme_pci_put_rawq(dev, qid);
 }
 
+static int nvme_map_io_fb(struct request_queue *q, struct io_uring_cmd *ioucmd,
+			struct nvme_dev *dev, struct nvme_iod *iod,
+			unsigned long addr, unsigned int buf_len)
+{
+	struct nvme_command *cmnd = &iod->cmd;
+	int ret, rw = nvme_is_write(cmnd);
+	struct iov_iter iter;
+	size_t nr_iter, nr_segs;
+	struct bio_vec *bv;
+
+	if (!(ioucmd->flags & IORING_URING_CMD_FIXED))
+		return -EINVAL;
+
+	ret = io_uring_cmd_import_fixed(addr, buf_len, rw, &iter, ioucmd);
+	if (ret < 0)
+		return ret;
+	nr_iter = iov_iter_count(&iter);
+	nr_segs = iter.nr_segs;
+
+	/* device will do these checks anyway, so why do duplicate work? */
+	if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q))
+		goto err;
+	if (nr_segs > queue_max_segments(q))
+		goto err;
+	/* this will be attempted from regular path instead */
+	if (nr_segs > 1)
+		goto err;
+
+	bv = (struct bio_vec *)&iter.bvec[0];
+	if (bv->bv_offset + bv->bv_len <= NVME_CTRL_PAGE_SIZE * 2)
+		return nvme_setup_prp_simple(dev, iod,
+				(struct nvme_rw_command *)cmnd,
+				bv, rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+err:
+	return -EINVAL;
+}
+
+static int nvme_alloc_cmd_from_q(struct nvme_queue *nvmeq, int *cmdid,
+				struct nvme_iod **iod)
+{
+	int id;
+
+	id = nvme_pci_get_cmdid(nvmeq);
+	if (id < 0)
+		return id;
+	*cmdid = id | nvme_cid_install_genctr(nvmeq->reg_id);
+	*iod = &nvmeq->iod[id];
+	return 0;
+}
+
+static int nvme_pci_queue_ucmd(struct io_uring_cmd *ioucmd, int qid)
+{
+	struct nvme_uring_direct_pdu *pdu = nvme_uring_cmd_direct_pdu(ioucmd);
+	struct nvme_ns *ns = pdu->ns;
+	struct nvme_ctrl *ctrl = ns->ctrl;
+	struct nvme_dev *dev = to_nvme_dev(ctrl);
+	struct nvme_command *nvme_cmd;
+	struct nvme_uring_data d;
+	int ret, cmdid;
+	struct nvme_iod *iod;
+	struct nvme_queue *nvmeq = &dev->queues[qid];
+
+	ret = nvme_alloc_cmd_from_q(nvmeq, &cmdid, &iod);
+	if (ret)
+		return ret;
+
+	iod->ioucmd = ioucmd;
+	nvme_cmd = &iod->cmd;
+	ret = nvme_prep_cmd_from_ioucmd(ctrl, ns, ioucmd, nvme_cmd, &d);
+	if (ret)
+		goto out;
+
+	nvme_cmd->common.command_id = cmdid;
+	iod->aborted = false;
+	iod->nr_allocations = -1;
+	iod->sgt.nents = 0;
+	ret = nvme_map_io_fb(ns->queue, ioucmd, dev, iod, d.addr, d.data_len);
+	if (ret)
+		goto out;
+
+	/*
+	 * Since this nvmeq is exclusive to a single submitter (io_uring
+	 * follows a one-thread-per-ring model), the lock should ideally
+	 * not be needed. But io_uring and NVMe SQEs have different
+	 * lifetimes: an io_uring SQE can be reused right after
+	 * submission, while an NVMe SQE must stay untouched until
+	 * completion. So when submission runs out of SQ space, it gets
+	 * deferred to the io_uring worker, and the lock cannot be
+	 * skipped. Thanks to the one-thread-per-ring model and polled
+	 * completion, contention is nevertheless rare in most cases.
+	 */
+	spin_lock(&nvmeq->sq_lock);
+	nvme_sq_copy_cmd(nvmeq, &iod->cmd);
+	nvme_write_sq_db(nvmeq, true);
+	spin_unlock(&nvmeq->sq_lock);
+	return -EIOCBQUEUED;
+out:
+	nvme_pci_put_cmdid(nvmeq, cmdid);
+	return ret;
+}
+
 static const struct blk_mq_ops nvme_mq_admin_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
@@ -1832,6 +2020,8 @@ static const struct blk_mq_ops nvme_mq_ops = {
 	.poll		= nvme_poll,
 	.register_queue	= nvme_pci_register_queue,
 	.unregister_queue =  nvme_pci_unregister_queue,
+	.queue_uring_cmd = nvme_pci_queue_ucmd,
+	.poll_uring_cmd	= nvme_poll_uring_cmd,
 };
 
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
-- 
2.25.1

