public inbox for [email protected]
 help / color / mirror / Atom feed
From: Kanchan Joshi <[email protected]>
To: [email protected], [email protected], [email protected], [email protected]
Cc: [email protected], [email protected],
	[email protected], [email protected],
	[email protected], [email protected], [email protected],
	Kanchan Joshi <[email protected]>
Subject: [PATCH for-next 4/4] nvme-multipath: add multipathing for uring-passthrough commands
Date: Mon, 11 Jul 2022 16:31:55 +0530	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

From: Anuj Gupta <[email protected]>

This heavily reuses nvme-mpath infrastructure for block-path.
Block-path operates on bio, and uses that as long-lived object across
requeue attempts. For passthrough interface io_uring_cmd happens to
be such object, so add a requeue list for that.

If path is not available, uring-cmds are added into that list and
resubmitted on discovery of new path. Existing requeue handler
(nvme_requeue_work) is extended to do this resubmission.

For failed commands, failover is done directly (i.e. without first
queuing into list) by resubmitting individual command from task-work
based callback.

Suggested-by: Sagi Grimberg <[email protected]>
Co-developed-by: Kanchan Joshi <[email protected]>
Signed-off-by: Kanchan Joshi <[email protected]>
Signed-off-by: Anuj Gupta <[email protected]>
---
 drivers/nvme/host/ioctl.c     | 141 ++++++++++++++++++++++++++++++++--
 drivers/nvme/host/multipath.c |  36 ++++++++-
 drivers/nvme/host/nvme.h      |  12 +++
 include/linux/io_uring.h      |   2 +
 4 files changed, 182 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index fc02eddd4977..281c21d3c67d 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -340,12 +340,6 @@ struct nvme_uring_data {
 	__u32	timeout_ms;
 };
 
-static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
-		struct io_uring_cmd *ioucmd)
-{
-	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
-}
-
 static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
 {
 	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
@@ -448,6 +442,14 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	pdu->meta_buffer = nvme_to_user_ptr(d.metadata);
 	pdu->meta_len = d.metadata_len;
 
+	if (issue_flags & IO_URING_F_MPATH) {
+		req->cmd_flags |= REQ_NVME_MPATH;
+		/*
+		 * we would need the buffer address (d.addr field) if we have
+		 * to retry the command. Store it by repurposing ioucmd->cmd
+		 */
+		ioucmd->cmd = (void *)d.addr;
+	}
 	blk_execute_rq_nowait(req, false);
 	return -EIOCBQUEUED;
 }
@@ -665,12 +667,135 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 	int srcu_idx = srcu_read_lock(&head->srcu);
 	struct nvme_ns *ns = nvme_find_path(head);
 	int ret = -EINVAL;
+	struct device *dev = &head->cdev_device;
+
+	if (likely(ns)) {
+		ret = nvme_ns_uring_cmd(ns, ioucmd,
+				issue_flags | IO_URING_F_MPATH);
+	} else if (nvme_available_path(head)) {
+		struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+		struct nvme_uring_cmd *payload = NULL;
+
+		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
+		/*
+		 * We may requeue two kinds of uring commands:
+		 * 1. command failed post submission. pdu->req will be non-null
+		 * for this
+		 * 2. command that could not be submitted because path was not
+		 * available. For this pdu->req is set to NULL.
+		 */
+		pdu->req = NULL;
+		/*
+		 * passthrough command will not be available during retry as it
+		 * is embedded in io_uring's SQE. Hence we allocate/copy here
+		 */
+		payload = kmalloc(sizeof(struct nvme_uring_cmd), GFP_KERNEL);
+		if (!payload) {
+			ret = -ENOMEM;
+			goto out_unlock;
+		}
+		memcpy(payload, ioucmd->cmd, sizeof(struct nvme_uring_cmd));
+		ioucmd->cmd = payload;
 
-	if (ns)
-		ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
+		spin_lock_irq(&head->requeue_ioucmd_lock);
+		ioucmd_list_add(&head->requeue_ioucmd_list, ioucmd);
+		spin_unlock_irq(&head->requeue_ioucmd_lock);
+		ret = -EIOCBQUEUED;
+	} else {
+		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
+	}
+out_unlock:
 	srcu_read_unlock(&head->srcu, srcu_idx);
 	return ret;
 }
+
+int nvme_uring_cmd_io_retry(struct nvme_ns *ns, struct request *oreq,
+		struct io_uring_cmd *ioucmd, struct nvme_uring_cmd_pdu *pdu)
+{
+	struct nvme_ctrl *ctrl = ns->ctrl;
+	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
+	struct nvme_uring_data d;
+	struct nvme_command c;
+	struct request *req;
+	struct bio *obio = oreq->bio;
+	void *meta = NULL;
+
+	memcpy(&c, nvme_req(oreq)->cmd, sizeof(struct nvme_command));
+	d.metadata = (__u64)pdu->meta_buffer;
+	d.metadata_len = pdu->meta_len;
+	d.timeout_ms = oreq->timeout;
+	d.addr = (__u64)ioucmd->cmd;
+	if (obio) {
+		d.data_len = obio->bi_iter.bi_size;
+		blk_rq_unmap_user(obio);
+	} else {
+		d.data_len = 0;
+	}
+	blk_mq_free_request(oreq);
+	req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr),
+			d.data_len, nvme_to_user_ptr(d.metadata),
+			d.metadata_len, 0, &meta, d.timeout_ms ?
+			msecs_to_jiffies(d.timeout_ms) : 0,
+			ioucmd->cmd_op == NVME_URING_CMD_IO_VEC, 0, 0);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	req->end_io = nvme_uring_cmd_end_io;
+	req->end_io_data = ioucmd;
+	pdu->bio = req->bio;
+	pdu->meta = meta;
+	req->cmd_flags |= REQ_NVME_MPATH;
+	blk_execute_rq_nowait(req, false);
+	return -EIOCBQUEUED;
+}
+
+void nvme_ioucmd_mpath_retry(struct io_uring_cmd *ioucmd)
+{
+	struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
+	struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head,
+			cdev);
+	int srcu_idx = srcu_read_lock(&head->srcu);
+	struct nvme_ns *ns = nvme_find_path(head);
+	unsigned int issue_flags = IO_URING_F_SQE128 | IO_URING_F_CQE32 |
+		IO_URING_F_MPATH;
+	struct device *dev = &head->cdev_device;
+
+	if (likely(ns)) {
+		struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+		struct request *oreq = pdu->req;
+		int ret;
+
+		if (oreq == NULL) {
+			/*
+			 * this was not submitted (to device) earlier. For this
+			 * ioucmd->cmd points to persistent memory. Free that
+			 * up post submission
+			 */
+			const void *cmd = ioucmd->cmd;
+
+			ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
+			kfree(cmd);
+		} else {
+			/*
+			 * this was submitted (to device) earlier. Use old
+			 * request, bio (if it exists) and nvme-pdu to recreate
+			 * the command for the discovered path
+			 */
+			ret = nvme_uring_cmd_io_retry(ns, oreq, ioucmd, pdu);
+		}
+		if (ret != -EIOCBQUEUED)
+			io_uring_cmd_done(ioucmd, ret, 0);
+	} else if (nvme_available_path(head)) {
+		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
+		spin_lock_irq(&head->requeue_ioucmd_lock);
+		ioucmd_list_add(&head->requeue_ioucmd_list, ioucmd);
+		spin_unlock_irq(&head->requeue_ioucmd_lock);
+	} else {
+		dev_warn_ratelimited(dev, "no available path - failing I/O\n");
+		io_uring_cmd_done(ioucmd, -EINVAL, 0);
+	}
+	srcu_read_unlock(&head->srcu, srcu_idx);
+}
 #endif /* CONFIG_NVME_MULTIPATH */
 
 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index f26640ccb955..fe5655d98c36 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -6,6 +6,7 @@
 #include <linux/backing-dev.h>
 #include <linux/moduleparam.h>
 #include <linux/vmalloc.h>
+#include <linux/io_uring.h>
 #include <trace/events/block.h>
 #include "nvme.h"
 
@@ -80,6 +81,17 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
 			blk_freeze_queue_start(h->disk->queue);
 }
 
+static void nvme_ioucmd_failover_req(struct request *req, struct nvme_ns *ns)
+{
+	struct io_uring_cmd *ioucmd = req->end_io_data;
+	struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
+
+	/* store the request, to reconstruct the command during retry */
+	pdu->req = req;
+	/* retry in original submitter context */
+	io_uring_cmd_execute_in_task(ioucmd, nvme_ioucmd_mpath_retry);
+}
+
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
@@ -99,6 +111,11 @@ void nvme_failover_req(struct request *req)
 		queue_work(nvme_wq, &ns->ctrl->ana_work);
 	}
 
+	if (blk_rq_is_passthrough(req)) {
+		nvme_ioucmd_failover_req(req, ns);
+		return;
+	}
+
 	spin_lock_irqsave(&ns->head->requeue_lock, flags);
 	for (bio = req->bio; bio; bio = bio->bi_next) {
 		bio_set_dev(bio, ns->head->disk->part0);
@@ -314,7 +331,7 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 	return ns;
 }
 
-static bool nvme_available_path(struct nvme_ns_head *head)
+bool nvme_available_path(struct nvme_ns_head *head)
 {
 	struct nvme_ns *ns;
 
@@ -459,7 +476,9 @@ static void nvme_requeue_work(struct work_struct *work)
 	struct nvme_ns_head *head =
 		container_of(work, struct nvme_ns_head, requeue_work);
 	struct bio *bio, *next;
+	struct io_uring_cmd *ioucmd, *ioucmd_next;
 
+	/* process requeued bios*/
 	spin_lock_irq(&head->requeue_lock);
 	next = bio_list_get(&head->requeue_list);
 	spin_unlock_irq(&head->requeue_lock);
@@ -470,6 +489,21 @@ static void nvme_requeue_work(struct work_struct *work)
 
 		submit_bio_noacct(bio);
 	}
+
+	/* process requeued passthrough-commands */
+	spin_lock_irq(&head->requeue_ioucmd_lock);
+	ioucmd_next = ioucmd_list_get(&head->requeue_ioucmd_list);
+	spin_unlock_irq(&head->requeue_ioucmd_lock);
+
+	while ((ioucmd = ioucmd_next) != NULL) {
+		ioucmd_next = ioucmd->next;
+		ioucmd->next = NULL;
+		/*
+		 * Retry in original submitter context as we would be
+		 * processing the passthrough command again
+		 */
+		io_uring_cmd_execute_in_task(ioucmd, nvme_ioucmd_mpath_retry);
+	}
 }
 
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9d3ff6feda06..125b48e74e72 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -16,6 +16,7 @@
 #include <linux/rcupdate.h>
 #include <linux/wait.h>
 #include <linux/t10-pi.h>
+#include <linux/io_uring.h>
 
 #include <trace/events/block.h>
 
@@ -189,6 +190,12 @@ enum {
 	NVME_REQ_USERCMD		= (1 << 1),
 };
 
+static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
+		struct io_uring_cmd *ioucmd)
+{
+	return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
+}
+
 static inline struct nvme_request *nvme_req(struct request *req)
 {
 	return blk_mq_rq_to_pdu(req);
@@ -442,6 +449,9 @@ struct nvme_ns_head {
 	struct work_struct	requeue_work;
 	struct mutex		lock;
 	unsigned long		flags;
+	/* for uring-passthru multipath handling */
+	struct ioucmd_list	requeue_ioucmd_list;
+	spinlock_t		requeue_ioucmd_lock;
 #define NVME_NSHEAD_DISK_LIVE	0
 	struct nvme_ns __rcu	*current_path[];
 #endif
@@ -830,6 +840,7 @@ int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
 int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
+void nvme_ioucmd_mpath_retry(struct io_uring_cmd *ioucmd);
 int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
 
@@ -844,6 +855,7 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 	return ctrl->ana_log_buf != NULL;
 }
 
+bool nvme_available_path(struct nvme_ns_head *head);
 void nvme_mpath_unfreeze(struct nvme_subsystem *subsys);
 void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index d734599cbcd7..57f4dfc83316 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -15,6 +15,8 @@ enum io_uring_cmd_flags {
 	IO_URING_F_SQE128		= 4,
 	IO_URING_F_CQE32		= 8,
 	IO_URING_F_IOPOLL		= 16,
+	/* to indicate that it is a MPATH req*/
+	IO_URING_F_MPATH		= 32,
 };
 
 struct io_uring_cmd {
-- 
2.25.1


  parent reply	other threads:[~2022-07-11 11:28 UTC|newest]

Thread overview: 52+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <CGME20220711110753epcas5p4169b9e288d15ca35740dbb66a6f6983a@epcas5p4.samsung.com>
2022-07-11 11:01 ` [PATCH for-next 0/4] nvme-multipathing for uring-passthrough Kanchan Joshi
     [not found]   ` <CGME20220711110800epcas5p3d338dd486fd778c5ba5bfe93a91ec8bd@epcas5p3.samsung.com>
2022-07-11 11:01     ` [PATCH for-next 1/4] io_uring, nvme: rename a function Kanchan Joshi
2022-07-14 13:55       ` Ming Lei
     [not found]   ` <CGME20220711110812epcas5p33aa90b23aa62fb11722aa8195754becf@epcas5p3.samsung.com>
2022-07-11 11:01     ` [PATCH for-next 2/4] nvme: compact nvme_uring_cmd_pdu struct Kanchan Joshi
2022-07-12  6:32       ` Christoph Hellwig
     [not found]   ` <CGME20220711110824epcas5p22c8e945cb8c3c3ac46c8c2b5ab55db9b@epcas5p2.samsung.com>
2022-07-11 11:01     ` [PATCH for-next 3/4] io_uring: grow a field in struct io_uring_cmd Kanchan Joshi
2022-07-11 17:00       ` Sagi Grimberg
2022-07-11 17:19         ` Jens Axboe
2022-07-11 17:18       ` Jens Axboe
2022-07-11 17:55         ` Sagi Grimberg
2022-07-11 18:22           ` Sagi Grimberg
2022-07-11 18:24             ` Jens Axboe
2022-07-11 18:58               ` Sagi Grimberg
2022-07-12 11:40               ` Kanchan Joshi
2022-07-14  3:40             ` Ming Lei
2022-07-14  8:19               ` Kanchan Joshi
2022-07-14 15:30                 ` Daniel Wagner
2022-07-15 11:07                   ` Kanchan Joshi
2022-07-18  9:03                     ` Daniel Wagner
     [not found]   ` <CGME20220711110827epcas5p3fd81f142f55ca3048abc38a9ef0d0089@epcas5p3.samsung.com>
2022-07-11 11:01     ` Kanchan Joshi [this message]
2022-07-11 13:51       ` [PATCH for-next 4/4] nvme-multipath: add multipathing for uring-passthrough commands Sagi Grimberg
2022-07-11 15:12         ` Stefan Metzmacher
2022-07-11 16:58           ` Sagi Grimberg
2022-07-11 18:54           ` Kanchan Joshi
2022-07-11 18:37         ` Kanchan Joshi
2022-07-11 19:56           ` Sagi Grimberg
2022-07-12  4:23             ` Kanchan Joshi
2022-07-12 21:26               ` Sagi Grimberg
2022-07-13  5:37                 ` Kanchan Joshi
2022-07-13  9:03                   ` Sagi Grimberg
2022-07-13 11:28                     ` Kanchan Joshi
2022-07-13 12:17                       ` Sagi Grimberg
2022-07-14 15:14                   ` Ming Lei
2022-07-14 23:05                     ` Kanchan Joshi
2022-07-15  1:35                       ` Ming Lei
2022-07-15  1:46                         ` Ming Lei
2022-07-15  4:24                           ` Kanchan Joshi
2022-07-12  6:52       ` Christoph Hellwig
2022-07-12 11:33         ` Kanchan Joshi
2022-07-12 20:13         ` Sagi Grimberg
2022-07-13  5:36           ` Christoph Hellwig
2022-07-13  8:04             ` Sagi Grimberg
2022-07-13 10:12               ` Christoph Hellwig
2022-07-13 11:00                 ` Sagi Grimberg
2022-07-13 11:28                   ` Christoph Hellwig
2022-07-13 12:16                     ` Sagi Grimberg
2022-07-13 11:49                   ` Hannes Reinecke
2022-07-13 12:43                     ` Sagi Grimberg
2022-07-13 13:30                       ` Hannes Reinecke
2022-07-13 13:41                         ` Sagi Grimberg
2022-07-13 14:07                           ` Hannes Reinecke
2022-07-13 15:59                             ` Sagi Grimberg

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox