From: Keith Busch <[email protected]>
To: <[email protected]>, <[email protected]>,
<[email protected]>, <[email protected]>
Cc: <[email protected]>, <[email protected]>,
Alexander Viro <[email protected]>,
Keith Busch <[email protected]>
Subject: [PATCH 5/5] nvme-pci: implement dma_map support
Date: Tue, 26 Jul 2022 10:38:14 -0700 [thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
From: Keith Busch <[email protected]>
Implement callbacks to convert a registered bio_vec to a prp list, and
use this for each IO that uses the returned tag. This saves repeated IO
conversions and dma mapping/unmapping. In many cases, the driver can
skip per-IO pool allocations entirely, saving potentially significant CPU
cycles.
Signed-off-by: Keith Busch <[email protected]>
---
drivers/nvme/host/pci.c | 291 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 283 insertions(+), 8 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 644664098ae7..571d955eaef0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -110,6 +110,14 @@ struct nvme_queue;
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
+/*
+ * Driver-private state for one pre-registered buffer. It is allocated and
+ * filled in by nvme_pci_dma_map(), released by nvme_pci_dma_unmap(), and
+ * looked up per request through blk_rq_dma_tag().
+ */
+struct nvme_dma_mapping {
+ /* Number of entries in @prps (nr_vecs * NVMe pages per host page). */
+ int nr_pages;
+ /* bv_offset of the first registered bio_vec. */
+ u16 offset;
+ /* Explicit padding; not referenced by the code in this patch. */
+ u8 rsvd[2];
+ /* DMA address of the @prps array, as returned by dma_alloc_coherent(). */
+ dma_addr_t prp_dma_addr;
+ /* Little-endian DMA address of each NVMe controller page of the buffer. */
+ __le64 *prps;
+};
+
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
@@ -544,6 +552,35 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
return true;
}
+/*
+ * Sync the pages of a completed, pre-mapped read back to the CPU.
+ *
+ * Nothing is done for writes, or when dma_need_sync() reports that the
+ * first page of the transfer does not need explicit syncing.
+ *
+ * @dev: device the request was issued to
+ * @req: request carrying a dma tag (see blk_rq_dma_tag())
+ */
+static void nvme_sync_dma(struct nvme_dev *dev, struct request *req)
+{
+	int index, offset, i, length, nprps;
+	struct nvme_dma_mapping *mapping;
+	bool needs_sync;
+
+	mapping = blk_rq_dma_tag(req);
+	offset = blk_rq_dma_offset(req) + mapping->offset;
+	index = offset >> NVME_CTRL_PAGE_SHIFT;
+	needs_sync = rq_data_dir(req) == READ &&
+		 dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[index]));
+
+	if (!needs_sync)
+		return;
+
+	/*
+	 * length is what remains after the (possibly partial) first page;
+	 * nprps is therefore the number of pages that follow the first one.
+	 */
+	offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
+	length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
+	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
+
+	/* The data in the first page starts at @offset, not at the page base. */
+	dma_sync_single_for_cpu(dev->dev,
+			le64_to_cpu(mapping->prps[index++]) + offset,
+			NVME_CTRL_PAGE_SIZE - offset, DMA_FROM_DEVICE);
+
+	/* Start at 0: nprps does not include the first page synced above. */
+	for (i = 0; i < nprps; i++) {
+		dma_sync_single_for_cpu(dev->dev,
+				le64_to_cpu(mapping->prps[index++]),
+				NVME_CTRL_PAGE_SIZE, DMA_FROM_DEVICE);
+	}
+}
+
static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
{
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
@@ -576,6 +613,21 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
}
}
+/*
+ * Release whatever descriptor list (PRP or SGL) is attached to @req.
+ *
+ * A negative iod->npages means there is nothing to free. Zero means a
+ * single allocation from the small pool. Any positive count is handed to
+ * the SGL or PRP teardown helper depending on iod->use_sgl.
+ */
+static void nvme_free_prp_chain(struct nvme_dev *dev, struct request *req,
+				struct nvme_iod *iod)
+{
+	if (iod->npages < 0)
+		return;
+
+	if (iod->npages) {
+		/* One or more full-size descriptor pages. */
+		if (iod->use_sgl)
+			nvme_free_sgls(dev, req);
+		else
+			nvme_free_prps(dev, req);
+		return;
+	}
+
+	/* Exactly one descriptor list taken from the small pool. */
+	dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
+		      iod->first_dma);
+}
+
static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -595,18 +647,15 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
rq_dma_dir(req));
return;
+ } else if (blk_rq_dma_tag(req)) {
+ nvme_sync_dma(dev, req);
+ nvme_free_prp_chain(dev, req, iod);
+ return;
}
WARN_ON_ONCE(!iod->nents);
-
nvme_unmap_sg(dev, req);
- if (iod->npages == 0)
- dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
- iod->first_dma);
- else if (iod->use_sgl)
- nvme_free_sgls(dev, req);
- else
- nvme_free_prps(dev, req);
+ nvme_free_prp_chain(dev, req, iod);
mempool_free(iod->sg, dev->iod_mempool);
}
@@ -835,6 +884,122 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
return BLK_STS_OK;
}
+/*
+ * Fill in the data pointer of @cmnd for a request whose buffer was
+ * pre-registered through nvme_pci_dma_map(), reusing the PRP entries saved
+ * in the mapping instead of mapping the data again.
+ *
+ * PRP1 is the saved entry for the first page plus the in-page offset.
+ * PRP2 is, depending on the transfer size:
+ *  - left untouched when the transfer ends within the first page,
+ *  - the next saved entry when exactly one more page is needed,
+ *  - a pointer straight into the saved PRP array when the remaining
+ *    entries do not cross a controller page boundary there,
+ *  - otherwise a newly allocated (and, if needed, chained) PRP list that
+ *    is copied from the saved entries.
+ *
+ * For writes where dma_need_sync() reports that syncing is required, each
+ * controller page of the transfer is synced for the device.
+ *
+ * Returns BLK_STS_OK on success, BLK_STS_RESOURCE if a PRP list could not
+ * be allocated.
+ */
+static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
+				   struct nvme_rw_command *cmnd,
+				   struct nvme_iod *iod)
+{
+	static const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
+	dma_addr_t prp_list_start, prp_list_end, prp_dma;
+	int index, offset, i, length, nprps, nprps_left, sync_index;
+	void **list = nvme_pci_iod_list(req);
+	struct nvme_dma_mapping *mapping;
+	struct dma_pool *pool;
+	__le64 *prp_list;
+	bool needs_sync;
+
+	mapping = blk_rq_dma_tag(req);
+	offset = blk_rq_dma_offset(req) + mapping->offset;
+	index = offset >> NVME_CTRL_PAGE_SHIFT;
+	offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
+	needs_sync = rq_data_dir(req) == WRITE &&
+		 dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[index]));
+
+	/*
+	 * XXX: For PAGE_SIZE > NVME_CTRL_PAGE_SIZE, is it faster to save the
+	 * PRP list implementation and sync multiple partial pages, more
+	 * efficient to sync PAGE_SIZE and build the PRP list per-IO from a
+	 * host PAGE_SIZE representation, or cleverly sync physically
+	 * contiguous regions?
+	 */
+	if (needs_sync) {
+		/* The data in the first page starts at @offset. */
+		dma_sync_single_for_device(dev->dev,
+			le64_to_cpu(mapping->prps[index]) + offset,
+			NVME_CTRL_PAGE_SIZE - offset, DMA_TO_DEVICE);
+	}
+
+	/* Bytes left after the (possibly partial) first page. */
+	length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
+	cmnd->dptr.prp1 =
+		cpu_to_le64(le64_to_cpu(mapping->prps[index++]) + offset);
+
+	if (length <= 0)
+		return BLK_STS_OK;
+
+	if (length <= NVME_CTRL_PAGE_SIZE) {
+		if (needs_sync)
+			dma_sync_single_for_device(dev->dev,
+				le64_to_cpu(mapping->prps[index]),
+				NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
+		cmnd->dptr.prp2 = mapping->prps[index];
+		return BLK_STS_OK;
+	}
+
+	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
+	/*
+	 * The list-building loop below advances index, so remember where the
+	 * remaining pages start for the sync pass at the end.
+	 */
+	sync_index = index;
+	prp_list_start = mapping->prp_dma_addr + 8 * index;
+	prp_list_end = prp_list_start + 8 * nprps;
+
+	/* Optimization when remaining list fits in one nvme page */
+	if ((prp_list_start >> NVME_CTRL_PAGE_SHIFT) ==
+	    (prp_list_end >> NVME_CTRL_PAGE_SHIFT)) {
+		cmnd->dptr.prp2 = cpu_to_le64(prp_list_start);
+		goto sync;
+	}
+
+	if (nprps <= (256 / 8)) {
+		pool = dev->prp_small_pool;
+		iod->npages = 0;
+	} else {
+		pool = dev->prp_page_pool;
+		iod->npages = 1;
+	}
+
+	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+	if (!prp_list) {
+		iod->npages = -1;
+		return BLK_STS_RESOURCE;
+	}
+
+	list[0] = prp_list;
+	iod->first_dma = prp_dma;
+	nprps_left = nprps;
+	for (;;) {
+		dma_addr_t next_prp_dma;
+		__le64 *next_prp_list;
+
+		/* Everything that is left fits in the current list page. */
+		if (nprps_left <= last_prp + 1) {
+			memcpy(prp_list, &mapping->prps[index], nprps_left * 8);
+			break;
+		}
+
+		/*
+		 * Fill all but the final slot with data entries; the final
+		 * slot becomes the pointer to the next list page.
+		 */
+		memcpy(prp_list, &mapping->prps[index],
+		       NVME_CTRL_PAGE_SIZE - 8);
+		nprps_left -= last_prp;
+		index += last_prp;
+
+		next_prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &next_prp_dma);
+		if (!next_prp_list)
+			goto free_prps;
+
+		prp_list[last_prp] = cpu_to_le64(next_prp_dma);
+		prp_list = next_prp_list;
+		prp_dma = next_prp_dma;
+		list[iod->npages++] = prp_list;
+	}
+	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
+
+sync:
+	if (!needs_sync)
+		return BLK_STS_OK;
+
+	for (i = 0; i < nprps; i++)
+		dma_sync_single_for_device(dev->dev,
+			le64_to_cpu(mapping->prps[sync_index + i]),
+			NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
+	return BLK_STS_OK;
+
+free_prps:
+	nvme_free_prps(dev, req);
+	return BLK_STS_RESOURCE;
+}
+
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
@@ -842,6 +1007,12 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
blk_status_t ret = BLK_STS_RESOURCE;
int nr_mapped;
+ if (blk_rq_dma_tag(req)) {
+ iod->dma_len = 0;
+ iod->use_sgl = false;
+ return nvme_premapped(dev, req, &cmnd->rw, iod);
+ }
+
if (blk_rq_nr_phys_segments(req) == 1) {
struct bio_vec bv = req_bvec(req);
@@ -1732,6 +1903,106 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
return result;
}
+#ifdef CONFIG_HAS_DMA
+/*
+ * Pre-map a registered buffer and save its PRP entries for later reuse.
+ *
+ * Important: bvec must be describing a virtually contiguous buffer. Only
+ * the first bio_vec may have a non-zero offset, and every bio_vec except
+ * the last must end on a host page boundary; anything else is rejected
+ * with -EINVAL.
+ *
+ * Returns the new mapping (to be released with nvme_pci_dma_unmap()) or an
+ * ERR_PTR() on failure. A reference on the device is held for as long as
+ * the mapping exists.
+ */
+static void *nvme_pci_dma_map(struct request_queue *q,
+			      struct bio_vec *bvec, int nr_vecs)
+{
+	/* Number of NVMe controller pages per host page. */
+	const int nvme_pages = 1 << (PAGE_SHIFT - NVME_CTRL_PAGE_SHIFT);
+	struct nvme_ns *ns = q->queuedata;
+	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
+	struct nvme_dma_mapping *mapping;
+	int i, j, k, size, ret = -ENOMEM;
+
+	if (nr_vecs <= 0)
+		return ERR_PTR(-EINVAL);
+
+	mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
+	if (!mapping)
+		return ERR_PTR(-ENOMEM);
+
+	mapping->nr_pages = nr_vecs * nvme_pages;
+	size = sizeof(*mapping->prps) * mapping->nr_pages;
+	mapping->prps = dma_alloc_coherent(dev->dev, size,
+					   &mapping->prp_dma_addr, GFP_KERNEL);
+	if (!mapping->prps)
+		goto free_mapping;
+
+	for (i = 0, k = 0; i < nr_vecs; i++) {
+		struct bio_vec *bv = bvec + i;
+		dma_addr_t dma_addr;
+
+		if (i == 0) {
+			mapping->offset = bv->bv_offset;
+		} else if (bv->bv_offset) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		if (bv->bv_offset + bv->bv_len != PAGE_SIZE &&
+		    i < nr_vecs - 1) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		dma_addr = dma_map_bvec(dev->dev, bv, DMA_BIDIRECTIONAL, 0);
+		if (dma_mapping_error(dev->dev, dma_addr)) {
+			ret = -EIO;
+			goto err;
+		}
+
+		/*
+		 * Store page-base addresses; the first vector's offset is
+		 * kept separately in mapping->offset.
+		 */
+		if (i == 0)
+			dma_addr -= mapping->offset;
+
+		for (j = 0; j < nvme_pages; j++)
+			mapping->prps[k++] = cpu_to_le64(dma_addr +
+						j * NVME_CTRL_PAGE_SIZE);
+	}
+
+	get_device(dev->dev);
+	return mapping;
+
+err:
+	/*
+	 * NOTE(review): the first entry was stored with the bvec offset
+	 * subtracted and the last bvec may be shorter than a page, so the
+	 * address/length passed here may not match what was mapped - confirm
+	 * this is acceptable for all DMA backends.
+	 */
+	for (i = 0; i < k; i += nvme_pages) {
+		__u64 dma_addr = le64_to_cpu(mapping->prps[i]);
+
+		dma_unmap_page(dev->dev, dma_addr,
+			       PAGE_SIZE - offset_in_page(dma_addr),
+			       DMA_BIDIRECTIONAL);
+	}
+
+	dma_free_coherent(dev->dev, size, (void *)mapping->prps,
+			  mapping->prp_dma_addr);
+free_mapping:
+	kfree(mapping);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Tear down a mapping created by nvme_pci_dma_map(): unmap each host page,
+ * free the saved PRP array and the mapping itself, and drop the device
+ * reference taken at map time.
+ *
+ * @q:       request queue the mapping was registered on
+ * @dma_tag: the struct nvme_dma_mapping returned by nvme_pci_dma_map()
+ */
+static void nvme_pci_dma_unmap(struct request_queue *q, void *dma_tag)
+{
+	/* Number of NVMe controller pages per host page. */
+	const int nvme_pages = 1 << (PAGE_SHIFT - NVME_CTRL_PAGE_SHIFT);
+	struct nvme_ns *ns = q->queuedata;
+	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
+	struct nvme_dma_mapping *mapping = dma_tag;
+	int i;
+
+	/* Only the first PRP entry of each host page identifies a mapping. */
+	for (i = 0; i < mapping->nr_pages; i += nvme_pages) {
+		__u64 dma_addr = le64_to_cpu(mapping->prps[i]);
+
+		dma_unmap_page(dev->dev, dma_addr,
+			       PAGE_SIZE - offset_in_page(dma_addr),
+			       DMA_BIDIRECTIONAL);
+	}
+
+	dma_free_coherent(dev->dev, mapping->nr_pages * sizeof(*mapping->prps),
+			  (void *)mapping->prps, mapping->prp_dma_addr);
+	kfree(mapping);
+	put_device(dev->dev);
+}
+#endif
+
static const struct blk_mq_ops nvme_mq_admin_ops = {
.queue_rq = nvme_queue_rq,
.complete = nvme_pci_complete_rq,
@@ -1750,6 +2021,10 @@ static const struct blk_mq_ops nvme_mq_ops = {
.map_queues = nvme_pci_map_queues,
.timeout = nvme_timeout,
.poll = nvme_poll,
+#ifdef CONFIG_HAS_DMA
+ .dma_map = nvme_pci_dma_map,
+ .dma_unmap = nvme_pci_dma_unmap,
+#endif
};
static void nvme_dev_remove_admin(struct nvme_dev *dev)
--
2.30.2
prev parent reply other threads:[~2022-07-26 17:38 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-07-26 17:38 [PATCH 0/5] dma mapping optimisations Keith Busch
2022-07-26 17:38 ` [PATCH 1/5] blk-mq: add ops to dma map bvec Keith Busch
2022-07-26 17:38 ` [PATCH 2/5] iov_iter: introduce type for preregistered dma tags Keith Busch
2022-07-26 23:10 ` Al Viro
2022-07-27 13:52 ` Keith Busch
2022-07-26 17:38 ` [PATCH 3/5] block: add dma tag bio type Keith Busch
2022-07-26 17:38 ` [PATCH 4/5] io_uring: add support for dma pre-mapping Keith Busch
2022-07-26 23:12 ` Al Viro
2022-07-27 13:58 ` Keith Busch
2022-07-27 14:04 ` Al Viro
2022-07-27 15:04 ` Keith Busch
2022-07-27 22:32 ` Dave Chinner
2022-07-27 23:00 ` Keith Busch
2022-07-28 2:35 ` Dave Chinner
2022-07-28 13:25 ` Keith Busch
2022-07-27 14:11 ` Al Viro
2022-07-27 14:48 ` Keith Busch
2022-07-27 15:26 ` Al Viro
2022-07-26 17:38 ` Keith Busch [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
[email protected] \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox