public inbox for [email protected]
 help / color / mirror / Atom feed
From: Christoph Hellwig <[email protected]>
To: Jens Axboe <[email protected]>
Cc: Christian Brauner <[email protected]>,
	Keith Busch <[email protected]>, Sagi Grimberg <[email protected]>,
	Kanchan Joshi <[email protected]>,
	Hui Qi <[email protected]>,
	Nitesh Shetty <[email protected]>, Jan Kara <[email protected]>,
	Pavel Begunkov <[email protected]>,
	[email protected], [email protected],
	[email protected], [email protected],
	[email protected]
Subject: [PATCH 15/15] RFC: block: allow write streams on partitions
Date: Tue, 19 Nov 2024 13:16:29 +0100	[thread overview]
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>

By default assign all write streams to partition 1, and add a hacky
sysfs file that distributes them all equally.

This is implemented by storing the number of per-partition write
streams in struct block_device, as well as the offset to the global
ones, and then remapping the write streams in the I/O submission
path.

The sysfs interface is hacky and undocumented; better suggestions are
welcome from actual users of write streams on partitions.

Signed-off-by: Christoph Hellwig <[email protected]>
---
 block/bdev.c              |  9 +++++++
 block/blk-core.c          |  2 ++
 block/genhd.c             | 52 +++++++++++++++++++++++++++++++++++++++
 block/partitions/core.c   |  6 +++--
 include/linux/blk_types.h |  7 ++++++
 include/linux/blkdev.h    |  2 +-
 6 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index c23245f1fdfe..f3549a8cdb3f 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -440,6 +440,15 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 		return NULL;
 	}
 	bdev->bd_disk = disk;
+
+	/*
+	 * Assign all write streams to the first partition by default.
+	 */
+	if (partno == 1) {
+		bdev->bd_part_write_stream_start = 0;
+		bdev->bd_part_write_streams = bdev_max_write_streams(bdev);
+	}
+
 	return bdev;
 }
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 666efe8fa202..9654937f9b2d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -574,6 +574,8 @@ static int blk_partition_remap(struct bio *bio)
 		return -EIO;
 	if (bio_sectors(bio)) {
 		bio->bi_iter.bi_sector += p->bd_start_sect;
+		if (bio->bi_write_stream)
+			bio->bi_write_stream += p->bd_part_write_stream_start;
 		trace_block_bio_remap(bio, p->bd_dev,
 				      bio->bi_iter.bi_sector -
 				      p->bd_start_sect);
diff --git a/block/genhd.c b/block/genhd.c
index 79230c109fca..3156c70522b6 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1070,6 +1070,54 @@ static ssize_t partscan_show(struct device *dev,
 	return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
 }
 
+static ssize_t disk_distribute_write_streams_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	/* Placeholder value; anything useful to show here like the ranges? */
+	return sysfs_emit(buf, "0\n");
+}
+
+static ssize_t disk_distribute_write_streams_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct block_device *bdev = disk->part0, *part;
+	unsigned short total_write_streams =
+		disk->queue->limits.max_write_streams;
+	unsigned short part_write_streams, part_write_stream_start = 0;
+	unsigned long nr_partitions = 0, idx;
+	int error = 0;
+
+	if (!total_write_streams)
+		return -EINVAL;
+
+	mutex_lock(&disk->open_mutex);
+	if (atomic_read(&bdev->bd_openers)) {
+		error = -EBUSY;
+		goto out_unlock;
+	}
+
+	xa_for_each_start(&disk->part_tbl, idx, part, 1)
+		nr_partitions++;
+	if (!nr_partitions)
+		goto out_unlock;
+
+	/* Split evenly; any remainder of the division stays unassigned. */
+	part_write_streams = total_write_streams / nr_partitions;
+	xa_for_each_start(&disk->part_tbl, idx, part, 1) {
+		part->bd_part_write_streams = part_write_streams;
+		part->bd_part_write_stream_start = part_write_stream_start;
+		dev_info(dev,
+			"assigning %u write streams at %u to partition %lu\n",
+			part_write_streams, part_write_stream_start, idx);
+		/* Advance only after logging so the start offset is right. */
+		part_write_stream_start += part_write_streams;
+	}
+out_unlock:
+	mutex_unlock(&disk->open_mutex);
+	return error ? error : count;
+}
+
 static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
@@ -1084,6 +1132,9 @@ static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
 static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
 static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
 static DEVICE_ATTR(partscan, 0444, partscan_show, NULL);
+static DEVICE_ATTR(distribute_write_streams, 0644,
+	disk_distribute_write_streams_show,
+	disk_distribute_write_streams_store);
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 ssize_t part_fail_show(struct device *dev,
@@ -1135,6 +1186,7 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_events_poll_msecs.attr,
 	&dev_attr_diskseq.attr,
 	&dev_attr_partscan.attr,
+	&dev_attr_distribute_write_streams.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 815ed33caa1b..a27dbb5589ce 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -245,8 +245,10 @@ static const struct attribute_group *part_attr_groups[] = {
 
 static void part_release(struct device *dev)
 {
-	put_disk(dev_to_bdev(dev)->bd_disk);
-	bdev_drop(dev_to_bdev(dev));
+	struct block_device *part = dev_to_bdev(dev);
+
+	put_disk(part->bd_disk);
+	bdev_drop(part);
 }
 
 static int part_uevent(const struct device *dev, struct kobj_uevent_env *env)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4ca3449ce9c9..02a3d58e814f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -74,6 +74,13 @@ struct block_device {
 #ifdef CONFIG_SECURITY
 	void			*bd_security;
 #endif
+
+	/*
+	 * Allow assigning write streams to partitions.
+	 */
+	unsigned short		bd_part_write_streams;
+	unsigned short		bd_part_write_stream_start;
+
 	/*
 	 * keep this out-of-line as it's both big and not needed in the fast
 	 * path
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9fda66530d9a..bb0921e642fb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1242,7 +1242,7 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev)
 static inline unsigned short bdev_max_write_streams(struct block_device *bdev)
 {
 	if (bdev_is_partition(bdev))
-		return 0;
+		return bdev->bd_part_write_streams;
 	return bdev_limits(bdev)->max_write_streams;
 }
 
-- 
2.45.2


      parent reply	other threads:[~2024-11-19 12:17 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-11-19 12:16 support block layer write streams and FDP Christoph Hellwig
2024-11-19 12:16 ` [PATCH 01/15] fs: add write stream information to statx Christoph Hellwig
2024-11-19 12:16 ` [PATCH 02/15] fs: add a write stream field to the kiocb Christoph Hellwig
2024-11-19 12:16 ` [PATCH 03/15] io_uring: enable passing a per-io write stream Christoph Hellwig
2024-11-19 12:16 ` [PATCH 04/15] block: don't bother checking the data direction for merges Christoph Hellwig
2024-11-19 12:16 ` [PATCH 05/15] block: req->bio is always set in the merge code Christoph Hellwig
2024-11-19 12:16 ` [PATCH 06/15] block: add a bi_write_stream field Christoph Hellwig
2024-11-19 12:16 ` [PATCH 07/15] block: introduce max_write_streams queue limit Christoph Hellwig
2024-11-19 12:16 ` [PATCH 08/15] block: introduce a write_stream_granularity " Christoph Hellwig
2024-11-19 12:16 ` [PATCH 09/15] block: expose write streams for block device nodes Christoph Hellwig
2024-11-19 12:16 ` [PATCH 10/15] nvme: store the endurance group id in struct nvme_ns_head Christoph Hellwig
2024-11-19 12:16 ` [PATCH 11/15] nvme: pass a void pointer to nvme_get/set_features for the result Christoph Hellwig
2024-11-19 12:16 ` [PATCH 12/15] nvme: add a nvme_get_log_lsi helper Christoph Hellwig
2024-11-19 12:16 ` [PATCH 13/15] nvme.h: add FDP definitions Christoph Hellwig
2024-11-19 12:16 ` [PATCH 14/15] nvme: enable FDP support Christoph Hellwig
2024-11-19 18:17   ` Keith Busch
2024-11-19 18:24     ` Christoph Hellwig
2024-11-19 22:49       ` Keith Busch
2024-11-20  6:03         ` Christoph Hellwig
2024-11-19 12:16 ` Christoph Hellwig [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    [email protected] \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox