* [PATCH 01/15] fs: add write stream information to statx
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
From: Keith Busch <[email protected]>
Add new statx fields to report the maximum number of write streams
supported and the granularity for them.
Signed-off-by: Keith Busch <[email protected]>
[hch: s/write_hint/write_stream/g, add granularity]
Signed-off-by: Christoph Hellwig <[email protected]>
---
fs/stat.c | 2 ++
include/linux/stat.h | 2 ++
include/uapi/linux/stat.h | 7 +++++--
3 files changed, 9 insertions(+), 2 deletions(-)
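For reference, a minimal userspace sketch of querying the new fields,
assuming a uapi statx.h that already carries the additions from this
patch (the fallback define matches the value below):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/stat.h>

	#ifndef STATX_WRITE_STREAM
	#define STATX_WRITE_STREAM	0x00020000U
	#endif

	int main(int argc, char **argv)
	{
		struct statx stx;

		if (argc < 2)
			return 1;
		if (syscall(SYS_statx, AT_FDCWD, argv[1], 0,
			    STATX_WRITE_STREAM, &stx)) {
			perror("statx");
			return 1;
		}
		if (stx.stx_mask & STATX_WRITE_STREAM)
			printf("write streams: max %u, granularity %u bytes\n",
			       stx.stx_write_stream_max,
			       stx.stx_write_stream_granularity);
		else
			printf("write streams not supported\n");
		return 0;
	}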
diff --git a/fs/stat.c b/fs/stat.c
index 41e598376d7e..aa2b7fa4a877 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -704,6 +704,8 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min;
tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max;
tmp.stx_atomic_write_segments_max = stat->atomic_write_segments_max;
+ tmp.stx_write_stream_granularity = stat->write_stream_granularity;
+ tmp.stx_write_stream_max = stat->write_stream_max;
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
diff --git a/include/linux/stat.h b/include/linux/stat.h
index 3d900c86981c..36d4dfb291ab 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -57,6 +57,8 @@ struct kstat {
u32 atomic_write_unit_min;
u32 atomic_write_unit_max;
u32 atomic_write_segments_max;
+ u32 write_stream_granularity;
+ u16 write_stream_max;
};
/* These definitions are internal to the kernel for now. Mainly used by nfsd. */
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 887a25286441..547c62a1a3a7 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -132,9 +132,11 @@ struct statx {
__u32 stx_atomic_write_unit_max; /* Max atomic write unit in bytes */
/* 0xb0 */
__u32 stx_atomic_write_segments_max; /* Max atomic write segment count */
- __u32 __spare1[1];
+ __u32 stx_write_stream_granularity;
/* 0xb8 */
- __u64 __spare3[9]; /* Spare space for future expansion */
+ __u16 stx_write_stream_max;
+ __u16 __spare2[3];
+ __u64 __spare3[8]; /* Spare space for future expansion */
/* 0x100 */
};
@@ -164,6 +166,7 @@ struct statx {
#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */
#define STATX_SUBVOL 0x00008000U /* Want/got stx_subvol */
#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */
+#define STATX_WRITE_STREAM 0x00020000U /* Want/got write_stream_* */
#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */
--
2.45.2
* [PATCH 02/15] fs: add a write stream field to the kiocb
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
Prepare for io_uring passthrough of write streams.
The write stream field in the kiocb structure fits into an existing
2-byte hole, so the size of the structure is not changed.
Based on a patch from Keith Busch <[email protected]>
Signed-off-by: Christoph Hellwig <[email protected]>
---
include/linux/fs.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4b5cad44a126..1997be247b6c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -370,6 +370,7 @@ struct kiocb {
void *private;
int ki_flags;
u16 ki_ioprio; /* See linux/ioprio.h */
+ u8 ki_write_stream;
union {
/*
* Only used for async buffered reads, where it denotes the
--
2.45.2
* [PATCH 03/15] io_uring: enable passing a per-io write stream
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
From: Kanchan Joshi <[email protected]>
Allow userspace to pass a per-I/O write stream in the SQE:

__u16 write_stream;

Applications can query the number of supported streams from the statx
stx_write_stream_max field. Unsupported values are ignored by
file operations that do not support write streams, and rejected
with an error by those that do.
Signed-off-by: Kanchan Joshi <[email protected]>
Signed-off-by: Nitesh Shetty <[email protected]>
Signed-off-by: Keith Busch <[email protected]>
[hch: s/write_hints/write_streams/g]
Signed-off-by: Christoph Hellwig <[email protected]>
---
include/uapi/linux/io_uring.h | 4 ++++
io_uring/io_uring.c | 2 ++
io_uring/rw.c | 2 +-
3 files changed, 7 insertions(+), 1 deletion(-)
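A sketch of the intended use with liburing, assuming headers built
against an io_uring.h that has the new write_stream field from this
patch (everything else is stock liburing):

	#include <errno.h>
	#include <liburing.h>

	/* Queue one write tagged with a placement stream; 0 means no stream. */
	static int write_with_stream(struct io_uring *ring, int fd,
			const void *buf, unsigned len, __u64 offset, __u16 stream)
	{
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

		if (!sqe)
			return -EAGAIN;
		io_uring_prep_write(sqe, fd, buf, len, offset);
		sqe->write_stream = stream;
		return io_uring_submit(ring);
	}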
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index aac9a4f8fa9a..7a6a1b3726d3 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -98,6 +98,10 @@ struct io_uring_sqe {
__u64 addr3;
__u64 __pad2[1];
};
+ struct {
+ __u64 __pad4[1];
+ __u16 write_stream;
+ };
__u64 optval;
/*
* If the ring is initialized with IORING_SETUP_SQE128, then
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index da8fd460977b..a54da2dd83a1 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3868,6 +3868,8 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
+ BUILD_BUG_SQE_ELEM(48, __u64, __pad4);
+ BUILD_BUG_SQE_ELEM(56, __u16, write_stream);
BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
diff --git a/io_uring/rw.c b/io_uring/rw.c
index cce8bc2ecd3f..88a5b5f65a9b 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -279,7 +279,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
rw->kiocb.ki_ioprio = get_current_ioprio();
}
rw->kiocb.dio_complete = NULL;
-
+ rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream);
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
rw->flags = READ_ONCE(sqe->rw_flags);
--
2.45.2
* [PATCH 04/15] block: don't bother checking the data direction for merges
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
The data direction is already encoded in the opcode, so there is no
need to check it separately.
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-merge.c | 7 -------
1 file changed, 7 deletions(-)
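For context, the direction is bit 0 of the opcode, so once req_op()
has been compared the direction checks removed below can never fire.
This is the existing helper from include/linux/blk_types.h that
rq_data_dir() and bio_data_dir() are built on:

	static inline bool op_is_write(blk_opf_t op)
	{
		return !!(op & 1);
	}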
diff --git a/block/blk-merge.c b/block/blk-merge.c
index e0b28e9298c9..64860cbd5e27 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -864,9 +864,6 @@ static struct request *attempt_merge(struct request_queue *q,
if (req_op(req) != req_op(next))
return NULL;
- if (rq_data_dir(req) != rq_data_dir(next))
- return NULL;
-
if (req->bio && next->bio) {
/* Don't merge requests with different write hints. */
if (req->bio->bi_write_hint != next->bio->bi_write_hint)
@@ -986,10 +983,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (req_op(rq) != bio_op(bio))
return false;
- /* different data direction or already started, don't merge */
- if (bio_data_dir(bio) != rq_data_dir(rq))
- return false;
-
/* don't merge across cgroup boundaries */
if (!blk_cgroup_mergeable(rq, bio))
return false;
--
2.45.2
* [PATCH 05/15] block: req->bio is always set in the merge code
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring, Dan Carpenter
As smatch, which is a lot smarter than me, noticed. So remove the
checks for it, and condense the remaining checks a bit, including
removing the comments stating the obvious.
Reported-by: Dan Carpenter <[email protected]>
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/blk-merge.c | 30 ++++++++----------------------
1 file changed, 8 insertions(+), 22 deletions(-)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 64860cbd5e27..e01383c6e534 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -864,14 +864,10 @@ static struct request *attempt_merge(struct request_queue *q,
if (req_op(req) != req_op(next))
return NULL;
- if (req->bio && next->bio) {
- /* Don't merge requests with different write hints. */
- if (req->bio->bi_write_hint != next->bio->bi_write_hint)
- return NULL;
- if (req->bio->bi_ioprio != next->bio->bi_ioprio)
- return NULL;
- }
-
+ if (req->bio->bi_write_hint != next->bio->bi_write_hint)
+ return NULL;
+ if (req->bio->bi_ioprio != next->bio->bi_ioprio)
+ return NULL;
if (!blk_atomic_write_mergeable_rqs(req, next))
return NULL;
@@ -983,26 +979,16 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (req_op(rq) != bio_op(bio))
return false;
- /* don't merge across cgroup boundaries */
if (!blk_cgroup_mergeable(rq, bio))
return false;
-
- /* only merge integrity protected bio into ditto rq */
if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
return false;
-
- /* Only merge if the crypt contexts are compatible */
if (!bio_crypt_rq_ctx_compatible(rq, bio))
return false;
-
- if (rq->bio) {
- /* Don't merge requests with different write hints. */
- if (rq->bio->bi_write_hint != bio->bi_write_hint)
- return false;
- if (rq->bio->bi_ioprio != bio->bi_ioprio)
- return false;
- }
-
+ if (rq->bio->bi_write_hint != bio->bi_write_hint)
+ return false;
+ if (rq->bio->bi_ioprio != bio->bi_ioprio)
+ return false;
if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
return false;
--
2.45.2
* [PATCH 06/15] block: add a bi_write_stream field
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
Add the ability to pass a write stream for placement control in the bio.
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/bio.c | 2 ++
block/blk-crypto-fallback.c | 1 +
block/blk-merge.c | 4 ++++
block/bounce.c | 1 +
include/linux/blk_types.h | 1 +
5 files changed, 9 insertions(+)
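A minimal sketch of an in-kernel submitter tagging a write with a
stream; the function and its parameters are illustrative, stream
numbers are 1-based and 0 means no stream:

	static void submit_stream_write(struct block_device *bdev,
			struct page *page, sector_t sector, u8 stream)
	{
		struct bio *bio = bio_alloc(bdev, 1, REQ_OP_WRITE, GFP_NOIO);

		bio->bi_iter.bi_sector = sector;
		bio->bi_write_stream = stream;	/* placement hint from this patch */
		__bio_add_page(bio, page, PAGE_SIZE, 0);
		submit_bio(bio);
	}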
diff --git a/block/bio.c b/block/bio.c
index 699a78c85c75..2aa86edc7cd6 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
bio->bi_flags = 0;
bio->bi_ioprio = 0;
bio->bi_write_hint = 0;
+ bio->bi_write_stream = 0;
bio->bi_status = 0;
bio->bi_iter.bi_sector = 0;
bio->bi_iter.bi_size = 0;
@@ -827,6 +828,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
bio_set_flag(bio, BIO_CLONED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter = bio_src->bi_iter;
if (bio->bi_bdev) {
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index 29a205482617..66762243a886 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -173,6 +173,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index e01383c6e534..1e5327fb6c45 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -866,6 +866,8 @@ static struct request *attempt_merge(struct request_queue *q,
if (req->bio->bi_write_hint != next->bio->bi_write_hint)
return NULL;
+ if (req->bio->bi_write_stream != next->bio->bi_write_stream)
+ return NULL;
if (req->bio->bi_ioprio != next->bio->bi_ioprio)
return NULL;
if (!blk_atomic_write_mergeable_rqs(req, next))
@@ -987,6 +989,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
return false;
if (rq->bio->bi_write_hint != bio->bi_write_hint)
return false;
+ if (rq->bio->bi_write_stream != bio->bi_write_stream)
+ return false;
if (rq->bio->bi_ioprio != bio->bi_ioprio)
return false;
if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
diff --git a/block/bounce.c b/block/bounce.c
index 0d898cd5ec49..fb8f60f114d7 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -170,6 +170,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dce7615c35e7..4ca3449ce9c9 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -220,6 +220,7 @@ struct bio {
unsigned short bi_flags; /* BIO_* below */
unsigned short bi_ioprio;
enum rw_hint bi_write_hint;
+ u8 bi_write_stream;
blk_status_t bi_status;
atomic_t __bi_remaining;
--
2.45.2
* [PATCH 07/15] block: introduce max_write_streams queue limit
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
From: Keith Busch <[email protected]>
Drivers with hardware that support write streams need a way to export how
many are available so applications can generically query this.
Note: compared to Keith's original version this does not automatically
stack the limit. There is no good way to generically stack write
streams. For mirroring or striping just mirroring the write streams
will work, but for anything more complex the stacking driver actually
needs to manage them.
Signed-off-by: Keith Busch <[email protected]>
[hch: renamed from max_write_hints to max_write_streams]
Signed-off-by: Christoph Hellwig <[email protected]>
---
Documentation/ABI/stable/sysfs-block | 7 +++++++
block/blk-sysfs.c | 3 +++
include/linux/blkdev.h | 9 +++++++++
3 files changed, 19 insertions(+)
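A sketch of how a driver would advertise the limit at disk allocation
time; the stream count coming from device discovery is an assumption
here:

	static struct gendisk *example_alloc_disk(struct blk_mq_tag_set *set,
			unsigned short nr_streams)
	{
		struct queue_limits lim = {
			.max_write_streams	= nr_streams,
		};

		return blk_mq_alloc_disk(set, &lim, NULL);
	}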
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 835361110715..ae8644726422 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -506,6 +506,13 @@ Description:
[RO] Maximum size in bytes of a single element in a DMA
scatter/gather list.
+What: /sys/block/<disk>/queue/max_write_streams
+Date: November 2024
+Contact: [email protected]
+Description:
+ [RO] Maximum number of write streams supported, 0 if not
+ supported. If supported, valid values are 1 through
+ max_write_streams, inclusive.
What: /sys/block/<disk>/queue/max_segments
Date: March 2010
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 4241aea84161..c514c0cb5e93 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -104,6 +104,7 @@ QUEUE_SYSFS_LIMIT_SHOW(max_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_segment_size)
+QUEUE_SYSFS_LIMIT_SHOW(max_write_streams)
QUEUE_SYSFS_LIMIT_SHOW(logical_block_size)
QUEUE_SYSFS_LIMIT_SHOW(physical_block_size)
QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors)
@@ -446,6 +447,7 @@ QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
QUEUE_RO_ENTRY(queue_max_segments, "max_segments");
QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size");
+QUEUE_RO_ENTRY(queue_max_write_streams, "max_write_streams");
QUEUE_RW_LOAD_MODULE_ENTRY(elv_iosched, "scheduler");
QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size");
@@ -580,6 +582,7 @@ static struct attribute *queue_attrs[] = {
&queue_max_discard_segments_entry.attr,
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
+ &queue_max_write_streams_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
&queue_physical_block_size_entry.attr,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a1fd0ddce5cf..202e1becd410 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -395,6 +395,8 @@ struct queue_limits {
unsigned short max_integrity_segments;
unsigned short max_discard_segments;
+ unsigned short max_write_streams;
+
unsigned int max_open_zones;
unsigned int max_active_zones;
@@ -1236,6 +1238,13 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev)
return queue_max_segments(bdev_get_queue(bdev));
}
+static inline unsigned short bdev_max_write_streams(struct block_device *bdev)
+{
+ if (bdev_is_partition(bdev))
+ return 0;
+ return bdev_limits(bdev)->max_write_streams;
+}
+
static inline unsigned queue_logical_block_size(const struct request_queue *q)
{
return q->limits.logical_block_size;
--
2.45.2
* [PATCH 08/15] block: introduce a write_stream_granularity queue limit
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
Export the granularity at which write streams should be discarded,
as it is essential for making good use of them.
Signed-off-by: Christoph Hellwig <[email protected]>
---
Documentation/ABI/stable/sysfs-block | 8 ++++++++
block/blk-sysfs.c | 3 +++
include/linux/blkdev.h | 7 +++++++
3 files changed, 18 insertions(+)
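A sketch of the arithmetic a user of the streams would apply, rounding
deallocations down to whole reclaim units; the helper name is
hypothetical:

	/* Trim a byte count to a multiple of the advertised granularity. */
	static u64 stream_trim_to_granularity(u64 bytes, u32 granularity)
	{
		if (!granularity)
			return bytes;
		return bytes - bytes % granularity;
	}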
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index ae8644726422..9f2a3005c41c 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -514,6 +514,14 @@ Description:
supported. If supported, valid values are 1 through
max_write_streams, inclusive.
+What: /sys/block/<disk>/queue/write_stream_granularity
+Date: November 2024
+Contact: [email protected]
+Description:
+ [RO] Granularity of a write stream in bytes. The granularity
+ of a write stream is the size that should be discarded or
+ overwritten together to avoid write amplification in the device.
+
What: /sys/block/<disk>/queue/max_segments
Date: March 2010
Contact: [email protected]
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index c514c0cb5e93..525f4fa132cd 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -105,6 +105,7 @@ QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_segment_size)
QUEUE_SYSFS_LIMIT_SHOW(max_write_streams)
+QUEUE_SYSFS_LIMIT_SHOW(write_stream_granularity)
QUEUE_SYSFS_LIMIT_SHOW(logical_block_size)
QUEUE_SYSFS_LIMIT_SHOW(physical_block_size)
QUEUE_SYSFS_LIMIT_SHOW(chunk_sectors)
@@ -448,6 +449,7 @@ QUEUE_RO_ENTRY(queue_max_segments, "max_segments");
QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size");
QUEUE_RO_ENTRY(queue_max_write_streams, "max_write_streams");
+QUEUE_RO_ENTRY(queue_write_stream_granularity, "write_stream_granularity");
QUEUE_RW_LOAD_MODULE_ENTRY(elv_iosched, "scheduler");
QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size");
@@ -583,6 +585,7 @@ static struct attribute *queue_attrs[] = {
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
&queue_max_write_streams_entry.attr,
+ &queue_write_stream_granularity_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
&queue_physical_block_size_entry.attr,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 202e1becd410..9fda66530d9a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -396,6 +396,7 @@ struct queue_limits {
unsigned short max_discard_segments;
unsigned short max_write_streams;
+ unsigned int write_stream_granularity;
unsigned int max_open_zones;
unsigned int max_active_zones;
@@ -1245,6 +1246,12 @@ static inline unsigned short bdev_max_write_streams(struct block_device *bdev)
return bdev_limits(bdev)->max_write_streams;
}
+static inline unsigned int
+bdev_write_stream_granularity(struct block_device *bdev)
+{
+ return bdev_limits(bdev)->write_stream_granularity;
+}
+
static inline unsigned queue_logical_block_size(const struct request_queue *q)
{
return q->limits.logical_block_size;
--
2.45.2
* [PATCH 09/15] block: expose write streams for block device nodes
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
Export statx information about the number and granularity of write
streams, use the per-kiocb write stream, and map temperature hints
to write streams (which is a bit questionable, but this shows how it is
done).
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/bdev.c | 6 ++++++
block/fops.c | 23 +++++++++++++++++++++++
2 files changed, 29 insertions(+)
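For the hint-mapping path, a userspace sketch: a temperature hint set
with fcntl() becomes the stream index for block device writes when no
per-I/O stream is given. The fallback defines match the uapi values;
the helper name is illustrative:

	#include <fcntl.h>
	#include <stdint.h>

	#ifndef F_SET_RW_HINT
	#define F_SET_RW_HINT		1036	/* F_LINUX_SPECIFIC_BASE + 12 */
	#endif
	#ifndef RWH_WRITE_LIFE_SHORT
	#define RWH_WRITE_LIFE_SHORT	2
	#endif

	static int mark_short_lived(int fd)
	{
		uint64_t hint = RWH_WRITE_LIFE_SHORT; /* maps to write stream 2 */

		return fcntl(fd, F_SET_RW_HINT, &hint);
	}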
diff --git a/block/bdev.c b/block/bdev.c
index 738e3c8457e7..c23245f1fdfe 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -1296,6 +1296,12 @@ void bdev_statx(struct path *path, struct kstat *stat,
stat->result_mask |= STATX_DIOALIGN;
}
+ if ((request_mask & STATX_WRITE_STREAM) && bdev_max_write_streams(bdev)) {
+ stat->write_stream_max = bdev_max_write_streams(bdev);
+ stat->write_stream_granularity = bdev_write_stream_granularity(bdev);
+ stat->result_mask |= STATX_WRITE_STREAM;
+ }
+
if (request_mask & STATX_WRITE_ATOMIC && bdev_can_atomic_write(bdev)) {
struct request_queue *bd_queue = bdev->bd_queue;
diff --git a/block/fops.c b/block/fops.c
index 2d01c9007681..2a860dbe5e48 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -72,6 +72,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
}
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio.bi_write_stream = iocb->ki_write_stream;
bio.bi_ioprio = iocb->ki_ioprio;
if (iocb->ki_flags & IOCB_ATOMIC)
bio.bi_opf |= REQ_ATOMIC;
@@ -201,6 +202,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
for (;;) {
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -317,6 +319,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
dio->iocb = iocb;
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
+ bio->bi_write_stream = iocb->ki_write_stream;
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -373,6 +376,26 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (blkdev_dio_invalid(bdev, iocb, iter))
return -EINVAL;
+ if (iov_iter_rw(iter) == WRITE) {
+ u16 max_write_streams = bdev_max_write_streams(bdev);
+
+ if (iocb->ki_write_stream) {
+ if (iocb->ki_write_stream > max_write_streams)
+ return -EINVAL;
+ } else if (max_write_streams) {
+ enum rw_hint write_hint =
+ file_inode(iocb->ki_filp)->i_write_hint;
+
+ /*
+ * Just use the write hint as write stream for block
+ * device writes. This assumes no file system is
+ * mounted that would use the streams differently.
+ */
+ if (write_hint <= max_write_streams)
+ iocb->ki_write_stream = write_hint;
+ }
+ }
+
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (likely(nr_pages <= BIO_MAX_VECS)) {
if (is_sync_kiocb(iocb))
--
2.45.2
* [PATCH 10/15] nvme: store the endurance group id in struct nvme_ns_head
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
The FDP code needs this.
Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/nvme/host/core.c | 4 ++++
drivers/nvme/host/nvme.h | 2 ++
2 files changed, 6 insertions(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1a8d32a4a5c3..d194b36b08ac 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -38,6 +38,7 @@ struct nvme_ns_info {
u32 nsid;
__le32 anagrpid;
u8 pi_offset;
+ u16 endgid;
bool is_shared;
bool is_readonly;
bool is_ready;
@@ -1600,6 +1601,7 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
}
info->anagrpid = id->anagrpid;
+ info->endgid = le16_to_cpu(id->endgid);
info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
info->is_ready = true;
@@ -1638,6 +1640,7 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
if (!ret) {
info->anagrpid = id->anagrpid;
+ info->endgid = le16_to_cpu(id->endgid);
info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
info->is_ready = id->nstat & NVME_NSTAT_NRDY;
@@ -3644,6 +3647,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
head->ids = info->ids;
head->shared = info->is_shared;
head->rotational = info->is_rotational;
+ head->endgid = info->endgid;
ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
kref_init(&head->ref);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 900719c4c70c..9b916a904f00 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -491,6 +491,8 @@ struct nvme_ns_head {
struct device cdev_device;
struct gendisk *disk;
+
+ u16 endgid;
#ifdef CONFIG_NVME_MULTIPATH
struct bio_list requeue_list;
spinlock_t requeue_lock;
--
2.45.2
* [PATCH 11/15] nvme: pass a void pointer to nvme_get/set_features for the result
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
This allows passing in structures instead of the u32 result, and thus
reduces the amount of bit shifting and masking required to parse the
result.
Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/nvme/host/core.c | 4 ++--
drivers/nvme/host/nvme.h | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
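With this, a 4-byte features result can be parsed through a matching
structure, as the FDP code later in the series does; a sketch with an
illustrative helper:

	static int fdp_enabled(struct nvme_ctrl *ctrl, u16 endgid)
	{
		struct nvme_fdp_config result;	/* 4 bytes, from patch 13 */
		int error;

		error = nvme_get_features(ctrl, NVME_FEAT_FDP, endgid, NULL, 0,
					  &result);
		if (error)
			return error;
		return !!(result.flags & FDPCFG_FDPE);
	}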
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d194b36b08ac..0d058276845b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1671,7 +1671,7 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result)
+ void *result)
{
return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
buflen, result);
@@ -1680,7 +1680,7 @@ EXPORT_SYMBOL_GPL(nvme_set_features);
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result)
+ void *result)
{
return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
buflen, result);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9b916a904f00..8cea8416b0d2 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -892,10 +892,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
int qid, nvme_submit_flags_t flags);
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result);
+ void *result);
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result);
+ void *result);
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
--
2.45.2
* [PATCH 12/15] nvme: add a nvme_get_log_lsi helper
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
Add a helper for log pages that need to pass in an LSI value, while at
the same time not touching all the existing nvme_get_log callers.
Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/nvme/host/core.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0d058276845b..b61225201b47 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -151,6 +151,8 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid);
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
struct nvme_command *cmd);
+static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
+ u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi);
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
@@ -3069,8 +3071,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
return ret;
}
-int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
- void *log, size_t size, u64 offset)
+static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
+ u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi)
{
struct nvme_command c = { };
u32 dwlen = nvme_bytes_to_numd(size);
@@ -3084,10 +3086,18 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
c.get_log_page.csi = csi;
+ c.get_log_page.lsi = cpu_to_le16(lsi);
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
+ void *log, size_t size, u64 offset)
+{
+ return nvme_get_log_lsi(ctrl, nsid, log_page, lsp, csi, log, size,
+ offset, 0);
+}
+
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
struct nvme_effects_log **log)
{
--
2.45.2
* [PATCH 13/15] nvme.h: add FDP definitions
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
Add the config feature result, config log page, and management receive
commands needed for FDP.
Partially based on a patch from Kanchan Joshi <[email protected]>.
Signed-off-by: Christoph Hellwig <[email protected]>
---
include/linux/nvme.h | 77 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 77 insertions(+)
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 0a6e22038ce3..0c4a81bf878e 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -275,6 +275,7 @@ enum nvme_ctrl_attr {
NVME_CTRL_ATTR_HID_128_BIT = (1 << 0),
NVME_CTRL_ATTR_TBKAS = (1 << 6),
NVME_CTRL_ATTR_ELBAS = (1 << 15),
+ NVME_CTRL_ATTR_FDPS = (1 << 19),
};
struct nvme_id_ctrl {
@@ -656,6 +657,44 @@ struct nvme_rotational_media_log {
__u8 rsvd24[488];
};
+struct nvme_fdp_config {
+ __u8 flags;
+#define FDPCFG_FDPE (1U << 0)
+ __u8 fdpcidx;
+ __le16 reserved;
+};
+
+struct nvme_fdp_ruh_desc {
+ __u8 ruht;
+ __u8 reserved[3];
+};
+
+struct nvme_fdp_config_desc {
+ __le16 size;
+ __u8 fdpa;
+ __u8 vss;
+ __le32 nrg;
+ __le16 nruh;
+ __le16 maxpids;
+ __le32 nnss;
+ __le64 runs;
+ __le32 erutl;
+ __u8 reserved[36];
+ struct nvme_fdp_ruh_desc ruhs[];
+};
+
+struct nvme_fdp_config_log {
+ __le16 n;
+ __u8 version;
+ __u8 reserved;
+ __le32 size;
+ __u8 reserved2[8];
+ /*
+ * This is followed by a variable number of nvme_fdp_config_desc
+ * structures, but sparse doesn't like nested variable sized arrays.
+ */
+};
+
struct nvme_smart_log {
__u8 critical_warning;
__u8 temperature[2];
@@ -882,6 +921,7 @@ enum nvme_opcode {
nvme_cmd_resv_register = 0x0d,
nvme_cmd_resv_report = 0x0e,
nvme_cmd_resv_acquire = 0x11,
+ nvme_cmd_io_mgmt_recv = 0x12,
nvme_cmd_resv_release = 0x15,
nvme_cmd_zone_mgmt_send = 0x79,
nvme_cmd_zone_mgmt_recv = 0x7a,
@@ -903,6 +943,7 @@ enum nvme_opcode {
nvme_opcode_name(nvme_cmd_resv_register), \
nvme_opcode_name(nvme_cmd_resv_report), \
nvme_opcode_name(nvme_cmd_resv_acquire), \
+ nvme_opcode_name(nvme_cmd_io_mgmt_recv), \
nvme_opcode_name(nvme_cmd_resv_release), \
nvme_opcode_name(nvme_cmd_zone_mgmt_send), \
nvme_opcode_name(nvme_cmd_zone_mgmt_recv), \
@@ -1054,6 +1095,7 @@ enum {
NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
NVME_RW_PRINFO_PRACT = 1 << 13,
NVME_RW_DTYPE_STREAMS = 1 << 4,
+ NVME_RW_DTYPE_DPLCMT = 2 << 4,
NVME_WZ_DEAC = 1 << 9,
};
@@ -1141,6 +1183,38 @@ struct nvme_zone_mgmt_recv_cmd {
__le32 cdw14[2];
};
+struct nvme_io_mgmt_recv_cmd {
+ __u8 opcode;
+ __u8 flags;
+ __u16 command_id;
+ __le32 nsid;
+ __le64 rsvd2[2];
+ union nvme_data_ptr dptr;
+ __u8 mo;
+ __u8 rsvd11;
+ __u16 mos;
+ __le32 numd;
+ __le32 cdw12[4];
+};
+
+enum {
+ NVME_IO_MGMT_RECV_MO_RUHS = 1,
+};
+
+struct nvme_fdp_ruh_status_desc {
+ __le16 pid;
+ __le16 ruhid;
+ __le32 earutr;
+ __le64 ruamw;
+ __u8 reserved[16];
+};
+
+struct nvme_fdp_ruh_status {
+ __u8 rsvd0[14];
+ __le16 nruhsd;
+ struct nvme_fdp_ruh_status_desc ruhsd[];
+};
+
enum {
NVME_ZRA_ZONE_REPORT = 0,
NVME_ZRASF_ZONE_REPORT_ALL = 0,
@@ -1276,6 +1350,7 @@ enum {
NVME_FEAT_PLM_WINDOW = 0x14,
NVME_FEAT_HOST_BEHAVIOR = 0x16,
NVME_FEAT_SANITIZE = 0x17,
+ NVME_FEAT_FDP = 0x1d,
NVME_FEAT_SW_PROGRESS = 0x80,
NVME_FEAT_HOST_ID = 0x81,
NVME_FEAT_RESV_MASK = 0x82,
@@ -1296,6 +1371,7 @@ enum {
NVME_LOG_ANA = 0x0c,
NVME_LOG_FEATURES = 0x12,
NVME_LOG_RMI = 0x16,
+ NVME_LOG_FDP_CONFIGS = 0x20,
NVME_LOG_DISC = 0x70,
NVME_LOG_RESERVATION = 0x80,
NVME_FWACT_REPL = (0 << 3),
@@ -1883,6 +1959,7 @@ struct nvme_command {
struct nvmf_auth_receive_command auth_receive;
struct nvme_dbbuf dbbuf;
struct nvme_directive_cmd directive;
+ struct nvme_io_mgmt_recv_cmd imr;
};
};
--
2.45.2
* [PATCH 14/15] nvme: enable FDP support
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
Wire up the block level write streams to the NVMe Flexible Data Placement
(FDP) feature as ratified in TP 4146a.
Based on code from Kanchan Joshi <[email protected]>,
Hui Qi <[email protected]>, Nitesh Shetty <[email protected]> and
Keith Busch <[email protected]>, but a lot of it has been rewritten to
fit the block layer write stream infrastructure.
Signed-off-by: Christoph Hellwig <[email protected]>
---
drivers/nvme/host/core.c | 129 +++++++++++++++++++++++++++++++++++++++
drivers/nvme/host/nvme.h | 4 ++
2 files changed, 133 insertions(+)
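On the wire the mapping ends up in the write command as a placement
directive: DTYPE in the control field selects data placement, and the
reclaim unit handle's placement identifier goes into the DSPEC field
(CDW13 bits 31:16). A sketch of the encoding with an illustrative
helper, using the definitions from patch 13:

	static void example_fdp_encode(struct nvme_rw_command *rw, u16 plid)
	{
		/* DTYPE = 2 (data placement) in the control field */
		rw->control |= cpu_to_le16(NVME_RW_DTYPE_DPLCMT);
		/* DSPEC = placement identifier, CDW13 bits 31:16 */
		rw->dsmgmt |= cpu_to_le32((u32)plid << 16);
	}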
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b61225201b47..543bbe7de063 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -673,6 +673,7 @@ static void nvme_free_ns_head(struct kref *ref)
ida_free(&head->subsys->ns_ida, head->instance);
cleanup_srcu_struct(&head->srcu);
nvme_put_subsystem(head->subsys);
+ kfree(head->plids);
kfree(head);
}
@@ -990,6 +991,15 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+ if (op == nvme_cmd_write && ns->head->nr_plids &&
+ req->bio->bi_write_stream) {
+ u16 write_stream = req->bio->bi_write_stream;
+ if (WARN_ON_ONCE(write_stream > ns->head->nr_plids))
+ return BLK_STS_INVAL;
+ dsmgmt |= ns->head->plids[write_stream - 1] << 16;
+ control |= NVME_RW_DTYPE_DPLCMT;
+ }
+
if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
return BLK_STS_INVAL;
@@ -2142,6 +2152,107 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
return ret;
}
+static int nvme_read_fdp_config(struct nvme_ns *ns, struct nvme_ns_info *info)
+{
+ struct nvme_fdp_config result;
+ struct nvme_fdp_config_log *log;
+ struct nvme_fdp_config_desc *configs;
+ size_t log_size;
+ int error;
+
+ error = nvme_get_features(ns->ctrl, NVME_FEAT_FDP, info->endgid, NULL,
+ 0, &result);
+ if (error)
+ return error;
+
+ if (!(result.flags & FDPCFG_FDPE)) {
+ dev_warn(ns->ctrl->device, "FDP not enabled in current config\n");
+ return -EINVAL;
+ }
+
+ log_size = sizeof(*log) + (result.fdpcidx + 1) * sizeof(*configs);
+ log = kmalloc(log_size, GFP_KERNEL);
+ if (!log)
+ return -ENOMEM;
+
+ error = nvme_get_log_lsi(ns->ctrl, info->nsid, NVME_LOG_FDP_CONFIGS,
+ 0, 0, log, log_size, 0, info->endgid);
+ if (error) {
+ dev_warn(ns->ctrl->device,
+ "failed to read FDP config log: 0x%x\n", error);
+ goto out_free_log;
+ }
+
+ if (le32_to_cpu(log->size) < log_size) {
+ dev_warn(ns->ctrl->device, "FDP log too small: %d vs %zd\n",
+ le32_to_cpu(log->size), log_size);
+ error = -EINVAL;
+ goto out_free_log;
+ }
+
+ configs = (struct nvme_fdp_config_desc *)(log + 1);
+ if (le32_to_cpu(configs[result.fdpcidx].nrg) > 1) {
+ dev_warn(ns->ctrl->device, "FDP NRG > 1 not supported\n");
+ return -EINVAL;
+ }
+ ns->head->runs = le64_to_cpu(configs[result.fdpcidx].runs);
+
+out_free_log:
+ kfree(log);
+ return error;
+}
+
+static int nvme_fetch_fdp_plids(struct nvme_ns *ns, u32 nsid)
+{
+ struct nvme_ns_head *head = ns->head;
+ struct nvme_fdp_ruh_status *ruhs;
+ const unsigned int max_nr_plids = S8_MAX - 1;
+ size_t size = struct_size(ruhs, ruhsd, max_nr_plids);
+ struct nvme_command c = {
+ .imr.opcode = nvme_cmd_io_mgmt_recv,
+ .imr.nsid = cpu_to_le32(nsid),
+ .imr.mo = NVME_IO_MGMT_RECV_MO_RUHS,
+ .imr.numd = cpu_to_le32(nvme_bytes_to_numd(size)),
+ };
+ int ret, i;
+
+ ruhs = kzalloc(size, GFP_KERNEL);
+ if (!ruhs)
+ return -ENOMEM;
+
+ ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
+ if (ret) {
+ dev_warn(ns->ctrl->device,
+ "failed to read FDP reclaim unit handles: 0x%x\n", ret);
+ goto out;
+ }
+
+ ns->head->nr_plids = le16_to_cpu(ruhs->nruhsd);
+ if (!ns->head->nr_plids)
+ goto out;
+
+ if (ns->head->nr_plids > max_nr_plids) {
+ dev_info(ns->ctrl->device,
+ "capping max write streams from %d to %d\n",
+ ns->head->nr_plids, max_nr_plids);
+ ns->head->nr_plids = max_nr_plids;
+ }
+
+ head->plids = kcalloc(ns->head->nr_plids, sizeof(*head->plids),
+ GFP_KERNEL);
+ if (!head->plids) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < ns->head->nr_plids; i++)
+ head->plids[i] = le16_to_cpu(ruhs->ruhsd[i].pid);
+
+out:
+ kfree(ruhs);
+ return ret;
+}
+
static int nvme_update_ns_info_block(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
@@ -2178,6 +2289,18 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
goto out;
}
+ if (!(ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS)) {
+ ns->head->nr_plids = 0;
+ kfree(ns->head->plids);
+ ns->head->plids = NULL;
+ } else if (!ns->head->plids) {
+ ret = nvme_read_fdp_config(ns, info);
+ if (!ret)
+ ret = nvme_fetch_fdp_plids(ns, info->nsid);
+ if (ret < 0)
+ goto out;
+ }
+
blk_mq_freeze_queue(ns->disk->queue);
ns->head->lba_shift = id->lbaf[lbaf].ds;
ns->head->nuse = le64_to_cpu(id->nuse);
@@ -2211,6 +2334,10 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (!nvme_init_integrity(ns->head, &lim, info))
capacity = 0;
+ lim.max_write_streams = ns->head->nr_plids;
+ if (lim.max_write_streams)
+ lim.write_stream_granularity = ns->head->runs;
+
ret = queue_limits_commit_update(ns->disk->queue, &lim);
if (ret) {
blk_mq_unfreeze_queue(ns->disk->queue);
@@ -2313,6 +2440,8 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
ns->head->disk->flags |= GENHD_FL_HIDDEN;
else
nvme_init_integrity(ns->head, &lim, info);
+ lim.max_write_streams = ns_lim->max_write_streams;
+ lim.write_stream_granularity = ns_lim->write_stream_granularity;
ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 8cea8416b0d2..f10aa0cb6df5 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -493,6 +493,10 @@ struct nvme_ns_head {
struct gendisk *disk;
u16 endgid;
+ u16 nr_plids;
+ u16 *plids;
+ u64 runs;
+
#ifdef CONFIG_NVME_MULTIPATH
struct bio_list requeue_list;
spinlock_t requeue_lock;
--
2.45.2
* Re: [PATCH 14/15] nvme: enable FDP support
From: Keith Busch @ 2024-11-19 18:17 UTC
To: Christoph Hellwig
Cc: Jens Axboe, Christian Brauner, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
On Tue, Nov 19, 2024 at 01:16:28PM +0100, Christoph Hellwig wrote:
> +static int nvme_read_fdp_config(struct nvme_ns *ns, struct nvme_ns_info *info)
> +{
> + struct nvme_fdp_config result;
> + struct nvme_fdp_config_log *log;
> + struct nvme_fdp_config_desc *configs;
> + size_t log_size;
> + int error;
> +
> + error = nvme_get_features(ns->ctrl, NVME_FEAT_FDP, info->endgid, NULL,
> + 0, &result);
> + if (error)
> + return error;
> +
> + if (!(result.flags & FDPCFG_FDPE)) {
> + dev_warn(ns->ctrl->device, "FDP not enabled in current config\n");
> + return -EINVAL;
> + }
> +
> + log_size = sizeof(*log) + (result.fdpcidx + 1) * sizeof(*configs);
> + log = kmalloc(log_size, GFP_KERNEL);
> + if (!log)
> + return -ENOMEM;
> +
> + error = nvme_get_log_lsi(ns->ctrl, info->nsid, NVME_LOG_FDP_CONFIGS,
> + 0, 0, log, log_size, 0, info->endgid);
> + if (error) {
> + dev_warn(ns->ctrl->device,
> + "failed to read FDP config log: 0x%x\n", error);
> + goto out_free_log;
> + }
> +
> + if (le32_to_cpu(log->size) < log_size) {
> + dev_warn(ns->ctrl->device, "FDP log too small: %d vs %zd\n",
> + le32_to_cpu(log->size), log_size);
> + error = -EINVAL;
> + goto out_free_log;
> + }
> +
> + configs = (struct nvme_fdp_config_desc *)(log + 1);
> + if (le32_to_cpu(configs[result.fdpcidx].nrg) > 1) {
> + dev_warn(ns->ctrl->device, "FDP NRG > 1 not supported\n");
Why not support multiple reclaim groups?
> + return -EINVAL;
> + }
> + ns->head->runs = le64_to_cpu(configs[result.fdpcidx].runs);
The config descriptors are variable length, so you can't just index into
the array. You have to walk each descriptor individually to get the next
one's offset.
Something like:
struct nvme_fdp_config_desc *configs;
void *l;
int i;
...
	l = log + 1;
	configs = l;
	for (i = 0; i < result.fdpcidx; i++) {
		l += le16_to_cpu(configs->size);
		configs = l;
	}
	ns->head->runs = le64_to_cpu(configs->runs);
* Re: [PATCH 14/15] nvme: enable FDP support
From: Christoph Hellwig @ 2024-11-19 18:24 UTC
To: Keith Busch
Cc: Christoph Hellwig, Jens Axboe, Christian Brauner, Sagi Grimberg,
Kanchan Joshi, Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov,
linux-block, linux-kernel, linux-nvme, linux-fsdevel, io-uring
On Tue, Nov 19, 2024 at 11:17:36AM -0700, Keith Busch wrote:
> > + if (le32_to_cpu(configs[result.fdpcidx].nrg) > 1) {
> > + dev_warn(ns->ctrl->device, "FDP NRG > 1 not supported\n");
>
> Why not support multiple reclaim groups?
Can you come up with a sane API for that? And can you find devices in
the wild that actually support it?
> > + ns->head->runs = le64_to_cpu(configs[result.fdpcidx].runs);
>
> The config descriptors are variable length, so you can't just index into
> it. You have to read each index individually to get the next index's offset.
> Something like:
Indeed. The current code only works when the first config is selected.
* Re: [PATCH 14/15] nvme: enable FDP support
From: Keith Busch @ 2024-11-19 22:49 UTC
To: Christoph Hellwig
Cc: Jens Axboe, Christian Brauner, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
On Tue, Nov 19, 2024 at 07:24:27PM +0100, Christoph Hellwig wrote:
> On Tue, Nov 19, 2024 at 11:17:36AM -0700, Keith Busch wrote:
> > > + if (le32_to_cpu(configs[result.fdpcidx].nrg) > 1) {
> > > + dev_warn(ns->ctrl->device, "FDP NRG > 1 not supported\n");
> >
> > Why not support multiple reclaim groups?
>
> Can you come up with a sane API for that?
Haven't really thought about it. If it's there, it's probably useful for
RU's that are not "Persistently Isolated". But let's not worry about it
now, we can just say you don't get to use write streams for these.
> And can you find devices in
> the wild that actually support it?
I haven't come across any, no.
But a more general point about the return codes for pretty much all the
errors here.
They'll prevent the namespace from being visible, but I think you just
want to set the limits to disable write streams instead. Otherwise it'd
be a regression since namespaces configured this way are currently
usable.
* Re: [PATCH 14/15] nvme: enable FDP support
From: Christoph Hellwig @ 2024-11-20 6:03 UTC
To: Keith Busch
Cc: Christoph Hellwig, Jens Axboe, Christian Brauner, Sagi Grimberg,
Kanchan Joshi, Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov,
linux-block, linux-kernel, linux-nvme, linux-fsdevel, io-uring
On Tue, Nov 19, 2024 at 03:49:14PM -0700, Keith Busch wrote:
> But more about the return codes for pretty much all the errors here.
> They'll prevent the namespace from being visible, but I think you just
> want to set the limits to disable write streams instead. Otherwise it'd
> be a regression since namespaces configured this way are currently
> usable.
True, we should probably just log an error and continue here. I'll
update it for the next version.
* [PATCH 15/15] RFC: block: allow write streams on partitions
From: Christoph Hellwig @ 2024-11-19 12:16 UTC
To: Jens Axboe
Cc: Christian Brauner, Keith Busch, Sagi Grimberg, Kanchan Joshi,
Hui Qi, Nitesh Shetty, Jan Kara, Pavel Begunkov, linux-block,
linux-kernel, linux-nvme, linux-fsdevel, io-uring
By default assign all write streams to partition 1, and add a hacky
sysfs file that distributes them equally across all partitions.
This is implemented by storing the number of per-partition write
streams in struct block_device, as well as the offset into the global
ones, and then remapping the write streams in the I/O submission
path.
The sysfs file is hacky and undocumented; better suggestions are
welcome from actual users of write streams on partitions.
Signed-off-by: Christoph Hellwig <[email protected]>
---
block/bdev.c | 9 +++++++
block/blk-core.c | 2 ++
block/genhd.c | 52 +++++++++++++++++++++++++++++++++++++++
block/partitions/core.c | 6 +++--
include/linux/blk_types.h | 7 ++++++
include/linux/blkdev.h | 2 +-
6 files changed, 75 insertions(+), 3 deletions(-)
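The remapping itself is simple arithmetic; a stand-alone sketch of what
blk_partition_remap() now does with the per-partition window (the
helper is illustrative):

	/* Translate a partition-relative stream into the device-wide space. */
	static u8 part_stream_to_disk(struct block_device *part, u8 stream)
	{
		if (!stream)
			return 0;	/* no stream requested */
		return stream + part->bd_part_write_stream_start;
	}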
diff --git a/block/bdev.c b/block/bdev.c
index c23245f1fdfe..f3549a8cdb3f 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -440,6 +440,15 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
return NULL;
}
bdev->bd_disk = disk;
+
+ /*
+ * Assign all write streams to the first partition by default.
+ */
+ if (partno == 1) {
+ bdev->bd_part_write_stream_start = 0;
+ bdev->bd_part_write_streams = bdev_limits(bdev)->max_write_streams;
+ }
+
return bdev;
}
diff --git a/block/blk-core.c b/block/blk-core.c
index 666efe8fa202..9654937f9b2d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -574,6 +574,8 @@ static int blk_partition_remap(struct bio *bio)
return -EIO;
if (bio_sectors(bio)) {
bio->bi_iter.bi_sector += p->bd_start_sect;
+ if (bio->bi_write_stream)
+ bio->bi_write_stream += p->bd_part_write_stream_start;
trace_block_bio_remap(bio, p->bd_dev,
bio->bi_iter.bi_sector -
p->bd_start_sect);
diff --git a/block/genhd.c b/block/genhd.c
index 79230c109fca..3156c70522b6 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1070,6 +1070,54 @@ static ssize_t partscan_show(struct device *dev,
return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
}
+static ssize_t disk_distribute_write_streams_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ /* Anything useful to show here like the ranges? */
+ return sysfs_emit(buf, "0\n");
+}
+
+static ssize_t disk_distribute_write_streams_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct block_device *bdev = disk->part0, *part;
+ unsigned short total_write_streams =
+ disk->queue->limits.max_write_streams;
+ unsigned short part_write_streams, part_write_stream_start = 0;
+ unsigned long nr_partitions = 0, idx;
+ int error = 0;
+
+ if (!total_write_streams)
+ return -EINVAL;
+
+ mutex_lock(&disk->open_mutex);
+ if (atomic_read(&bdev->bd_openers)) {
+ error = -EBUSY;
+ goto out_unlock;
+ }
+
+ xa_for_each_start(&disk->part_tbl, idx, part, 1)
+ nr_partitions++;
+ if (!nr_partitions)
+ goto out_unlock;
+
+ part_write_streams = total_write_streams / nr_partitions;
+ xa_for_each_start(&disk->part_tbl, idx, part, 1) {
+ part->bd_part_write_streams = part_write_streams;
+ part->bd_part_write_stream_start = part_write_stream_start;
+ dev_info(dev,
+ "assigning %u write streams at %u to partition %lu\n",
+ part_write_streams, part_write_stream_start, idx - 1);
+ part_write_stream_start += part_write_streams;
+ }
+out_unlock:
+ mutex_unlock(&disk->open_mutex);
+ if (error)
+ return error;
+ return count;
+}
+
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
@@ -1084,6 +1132,9 @@ static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
static DEVICE_ATTR(partscan, 0444, partscan_show, NULL);
+static DEVICE_ATTR(distribute_write_streams, 0644,
+ disk_distribute_write_streams_show,
+ disk_distribute_write_streams_store);
#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
@@ -1135,6 +1186,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_events_poll_msecs.attr,
&dev_attr_diskseq.attr,
&dev_attr_partscan.attr,
+ &dev_attr_distribute_write_streams.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 815ed33caa1b..a27dbb5589ce 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -245,8 +245,10 @@ static const struct attribute_group *part_attr_groups[] = {
static void part_release(struct device *dev)
{
- put_disk(dev_to_bdev(dev)->bd_disk);
- bdev_drop(dev_to_bdev(dev));
+ struct block_device *part = dev_to_bdev(dev);
+
+ put_disk(part->bd_disk);
+ bdev_drop(part);
}
static int part_uevent(const struct device *dev, struct kobj_uevent_env *env)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4ca3449ce9c9..02a3d58e814f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -74,6 +74,13 @@ struct block_device {
#ifdef CONFIG_SECURITY
void *bd_security;
#endif
+
+ /*
+ * Allow assigning write streams to partitions.
+ */
+ unsigned short bd_part_write_streams;
+ unsigned short bd_part_write_stream_start;
+
/*
* keep this out-of-line as it's both big and not needed in the fast
* path
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9fda66530d9a..bb0921e642fb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1242,7 +1242,7 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev)
static inline unsigned short bdev_max_write_streams(struct block_device *bdev)
{
if (bdev_is_partition(bdev))
- return 0;
+ return bdev->bd_part_write_streams;
return bdev_limits(bdev)->max_write_streams;
}
--
2.45.2