* [PATCHv2] blk-mq: remove hybrid polling
@ 2023-03-20 19:49 Keith Busch
2023-03-20 20:56 ` Keith Busch
2023-03-20 20:59 ` Jens Axboe
0 siblings, 2 replies; 4+ messages in thread
From: Keith Busch @ 2023-03-20 19:49 UTC (permalink / raw)
To: linux-block, Jens Axboe, io-uring; +Cc: Pavel Begunkov, Keith Busch
From: Keith Busch <[email protected]>
io_uring provides the only way user space can poll completions, and that
always sets BLK_POLL_NOSLEEP. This effectively makes hybrid polling dead
code, so remove it and everything supporting it.
Signed-off-by: Keith Busch <[email protected]>
---
v1->v2:
Make the queue_io_poll_delay_store() a no-op so that anyone using
it won't see errors.
Updated Documentation with the new description for the io_poll_delay
queue attribute.
Documentation/ABI/stable/sysfs-block | 15 +-
block/blk-core.c | 6 -
block/blk-mq-debugfs.c | 26 ----
block/blk-mq.c | 205 +--------------------------
block/blk-stat.c | 18 ---
block/blk-sysfs.c | 25 +---
include/linux/blk-mq.h | 2 -
include/linux/blkdev.h | 12 --
io_uring/rw.c | 2 +-
9 files changed, 12 insertions(+), 299 deletions(-)
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index 282de3680367d..c57e5b7cb5326 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -336,18 +336,11 @@ What: /sys/block/<disk>/queue/io_poll_delay
Date: November 2016
Contact: [email protected]
Description:
- [RW] If polling is enabled, this controls what kind of polling
- will be performed. It defaults to -1, which is classic polling.
+ [RW] This was used to control what kind of polling will be
+ performed. It is now fixed to -1, which is classic polling.
In this mode, the CPU will repeatedly ask for completions
- without giving up any time. If set to 0, a hybrid polling mode
- is used, where the kernel will attempt to make an educated guess
- at when the IO will complete. Based on this guess, the kernel
- will put the process issuing IO to sleep for an amount of time,
- before entering a classic poll loop. This mode might be a little
- slower than pure classic polling, but it will be more efficient.
- If set to a value larger than 0, the kernel will put the process
- issuing IO to sleep for this amount of microseconds before
- entering classic polling.
+ without giving up any time.
+ <deprecated>
What: /sys/block/<disk>/queue/io_timeout
diff --git a/block/blk-core.c b/block/blk-core.c
index 9e5e0277a4d95..269765d16cfd9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -263,13 +263,7 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
static void blk_free_queue(struct request_queue *q)
{
- if (q->poll_stat)
- blk_stat_remove_callback(q, q->poll_cb);
- blk_stat_free_callback(q->poll_cb);
-
blk_free_queue_stats(q->stats);
- kfree(q->poll_stat);
-
if (queue_is_mq(q))
blk_mq_release(q);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index b01818f8e216e..212a7f301e730 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -15,33 +15,8 @@
#include "blk-mq-tag.h"
#include "blk-rq-qos.h"
-static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
-{
- if (stat->nr_samples) {
- seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu",
- stat->nr_samples, stat->mean, stat->min, stat->max);
- } else {
- seq_puts(m, "samples=0");
- }
-}
-
static int queue_poll_stat_show(void *data, struct seq_file *m)
{
- struct request_queue *q = data;
- int bucket;
-
- if (!q->poll_stat)
- return 0;
-
- for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
- seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket));
- print_stat(m, &q->poll_stat[2 * bucket]);
- seq_puts(m, "\n");
-
- seq_printf(m, "write (%d Bytes): ", 1 << (9 + bucket));
- print_stat(m, &q->poll_stat[2 * bucket + 1]);
- seq_puts(m, "\n");
- }
return 0;
}
@@ -282,7 +257,6 @@ static const char *const rqf_name[] = {
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
RQF_NAME(ZONE_WRITE_LOCKED),
- RQF_NAME(MQ_POLL_SLEPT),
RQF_NAME(TIMED_OUT),
RQF_NAME(ELV),
RQF_NAME(RESV),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a875b1cdff9b5..4e30459df8151 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -46,51 +46,15 @@
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
-static void blk_mq_poll_stats_start(struct request_queue *q);
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
-
-static int blk_mq_poll_stats_bkt(const struct request *rq)
-{
- int ddir, sectors, bucket;
-
- ddir = rq_data_dir(rq);
- sectors = blk_rq_stats_sectors(rq);
-
- bucket = ddir + 2 * ilog2(sectors);
-
- if (bucket < 0)
- return -1;
- else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
- return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
-
- return bucket;
-}
-
-#define BLK_QC_T_SHIFT 16
-#define BLK_QC_T_INTERNAL (1U << 31)
-
static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
blk_qc_t qc)
{
- return xa_load(&q->hctx_table,
- (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
-}
-
-static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
- blk_qc_t qc)
-{
- unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);
-
- if (qc & BLK_QC_T_INTERNAL)
- return blk_mq_tag_to_rq(hctx->sched_tags, tag);
- return blk_mq_tag_to_rq(hctx->tags, tag);
+ return xa_load(&q->hctx_table, qc);
}
static inline blk_qc_t blk_rq_to_qc(struct request *rq)
{
- return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) |
- (rq->tag != -1 ?
- rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL));
+ return rq->mq_hctx->queue_num;
}
/*
@@ -1038,10 +1002,8 @@ static inline void blk_account_io_start(struct request *req)
static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
{
- if (rq->rq_flags & RQF_STATS) {
- blk_mq_poll_stats_start(rq->q);
+ if (rq->rq_flags & RQF_STATS)
blk_stat_add(rq, now);
- }
blk_mq_sched_completed_request(rq, now);
blk_account_io_done(rq, now);
@@ -4222,14 +4184,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
/* mark the queue as mq asap */
q->mq_ops = set->ops;
- q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
- blk_mq_poll_stats_bkt,
- BLK_MQ_POLL_STATS_BKTS, q);
- if (!q->poll_cb)
- goto err_exit;
-
if (blk_mq_alloc_ctxs(q))
- goto err_poll;
+ goto err_exit;
/* init q->mq_kobj and sw queues' kobjects */
blk_mq_sysfs_init(q);
@@ -4257,11 +4213,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->nr_requests = set->queue_depth;
- /*
- * Default to classic polling
- */
- q->poll_nsec = BLK_MQ_POLL_CLASSIC;
-
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
@@ -4269,9 +4220,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
err_hctxs:
blk_mq_release(q);
-err_poll:
- blk_stat_free_callback(q->poll_cb);
- q->poll_cb = NULL;
err_exit:
q->mq_ops = NULL;
return -ENOMEM;
@@ -4768,138 +4716,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
-/* Enable polling stats and return whether they were already enabled. */
-static bool blk_poll_stats_enable(struct request_queue *q)
-{
- if (q->poll_stat)
- return true;
-
- return blk_stats_alloc_enable(q);
-}
-
-static void blk_mq_poll_stats_start(struct request_queue *q)
-{
- /*
- * We don't arm the callback if polling stats are not enabled or the
- * callback is already active.
- */
- if (!q->poll_stat || blk_stat_is_active(q->poll_cb))
- return;
-
- blk_stat_activate_msecs(q->poll_cb, 100);
-}
-
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
-{
- struct request_queue *q = cb->data;
- int bucket;
-
- for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
- if (cb->stat[bucket].nr_samples)
- q->poll_stat[bucket] = cb->stat[bucket];
- }
-}
-
-static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
- struct request *rq)
-{
- unsigned long ret = 0;
- int bucket;
-
- /*
- * If stats collection isn't on, don't sleep but turn it on for
- * future users
- */
- if (!blk_poll_stats_enable(q))
- return 0;
-
- /*
- * As an optimistic guess, use half of the mean service time
- * for this type of request. We can (and should) make this smarter.
- * For instance, if the completion latencies are tight, we can
- * get closer than just half the mean. This is especially
- * important on devices where the completion latencies are longer
- * than ~10 usec. We do use the stats for the relevant IO size
- * if available which does lead to better estimates.
- */
- bucket = blk_mq_poll_stats_bkt(rq);
- if (bucket < 0)
- return ret;
-
- if (q->poll_stat[bucket].nr_samples)
- ret = (q->poll_stat[bucket].mean + 1) / 2;
-
- return ret;
-}
-
-static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc)
-{
- struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc);
- struct request *rq = blk_qc_to_rq(hctx, qc);
- struct hrtimer_sleeper hs;
- enum hrtimer_mode mode;
- unsigned int nsecs;
- ktime_t kt;
-
- /*
- * If a request has completed on queue that uses an I/O scheduler, we
- * won't get back a request from blk_qc_to_rq.
- */
- if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT))
- return false;
-
- /*
- * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
- *
- * 0: use half of prev avg
- * >0: use this specific value
- */
- if (q->poll_nsec > 0)
- nsecs = q->poll_nsec;
- else
- nsecs = blk_mq_poll_nsecs(q, rq);
-
- if (!nsecs)
- return false;
-
- rq->rq_flags |= RQF_MQ_POLL_SLEPT;
-
- /*
- * This will be replaced with the stats tracking code, using
- * 'avg_completion_time / 2' as the pre-sleep target.
- */
- kt = nsecs;
-
- mode = HRTIMER_MODE_REL;
- hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
- hrtimer_set_expires(&hs.timer, kt);
-
- do {
- if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
- break;
- set_current_state(TASK_UNINTERRUPTIBLE);
- hrtimer_sleeper_start_expires(&hs, mode);
- if (hs.task)
- io_schedule();
- hrtimer_cancel(&hs.timer);
- mode = HRTIMER_MODE_ABS;
- } while (hs.task && !signal_pending(current));
-
- __set_current_state(TASK_RUNNING);
- destroy_hrtimer_on_stack(&hs.timer);
-
- /*
- * If we sleep, have the caller restart the poll loop to reset the
- * state. Like for the other success return cases, the caller is
- * responsible for checking if the IO completed. If the IO isn't
- * complete, we'll get called again and will go straight to the busy
- * poll loop.
- */
- return true;
-}
-
-static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
- struct io_comp_batch *iob, unsigned int flags)
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+ unsigned int flags)
{
struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie);
long state = get_current_state();
@@ -4926,17 +4744,6 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie,
return 0;
}
-int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
- unsigned int flags)
-{
- if (!(flags & BLK_POLL_NOSLEEP) &&
- q->poll_nsec != BLK_MQ_POLL_CLASSIC) {
- if (blk_mq_poll_hybrid(q, cookie))
- return 1;
- }
- return blk_mq_poll_classic(q, cookie, iob, flags);
-}
-
unsigned int blk_mq_rq_cpu(struct request *rq)
{
return rq->mq_ctx->cpu;
diff --git a/block/blk-stat.c b/block/blk-stat.c
index c6ca16abf911e..74a1a8c32d86f 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -231,21 +231,3 @@ void blk_free_queue_stats(struct blk_queue_stats *stats)
kfree(stats);
}
-
-bool blk_stats_alloc_enable(struct request_queue *q)
-{
- struct blk_rq_stat *poll_stat;
-
- poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat),
- GFP_ATOMIC);
- if (!poll_stat)
- return false;
-
- if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) {
- kfree(poll_stat);
- return true;
- }
-
- blk_stat_add_callback(q, q->poll_cb);
- return false;
-}
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f1fce1c7fa44b..1a743b4f29582 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -408,35 +408,12 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
{
- int val;
-
- if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
- val = BLK_MQ_POLL_CLASSIC;
- else
- val = q->poll_nsec / 1000;
-
- return sprintf(page, "%d\n", val);
+ return sprintf(page, "%d\n", -1);
}
static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
size_t count)
{
- int err, val;
-
- if (!q->mq_ops || !q->mq_ops->poll)
- return -EINVAL;
-
- err = kstrtoint(page, 10, &val);
- if (err < 0)
- return err;
-
- if (val == BLK_MQ_POLL_CLASSIC)
- q->poll_nsec = BLK_MQ_POLL_CLASSIC;
- else if (val >= 0)
- q->poll_nsec = val * 1000;
- else
- return -EINVAL;
-
return count;
}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index dd5ce1137f04a..1dacb2c81fdda 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -57,8 +57,6 @@ typedef __u32 __bitwise req_flags_t;
#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18))
/* The per-zone write lock is held for this request */
#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19))
-/* already slept for hybrid poll */
-#define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20))
/* ->timeout has been called, don't expire again */
#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21))
/* queue has elevator attached */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d1aee08f8c181..6ede578dfbc64 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -44,12 +44,6 @@ extern const struct device_type disk_type;
extern struct device_type part_type;
extern struct class block_class;
-/* Must be consistent with blk_mq_poll_stats_bkt() */
-#define BLK_MQ_POLL_STATS_BKTS 16
-
-/* Doing classic polling */
-#define BLK_MQ_POLL_CLASSIC -1
-
/*
* Maximum number of blkcg policies allowed to be registered concurrently.
* Defined here to simplify include dependency.
@@ -468,10 +462,6 @@ struct request_queue {
#endif
unsigned int rq_timeout;
- int poll_nsec;
-
- struct blk_stat_callback *poll_cb;
- struct blk_rq_stat *poll_stat;
struct timer_list timeout;
struct work_struct timeout_work;
@@ -870,8 +860,6 @@ blk_status_t errno_to_blk_status(int errno);
/* only poll the hardware once, don't continue until a completion was found */
#define BLK_POLL_ONESHOT (1 << 0)
-/* do not sleep to wait for the expected completion time */
-#define BLK_POLL_NOSLEEP (1 << 1)
int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
unsigned int flags);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 4c233910e2009..a099dc0543d95 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1002,7 +1002,7 @@ void io_rw_fail(struct io_kiocb *req)
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
struct io_wq_work_node *pos, *start, *prev;
- unsigned int poll_flags = BLK_POLL_NOSLEEP;
+ unsigned int poll_flags = 0;
DEFINE_IO_COMP_BATCH(iob);
int nr_events = 0;
--
2.34.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCHv2] blk-mq: remove hybrid polling
2023-03-20 19:49 [PATCHv2] blk-mq: remove hybrid polling Keith Busch
@ 2023-03-20 20:56 ` Keith Busch
2023-03-20 20:57 ` Jens Axboe
2023-03-20 20:59 ` Jens Axboe
1 sibling, 1 reply; 4+ messages in thread
From: Keith Busch @ 2023-03-20 20:56 UTC (permalink / raw)
To: Keith Busch; +Cc: linux-block, Jens Axboe, io-uring, Pavel Begunkov
On Mon, Mar 20, 2023 at 12:49:26PM -0700, Keith Busch wrote:
> From: Keith Busch <[email protected]>
>
> io_uring provides the only way user space can poll completions, and that
> always sets BLK_POLL_NOSLEEP. This effectively makes hybrid polling dead
> code, so remove it and everything supporting it.
Hybrid polling was effectively killed off with 9650b453a3d4b1, "block: ignore
RWF_HIPRI hint for sync dio", so we could add a "Fixes: " for that. It was
still potentially reachable through io_uring until d729cf9acb93119, "io_uring:
don't sleep when polling for I/O", but hybrid polling probably should not have
been reachable through that async interface from the beginning.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCHv2] blk-mq: remove hybrid polling
2023-03-20 20:56 ` Keith Busch
@ 2023-03-20 20:57 ` Jens Axboe
0 siblings, 0 replies; 4+ messages in thread
From: Jens Axboe @ 2023-03-20 20:57 UTC (permalink / raw)
To: Keith Busch, Keith Busch; +Cc: linux-block, io-uring, Pavel Begunkov
On 3/20/23 2:56 PM, Keith Busch wrote:
> On Mon, Mar 20, 2023 at 12:49:26PM -0700, Keith Busch wrote:
>> From: Keith Busch <[email protected]>
>>
>> io_uring provides the only way user space can poll completions, and that
>> always sets BLK_POLL_NOSLEEP. This effectively makes hybrid polling dead
>> code, so remove it and everything supporting it.
>
> Hybrid polling was effectively killed off with 9650b453a3d4b1, "block: ignore
> RWF_HIPRI hint for sync dio", so we could add a "Fixes: " for that. It was
> still potentially reachable through io_uring until d729cf9acb93119, "io_uring:
> don't sleep when polling for I/O", but hybrid polling probably should not have
> been reachable through that async interface from the beginning.
Thanks, I'll add it as fixing both of those. More as a reference than
anything else, it's not like we are backporting this change.
--
Jens Axboe
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCHv2] blk-mq: remove hybrid polling
2023-03-20 19:49 [PATCHv2] blk-mq: remove hybrid polling Keith Busch
2023-03-20 20:56 ` Keith Busch
@ 2023-03-20 20:59 ` Jens Axboe
1 sibling, 0 replies; 4+ messages in thread
From: Jens Axboe @ 2023-03-20 20:59 UTC (permalink / raw)
To: linux-block, io-uring, Keith Busch; +Cc: Pavel Begunkov, Keith Busch
On Mon, 20 Mar 2023 12:49:26 -0700, Keith Busch wrote:
> io_uring provides the only way user space can poll completions, and that
> always sets BLK_POLL_NOSLEEP. This effectively makes hybrid polling dead
> code, so remove it and everything supporting it.
>
>
Applied, thanks!
[1/1] blk-mq: remove hybrid polling
commit: aa939e415c6c49cabcab2bc16fda2bc38ca0c235
Best regards,
--
Jens Axboe
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2023-03-20 21:00 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-03-20 19:49 [PATCHv2] blk-mq: remove hybrid polling Keith Busch
2023-03-20 20:56 ` Keith Busch
2023-03-20 20:57 ` Jens Axboe
2023-03-20 20:59 ` Jens Axboe
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox