// SPDX-License-Identifier: GPL-2.0
/*
 * buffered writeback throttling. loosely based on CoDel. We can't drop
 * packets for IO scheduling, so the logic is something like this:
 *
 * - Monitor latencies in a defined window of time.
 * - If the minimum latency in the above window exceeds some target, increment
 *   scaling step and scale down queue depth by a factor of 2x. The monitoring
 *   window is then shrunk to 100 / sqrt(scaling step + 1).
 * - For any window where we don't have solid data on what the latencies
 *   look like, retain status quo.
 * - If latencies look good, decrement scaling step.
 * - If we're only doing writes, allow the scaling step to go negative. This
 *   will temporarily boost write performance, snapping back to a stable
 *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
 *   positive scaling steps where we shrink the monitoring window, a negative
 *   scaling step retains the default step==0 window size.
 *
 * Copyright (C) 2016 Jens Axboe
 *
 */
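
/*
 * A worked example of the loop above, with the default 100msec window:
 * three consecutive windows whose minimum read latency exceeds the
 * target take the scaling step from 0 to 3, halving the allowed queue
 * depth each time and shrinking the monitoring window to
 * 100 / sqrt(3 + 1) == 50msec. Once minimum latencies fall back under
 * the target, each good window decrements the step, walking the depth
 * back up toward its original value.
 */
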
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>

#include "blk-stat.h"
#include "blk-wbt.h"
#include "blk-rq-qos.h"
#include "elevator.h"

#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>

enum wbt_flags {
	WBT_TRACKED	= 1,	/* write, tracked for throttling */
	WBT_READ	= 2,	/* read */
	WBT_KSWAPD	= 4,	/* write, from kswapd */
	WBT_DISCARD	= 8,	/* discard */

	WBT_NR_BITS	= 4,	/* number of bits */
};
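
/*
 * These flags combine: e.g. a throttled buffered write issued by kswapd
 * carries WBT_TRACKED | WBT_KSWAPD (0x5), and a tracked discard carries
 * WBT_TRACKED | WBT_DISCARD (0x9). See bio_to_wbt_flags() below.
 */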

enum {
	WBT_RWQ_BG	= 0,
	WBT_RWQ_KSWAPD,
	WBT_RWQ_DISCARD,
	WBT_NUM_RWQ,
};

/*
 * If the current state is WBT_STATE_ON/OFF_DEFAULT, it can be changed to
 * any other state; if the current state is WBT_STATE_ON/OFF_MANUAL, it
 * can only be changed to WBT_STATE_OFF/ON_MANUAL.
 */
enum {
	WBT_STATE_ON_DEFAULT	= 1,	/* on by default */
	WBT_STATE_ON_MANUAL	= 2,	/* on manually by sysfs */
	WBT_STATE_OFF_DEFAULT	= 3,	/* off by default */
	WBT_STATE_OFF_MANUAL	= 4,	/* off manually by sysfs */
};

struct rq_wb {
	/*
	 * Settings that govern how we throttle
	 */
	unsigned int wb_background;		/* background writeback */
	unsigned int wb_normal;			/* normal writeback */

	short enable_state;			/* WBT_STATE_* */

	/*
	 * Number of consecutive periods where we don't have enough
	 * information to make a firm scale up/down decision.
	 */
	unsigned int unknown_cnt;

	u64 win_nsec;				/* default window size */
	u64 cur_win_nsec;			/* current window size */

	struct blk_stat_callback *cb;

	u64 sync_issue;
	void *sync_cookie;

	unsigned int wc;			/* write caching enabled? */

	unsigned long last_issue;		/* last non-throttled issue */
	unsigned long last_comp;		/* last non-throttled comp */
	unsigned long min_lat_nsec;
	struct rq_qos rqos;
	struct rq_wait rq_wait[WBT_NUM_RWQ];
	struct rq_depth rq_depth;
};

static inline struct rq_wb *RQWB(struct rq_qos *rqos)
{
	return container_of(rqos, struct rq_wb, rqos);
}

static inline void wbt_clear_state(struct request *rq)
{
	rq->wbt_flags = 0;
}

static inline enum wbt_flags wbt_flags(struct request *rq)
{
	return rq->wbt_flags;
}

static inline bool wbt_is_tracked(struct request *rq)
{
	return rq->wbt_flags & WBT_TRACKED;
}

static inline bool wbt_is_read(struct request *rq)
{
	return rq->wbt_flags & WBT_READ;
}

enum {
	/*
	 * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
	 * from here depending on device stats
	 */
	RWB_DEF_DEPTH	= 16,

	/*
	 * 100msec window
	 */
	RWB_WINDOW_NSEC		= 100 * 1000 * 1000ULL,

	/*
	 * Disregard stats, if we don't meet this minimum
	 */
	RWB_MIN_WRITE_SAMPLES	= 3,

	/*
	 * If we have this number of consecutive windows without enough
	 * information to scale up or down, slowly return to the center
	 * state (step == 0).
	 */
	RWB_UNKNOWN_BUMP	= 5,
};
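
/*
 * Taken together: with the default 100msec window, RWB_UNKNOWN_BUMP
 * means roughly half a second of back-to-back windows with no usable
 * samples must pass before wb_timer_fn() starts nudging scale_step
 * back toward 0 (less if the window was shrunk by prior scale-downs).
 */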

static inline bool rwb_enabled(struct rq_wb *rwb)
{
	return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT &&
		      rwb->enable_state != WBT_STATE_OFF_MANUAL;
}

static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
{
	if (rwb_enabled(rwb)) {
		const unsigned long cur = jiffies;

		if (cur != *var)
			*var = cur;
	}
}

/*
 * If a task was rate throttled in balance_dirty_pages() within the last
 * second or so, use that to indicate a higher cleaning rate.
 */
static bool wb_recent_wait(struct rq_wb *rwb)
{
	struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;

	return time_before(jiffies, wb->dirty_sleep + HZ);
}

static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
					  enum wbt_flags wb_acct)
{
	if (wb_acct & WBT_KSWAPD)
		return &rwb->rq_wait[WBT_RWQ_KSWAPD];
	else if (wb_acct & WBT_DISCARD)
		return &rwb->rq_wait[WBT_RWQ_DISCARD];

	return &rwb->rq_wait[WBT_RWQ_BG];
}

static void rwb_wake_all(struct rq_wb *rwb)
{
	int i;

	for (i = 0; i < WBT_NUM_RWQ; i++) {
		struct rq_wait *rqw = &rwb->rq_wait[i];

		if (wq_has_sleeper(&rqw->wait))
			wake_up_all(&rqw->wait);
	}
}

static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
			 enum wbt_flags wb_acct)
{
	int inflight, limit;

	inflight = atomic_dec_return(&rqw->inflight);

	/*
	 * For discards, our limit is always the background. For writes, if
	 * the device does write back caching, drop further down before we
	 * wake people up.
	 */
	if (wb_acct & WBT_DISCARD)
		limit = rwb->wb_background;
	else if (rwb->wc && !wb_recent_wait(rwb))
		limit = 0;
	else
		limit = rwb->wb_normal;

	/*
	 * Don't wake anyone up if we are above the normal limit.
	 */
	if (inflight && inflight >= limit)
		return;

	if (wq_has_sleeper(&rqw->wait)) {
		int diff = limit - inflight;

		if (!inflight || diff >= rwb->wb_background / 2)
			wake_up_all(&rqw->wait);
	}
}
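
/*
 * A worked example of the wakeup check above: with max_depth == 16,
 * calc_wb_limits() below gives wb_normal == 8 and wb_background == 4.
 * On a write-cached device with no recent dirty-page waiting, limit is
 * 0, so sleepers are only woken once the queue fully drains. Otherwise
 * wake_up_all() runs once at least wb_background / 2 == 2 slots have
 * freed up under the limit (or the queue has drained).
 */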

static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
{
	struct rq_wb *rwb = RQWB(rqos);
	struct rq_wait *rqw;

	if (!(wb_acct & WBT_TRACKED))
		return;

	rqw = get_rq_wait(rwb, wb_acct);
	wbt_rqw_done(rwb, rqw, wb_acct);
}

/*
 * Called on completion of a request. Note that it's also called when
 * a request is merged, at which point the request is freed.
 */
static void wbt_done(struct rq_qos *rqos, struct request *rq)
{
	struct rq_wb *rwb = RQWB(rqos);

	if (!wbt_is_tracked(rq)) {
		if (rwb->sync_cookie == rq) {
			rwb->sync_issue = 0;
			rwb->sync_cookie = NULL;
		}

		if (wbt_is_read(rq))
			wb_timestamp(rwb, &rwb->last_comp);
	} else {
		WARN_ON_ONCE(rq == rwb->sync_cookie);
		__wbt_done(rqos, wbt_flags(rq));
	}
	wbt_clear_state(rq);
}

static inline bool stat_sample_valid(struct blk_rq_stat *stat)
{
	/*
	 * We need at least one read sample, and a minimum of
	 * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
	 * that it's writes impacting us, and not just some sole read on
	 * a device that is in a lower power state.
	 */
	return (stat[READ].nr_samples >= 1 &&
		stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
}

static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
	u64 now, issue = READ_ONCE(rwb->sync_issue);

	if (!issue || !rwb->sync_cookie)
		return 0;

	now = ktime_to_ns(ktime_get());
	return now - issue;
}

static inline unsigned int wbt_inflight(struct rq_wb *rwb)
{
	unsigned int i, ret = 0;

	for (i = 0; i < WBT_NUM_RWQ; i++)
		ret += atomic_read(&rwb->rq_wait[i].inflight);

	return ret;
}

enum {
	LAT_OK = 1,
	LAT_UNKNOWN,
	LAT_UNKNOWN_WRITES,
	LAT_EXCEEDED,
};

static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
	struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
	struct rq_depth *rqd = &rwb->rq_depth;
	u64 thislat;

	/*
	 * If our stored sync issue exceeds the window size, or it
	 * exceeds our min target AND we haven't logged any entries,
	 * flag the latency as exceeded. wbt works off completion latencies,
	 * but for a flooded device, a single sync IO can take a long time
	 * to complete after being issued. If this time exceeds our
	 * monitoring window AND we didn't see any other completions in that
	 * window, then count that sync IO as a violation of the latency.
	 */
	thislat = rwb_sync_issue_lat(rwb);
	if (thislat > rwb->cur_win_nsec ||
	    (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
		trace_wbt_lat(bdi, thislat);
		return LAT_EXCEEDED;
	}

	/*
	 * No read/write mix, if stat isn't valid
	 */
	if (!stat_sample_valid(stat)) {
		/*
		 * If we had writes in this stat window and the window is
		 * current, we're only doing writes. If a task recently
		 * waited or still has writes in flight, consider us doing
		 * just writes as well.
		 */
		if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
		    wbt_inflight(rwb))
			return LAT_UNKNOWN_WRITES;
		return LAT_UNKNOWN;
	}

	/*
	 * If the 'min' latency exceeds our target, step down.
	 */
	if (stat[READ].min > rwb->min_lat_nsec) {
		trace_wbt_lat(bdi, stat[READ].min);
		trace_wbt_stat(bdi, stat);
		return LAT_EXCEEDED;
	}

	if (rqd->scale_step)
		trace_wbt_stat(bdi, stat);

	return LAT_OK;
}
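
/*
 * To summarize the cases above: a stuck sync issue or a too-high
 * minimum read latency yields LAT_EXCEEDED; an invalid sample mix with
 * writes present (or recent dirty-page waiting, or tracked IO still in
 * flight) yields LAT_UNKNOWN_WRITES; an invalid sample mix with nothing
 * going on yields LAT_UNKNOWN; and only a valid read/write mix whose
 * minimum read latency is under the target yields LAT_OK.
 */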

static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
	struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
	struct rq_depth *rqd = &rwb->rq_depth;

	trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
			rwb->wb_background, rwb->wb_normal, rqd->max_depth);
}

static void calc_wb_limits(struct rq_wb *rwb)
{
	if (rwb->min_lat_nsec == 0) {
		rwb->wb_normal = rwb->wb_background = 0;
	} else if (rwb->rq_depth.max_depth <= 2) {
		rwb->wb_normal = rwb->rq_depth.max_depth;
		rwb->wb_background = 1;
	} else {
		rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2;
		rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4;
	}
}
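
/*
 * For illustration: max_depth == 16 gives wb_normal == 8 and
 * wb_background == 4; max_depth == 2 gives wb_normal == 2 and
 * wb_background == 1; and a zero latency target, which corresponds to
 * wbt being switched off, zeroes both limits.
 */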

static void scale_up(struct rq_wb *rwb)
{
	if (!rq_depth_scale_up(&rwb->rq_depth))
		return;
	calc_wb_limits(rwb);
	rwb->unknown_cnt = 0;
	rwb_wake_all(rwb);
	rwb_trace_step(rwb, tracepoint_string("scale up"));
}

static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
	if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle))
		return;
	calc_wb_limits(rwb);
	rwb->unknown_cnt = 0;
	rwb_trace_step(rwb, tracepoint_string("scale down"));
}

static void rwb_arm_timer(struct rq_wb *rwb)
{
	struct rq_depth *rqd = &rwb->rq_depth;

	if (rqd->scale_step > 0) {
		/*
		 * We should speed this up, using some variant of a fast
		 * integer inverse square root calculation. Since we only do
		 * this for every window expiration, it's not a huge deal,
		 * though.
		 */
		rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
					int_sqrt((rqd->scale_step + 1) << 8));
	} else {
		/*
		 * For step < 0, we don't want to increase/decrease the
		 * window size.
		 */
		rwb->cur_win_nsec = rwb->win_nsec;
	}

	blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
}
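
/*
 * The fixed-point math above computes win_nsec / sqrt(scale_step + 1):
 * the dividend is scaled by 16 (<< 4) and the divisor by sqrt(256)
 * (<< 8 under the square root), so the factors of 16 cancel. E.g. at
 * scale_step == 3 with the default 100msec window, int_sqrt(4 << 8) ==
 * 32 and cur_win_nsec == (100msec << 4) / 32 == 50msec.
 */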

static void wb_timer_fn(struct blk_stat_callback *cb)
{
	struct rq_wb *rwb = cb->data;
	struct rq_depth *rqd = &rwb->rq_depth;
	unsigned int inflight = wbt_inflight(rwb);
	int status;

	if (!rwb->rqos.disk)
		return;

	status = latency_exceeded(rwb, cb->stat);

	trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight);

	/*
	 * If we exceeded the latency target, step down. If we did not,
	 * step one level up. If we don't know enough to say either exceeded
	 * or ok, then don't do anything.
	 */
	switch (status) {
	case LAT_EXCEEDED:
		scale_down(rwb, true);
		break;
	case LAT_OK:
		scale_up(rwb);
		break;
	case LAT_UNKNOWN_WRITES:
		/*
		 * We started at the center step, but don't have a valid
		 * read/write sample, though we do have writes going on.
		 * Allow the step to go negative, to increase write perf.
		 */
		scale_up(rwb);
		break;
	case LAT_UNKNOWN:
		if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
			break;
		/*
		 * We get here when we previously scaled the depth, and we
		 * currently don't have a valid read/write sample. For that
		 * case, slowly return to the center state (step == 0).
		 */
		if (rqd->scale_step > 0)
			scale_up(rwb);
		else if (rqd->scale_step < 0)
			scale_down(rwb, false);
		break;
	default:
		break;
	}

	/*
	 * Re-arm timer, if we have IO in flight
	 */
	if (rqd->scale_step || inflight)
		rwb_arm_timer(rwb);
}

static void wbt_update_limits(struct rq_wb *rwb)
{
	struct rq_depth *rqd = &rwb->rq_depth;

	rqd->scale_step = 0;
	rqd->scaled_max = false;

	rq_depth_calc_max_depth(rqd);
	calc_wb_limits(rwb);

	rwb_wake_all(rwb);
}

bool wbt_disabled(struct request_queue *q)
{
	struct rq_qos *rqos = wbt_rq_qos(q);

	return !rqos || !rwb_enabled(RQWB(rqos));
}

u64 wbt_get_min_lat(struct request_queue *q)
{
	struct rq_qos *rqos = wbt_rq_qos(q);

	if (!rqos)
		return 0;
	return RQWB(rqos)->min_lat_nsec;
}

void wbt_set_min_lat(struct request_queue *q, u64 val)
{
	struct rq_qos *rqos = wbt_rq_qos(q);

	if (!rqos)
		return;

	RQWB(rqos)->min_lat_nsec = val;
	if (val)
		RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
	else
		RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL;

	wbt_update_limits(RQWB(rqos));
}

static bool close_io(struct rq_wb *rwb)
{
	const unsigned long now = jiffies;

	return time_before(now, rwb->last_issue + HZ / 10) ||
		time_before(now, rwb->last_comp + HZ / 10);
}

#define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO)

static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf)
{
	unsigned int limit;

	if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD)
		return rwb->wb_background;

	/*
	 * At this point we know it's a buffered write. If this is
	 * kswapd trying to free memory, or REQ_SYNC is set, then
	 * it's WB_SYNC_ALL writeback, and we'll use the max limit for
	 * that. If the write is marked as a background write, then use
	 * the idle limit, or go to normal if we haven't had competing
	 * IO for a bit.
	 */
	if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
		limit = rwb->rq_depth.max_depth;
	else if ((opf & REQ_BACKGROUND) || close_io(rwb)) {
		/*
		 * If less than 100ms since we completed unrelated IO,
		 * limit us to half the depth for background writeback.
		 */
		limit = rwb->wb_background;
	} else
		limit = rwb->wb_normal;

	return limit;
}
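
/*
 * Assuming max_depth == 16 for illustration (the real value depends on
 * the device queue depth and the current scale_step), the limits above
 * map out as: discards and background/close-IO writeback get 4
 * (wb_background), ordinary buffered writeback gets 8 (wb_normal), and
 * sync/metadata/priority writes or writes issued under memory reclaim
 * get the full 16 (max_depth).
 */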

struct wbt_wait_data {
	struct rq_wb *rwb;
	enum wbt_flags wb_acct;
	blk_opf_t opf;
};

static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
{
	struct wbt_wait_data *data = private_data;

	return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf));
}

static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
{
	struct wbt_wait_data *data = private_data;

	wbt_rqw_done(data->rwb, rqw, data->wb_acct);
}

/*
 * Block if we will exceed our limit, or if we are currently waiting for
 * the timer to kick off queuing again.
 */
static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
		       blk_opf_t opf)
{
	struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
	struct wbt_wait_data data = {
		.rwb = rwb,
		.wb_acct = wb_acct,
		.opf = opf,
	};

	rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
}

static inline bool wbt_should_throttle(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_WRITE:
		/*
		 * Don't throttle WRITE_ODIRECT
		 */
		if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
		    (REQ_SYNC | REQ_IDLE))
			return false;
		fallthrough;
	case REQ_OP_DISCARD:
		return true;
	default:
		return false;
	}
}
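
/*
 * For example, direct IO writes (the old WRITE_ODIRECT) are issued with
 * both REQ_SYNC and REQ_IDLE set, so they skip throttling above, while
 * ordinary buffered writeback carries at most one of the two flags and
 * is throttled, as is any discard.
 */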

static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
{
	enum wbt_flags flags = 0;

	if (!rwb_enabled(rwb))
		return 0;

	if (bio_op(bio) == REQ_OP_READ) {
		flags = WBT_READ;
	} else if (wbt_should_throttle(bio)) {
		if (current_is_kswapd())
			flags |= WBT_KSWAPD;
		if (bio_op(bio) == REQ_OP_DISCARD)
			flags |= WBT_DISCARD;
		flags |= WBT_TRACKED;
	}
	return flags;
}

static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
{
	struct rq_wb *rwb = RQWB(rqos);
	enum wbt_flags flags = bio_to_wbt_flags(rwb, bio);

	__wbt_done(rqos, flags);
}

/*
 * May sleep, if we have exceeded the writeback limits. Caller can pass
 * in an irq held spinlock, if it holds one when calling this function.
 * If we do sleep, we'll release and re-grab it.
 */
static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
{
	struct rq_wb *rwb = RQWB(rqos);
	enum wbt_flags flags;

	flags = bio_to_wbt_flags(rwb, bio);
	if (!(flags & WBT_TRACKED)) {
		if (flags & WBT_READ)
			wb_timestamp(rwb, &rwb->last_issue);
		return;
	}

	__wbt_wait(rwb, flags, bio->bi_opf);

	if (!blk_stat_is_active(rwb->cb))
		rwb_arm_timer(rwb);
}

static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
	struct rq_wb *rwb = RQWB(rqos);

	rq->wbt_flags |= bio_to_wbt_flags(rwb, bio);
}

static void wbt_issue(struct rq_qos *rqos, struct request *rq)
{
	struct rq_wb *rwb = RQWB(rqos);

	if (!rwb_enabled(rwb))
		return;

	/*
	 * Track the sync issue time, so we can react quicker if a sync IO
	 * takes a long time to complete. Note that this is just a hint. The
	 * request can go away when it completes, so it's important we never
	 * dereference it. We only use the address to compare with, which is
	 * why we store the sync_issue time locally.
	 */
	if (wbt_is_read(rq) && !rwb->sync_issue) {
		rwb->sync_cookie = rq;
		rwb->sync_issue = rq->io_start_time_ns;
	}
}

static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
{
	struct rq_wb *rwb = RQWB(rqos);

	if (!rwb_enabled(rwb))
		return;
	if (rq == rwb->sync_cookie) {
		rwb->sync_issue = 0;
		rwb->sync_cookie = NULL;
	}
}

void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
{
	struct rq_qos *rqos = wbt_rq_qos(q);

	if (rqos)
		RQWB(rqos)->wc = write_cache_on;
}

/*
 * Enable wbt if defaults are configured that way
 */
void wbt_enable_default(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct rq_qos *rqos;
	bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ);

	if (q->elevator &&
	    test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags))
		enable = false;

	/* Throttling already enabled? */
	rqos = wbt_rq_qos(q);
	if (rqos) {
		if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
			RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
		return;
	}

	/* Queue not registered? Maybe shutting down... */
	if (!blk_queue_registered(q))
		return;

	if (queue_is_mq(q) && enable)
		wbt_init(disk);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);

u64 wbt_default_latency_nsec(struct request_queue *q)
{
	/*
	 * We default to 2msec for non-rotational storage, and 75msec
	 * for rotational storage.
	 */
	if (blk_queue_nonrot(q))
		return 2000000ULL;
	else
		return 75000000ULL;
}

static int wbt_data_dir(const struct request *rq)
{
	const enum req_op op = req_op(rq);

	if (op == REQ_OP_READ)
		return READ;
	else if (op_is_write(op))
		return WRITE;

	/* don't account */
	return -1;
}

static void wbt_queue_depth_changed(struct rq_qos *rqos)
{
	RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue);
	wbt_update_limits(RQWB(rqos));
}

static void wbt_exit(struct rq_qos *rqos)
{
	struct rq_wb *rwb = RQWB(rqos);

	blk_stat_remove_callback(rqos->disk->queue, rwb->cb);
	blk_stat_free_callback(rwb->cb);
	kfree(rwb);
}

/*
 * Disable wbt, if enabled by default.
 */
void wbt_disable_default(struct gendisk *disk)
{
	struct rq_qos *rqos = wbt_rq_qos(disk->queue);
	struct rq_wb *rwb;

	if (!rqos)
		return;
	rwb = RQWB(rqos);
	if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
		blk_stat_deactivate(rwb->cb);
		rwb->enable_state = WBT_STATE_OFF_DEFAULT;
	}
}
EXPORT_SYMBOL_GPL(wbt_disable_default);

#ifdef CONFIG_BLK_DEBUG_FS
static int wbt_curr_win_nsec_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%llu\n", rwb->cur_win_nsec);
	return 0;
}

static int wbt_enabled_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%d\n", rwb->enable_state);
	return 0;
}

static int wbt_id_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;

	seq_printf(m, "%u\n", rqos->id);
	return 0;
}

static int wbt_inflight_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);
	int i;

	for (i = 0; i < WBT_NUM_RWQ; i++)
		seq_printf(m, "%d: inflight %d\n", i,
			   atomic_read(&rwb->rq_wait[i].inflight));
	return 0;
}

static int wbt_min_lat_nsec_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%lu\n", rwb->min_lat_nsec);
	return 0;
}

static int wbt_unknown_cnt_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%u\n", rwb->unknown_cnt);
	return 0;
}

static int wbt_normal_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%u\n", rwb->wb_normal);
	return 0;
}

static int wbt_background_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%u\n", rwb->wb_background);
	return 0;
}

static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = {
	{"curr_win_nsec", 0400, wbt_curr_win_nsec_show},
	{"enabled", 0400, wbt_enabled_show},
	{"id", 0400, wbt_id_show},
	{"inflight", 0400, wbt_inflight_show},
	{"min_lat_nsec", 0400, wbt_min_lat_nsec_show},
	{"unknown_cnt", 0400, wbt_unknown_cnt_show},
	{"wb_normal", 0400, wbt_normal_show},
	{"wb_background", 0400, wbt_background_show},
	{},
};
#endif

static const struct rq_qos_ops wbt_rqos_ops = {
	.throttle = wbt_wait,
	.issue = wbt_issue,
	.track = wbt_track,
	.requeue = wbt_requeue,
	.done = wbt_done,
	.cleanup = wbt_cleanup,
	.queue_depth_changed = wbt_queue_depth_changed,
	.exit = wbt_exit,
#ifdef CONFIG_BLK_DEBUG_FS
	.debugfs_attrs = wbt_debugfs_attrs,
#endif
};
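
/*
 * A rough sketch of how these hooks fire for a tracked write (the exact
 * call sites live in the rq-qos core, not in this file): .throttle runs
 * at bio submission and may sleep on the rq_wait queue, .track copies
 * the wbt flags onto the allocated request, .issue records the
 * sync-issue hint when the request goes to the driver (reads only), and
 * .done drops the inflight count and wakes throttled waiters on
 * completion. .cleanup covers bios that are throttled but never become
 * requests, and .requeue clears the sync hint for re-driven requests.
 */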

int wbt_init(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct rq_wb *rwb;
	int i;
	int ret;

	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
	if (!rwb)
		return -ENOMEM;

	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
	if (!rwb->cb) {
		kfree(rwb);
		return -ENOMEM;
	}

	for (i = 0; i < WBT_NUM_RWQ; i++)
		rq_wait_init(&rwb->rq_wait[i]);

	rwb->last_comp = rwb->last_issue = jiffies;
	rwb->win_nsec = RWB_WINDOW_NSEC;
	rwb->enable_state = WBT_STATE_ON_DEFAULT;
	rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags);
	rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
	rwb->rq_depth.queue_depth = blk_queue_depth(q);
	wbt_update_limits(rwb);

	/*
	 * Assign rwb and add the stats callback.
	 */
	mutex_lock(&q->rq_qos_mutex);
	ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
	mutex_unlock(&q->rq_qos_mutex);
	if (ret)
		goto err_free;

	blk_stat_add_callback(q, rwb->cb);

	return 0;

err_free:
	blk_stat_free_callback(rwb->cb);
	kfree(rwb);
	return ret;
}