bfq-iosched.h source code [linux/block/bfq-iosched.h]

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Header file for the BFQ I/O scheduler: data structures and
 * prototypes of interface functions among BFQ components.
 */
6	#ifndef _BFQ_H
7	#define _BFQ_H
8
9	#include <linux/blktrace_api.h>
10	#include <linux/hrtimer.h>
11
12	#include "blk-cgroup-rwstat.h"
13
/* Number of I/O priority classes; sizes the per-class service-tree array. */
#define BFQ_IOPRIO_CLASSES	3
#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)

/* Bounds of the weight range, and coefficient used in weight conversions. */
#define BFQ_MIN_WEIGHT			1
#define BFQ_MAX_WEIGHT			1000
#define BFQ_WEIGHT_CONVERSION_COEFF	10

#define BFQ_DEFAULT_QUEUE_IOPRIO	4

#define BFQ_DEFAULT_GRP_IOPRIO	0
#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE

#define MAX_BFQQ_NAME_LENGTH 16

/*
 * Soft real-time applications are extremely more latency sensitive
 * than interactive ones. Over-raise the weight of the former to
 * privilege them against the latter.
 */
#define BFQ_SOFTRT_WEIGHT_FACTOR	100

/*
 * Maximum number of actuators supported. This constant is used simply
 * to define the size of the static array that will contain
 * per-actuator data. The current value is hopefully a good upper
 * bound to the possible number of actuators of any actual drive.
 */
#define BFQ_MAX_ACTUATORS 8

struct bfq_entity;

45	/**
46	* struct bfq_service_tree - per ioprio_class service tree.
47	*
48	* Each service tree represents a B-WF2Q+ scheduler on its own. Each
49	* ioprio_class has its own independent scheduler, and so its own
50	* bfq_service_tree. All the fields are protected by the queue lock
51	* of the containing bfqd.
52	*/
53	struct bfq_service_tree {
54	/ tree for active entities (i.e., those backlogged) /
55	struct rb_root active;
56	/ tree for idle entities (i.e., not backlogged, with V < F_i)/
57	struct rb_root idle;
58
59	/ idle entity with minimum F_i /
60	struct bfq_entity *first_idle;
61	/ idle entity with maximum F_i /
62	struct bfq_entity *last_idle;
63
64	/ scheduler virtual time /
65	u64 vtime;
66	/ scheduler weight sum; active and idle entities contribute to it /
67	unsigned long wsum;
68	};
69
70	/**
71	* struct bfq_sched_data - multi-class scheduler.
72	*
73	* bfq_sched_data is the basic scheduler queue. It supports three
74	* ioprio_classes, and can be used either as a toplevel queue or as an
75	* intermediate queue in a hierarchical setup.
76	*
77	* The supported ioprio_classes are the same as in CFQ, in descending
78	* priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
79	* Requests from higher priority queues are served before all the
80	* requests from lower priority queues; among requests of the same
81	* queue requests are served according to B-WF2Q+.
82	*
83	* The schedule is implemented by the service trees, plus the field
84	* @next_in_service, which points to the entity on the active trees
85	* that will be served next, if 1) no changes in the schedule occurs
86	* before the current in-service entity is expired, 2) the in-service
87	* queue becomes idle when it expires, and 3) if the entity pointed by
88	* in_service_entity is not a queue, then the in-service child entity
89	* of the entity pointed by in_service_entity becomes idle on
90	* expiration. This peculiar definition allows for the following
91	* optimization, not yet exploited: while a given entity is still in
92	* service, we already know which is the best candidate for next
93	* service among the other active entities in the same parent
94	* entity. We can then quickly compare the timestamps of the
95	* in-service entity with those of such best candidate.
96	*
97	* All fields are protected by the lock of the containing bfqd.
98	*/
99	struct bfq_sched_data {
100	/ entity in service /
101	struct bfq_entity *in_service_entity;
102	/ head-of-line entity (see comments above) /
103	struct bfq_entity *next_in_service;
104	/ array of service trees, one per ioprio_class /
105	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
106	/ last time CLASS_IDLE was served /
107	unsigned long bfq_class_idle_last_service;
108
109	};
110
111	/**
112	* struct bfq_weight_counter - counter of the number of all active queues
113	* with a given weight.
114	*/
115	struct bfq_weight_counter {
116	unsigned int weight; / weight of the queues this counter refers to /
117	unsigned int num_active; / nr of active queues with this weight /
118	/*
119	* Weights tree member (see bfq_data's @queue_weights_tree)
120	*/
121	struct rb_node weights_node;
122	};
123
124	/**
125	* struct bfq_entity - schedulable entity.
126	*
127	* A bfq_entity is used to represent either a bfq_queue (leaf node in the
128	* cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
129	* entity belongs to the sched_data of the parent group in the cgroup
130	* hierarchy. Non-leaf entities have also their own sched_data, stored
131	* in @my_sched_data.
132	*
133	* Each entity stores independently its priority values; this would
134	* allow different weights on different devices, but this
135	* functionality is not exported to userspace by now. Priorities and
136	* weights are updated lazily, first storing the new values into the
137	* new_* fields, then setting the @prio_changed flag. As soon as
138	* there is a transition in the entity state that allows the priority
139	* update to take place the effective and the requested priority
140	* values are synchronized.
141	*
142	* Unless cgroups are used, the weight value is calculated from the
143	* ioprio to export the same interface as CFQ. When dealing with
144	* "well-behaved" queues (i.e., queues that do not spend too much
145	* time to consume their budget and have true sequential behavior, and
146	* when there are no external factors breaking anticipation) the
147	* relative weights at each level of the cgroups hierarchy should be
148	* guaranteed. All the fields are protected by the queue lock of the
149	* containing bfqd.
150	*/
151	struct bfq_entity {
152	/ service_tree member /
153	struct rb_node rb_node;
154
155	/*
156	* Flag, true if the entity is on a tree (either the active or
157	* the idle one of its service_tree) or is in service.
158	*/
159	bool on_st_or_in_serv;
160
161	/ B-WF2Q+ start and finish timestamps [sectors/weight] /
162	u64 start, finish;
163
164	/ tree the entity is enqueued into; %NULL if not on a tree /
165	struct rb_root *tree;
166
167	/*
168	* minimum start time of the (active) subtree rooted at this
169	* entity; used for O(log N) lookups into active trees
170	*/
171	u64 min_start;
172
173	/ amount of service received during the last service slot /
174	int service;
175
176	/ budget, used also to calculate F_i: F_i = S_i + @budget / @weight /
177	int budget;
178
179	/ Number of requests allocated in the subtree of this entity /
180	int allocated;
181
182	/ device weight, if non-zero, it overrides the default weight of*
183	* bfq_group_data */
184	int dev_weight;
185	/ weight of the queue /
186	int weight;
187	/ next weight if a change is in progress /
188	int new_weight;
189
190	/ original weight, used to implement weight boosting /
191	int orig_weight;
192
193	/ parent entity, for hierarchical scheduling /
194	struct bfq_entity *parent;
195
196	/*
197	* For non-leaf nodes in the hierarchy, the associated
198	* scheduler queue, %NULL on leaf nodes.
199	*/
200	struct bfq_sched_data *my_sched_data;
201	/ the scheduler queue this entity belongs to /
202	struct bfq_sched_data *sched_data;
203
204	/ flag, set to request a weight, ioprio or ioprio_class change /
205	int prio_changed;
206
207	#ifdef CONFIG_BFQ_GROUP_IOSCHED
208	/ flag, set if the entity is counted in groups_with_pending_reqs /
209	bool in_groups_with_pending_reqs;
210	#endif
211
212	/ last child queue of entity created (for non-leaf entities) /
213	struct bfq_queue *last_bfqq_created;
214	};
215
216	struct bfq_group;
217
218	/**
219	* struct bfq_ttime - per process thinktime stats.
220	*/
221	struct bfq_ttime {
222	/ completion time of the last request /
223	u64 last_end_request;
224
225	/ total process thinktime /
226	u64 ttime_total;
227	/ number of thinktime samples /
228	unsigned long ttime_samples;
229	/ average process thinktime /
230	u64 ttime_mean;
231	};
232
233	/**
234	* struct bfq_queue - leaf schedulable entity.
235	*
236	* A bfq_queue is a leaf request queue; it can be associated with an
237	* io_context or more, if it is async or shared between cooperating
238	* processes. Besides, it contains I/O requests for only one actuator
239	* (an io_context is associated with a different bfq_queue for each
240	* actuator it generates I/O for). @cgroup holds a reference to the
241	* cgroup, to be sure that it does not disappear while a bfqq still
242	* references it (mostly to avoid races between request issuing and
243	* task migration followed by cgroup destruction). All the fields are
244	* protected by the queue lock of the containing bfqd.
245	*/
246	struct bfq_queue {
247	/ reference counter /
248	int ref;
249	/ counter of references from other queues for delayed stable merge /
250	int stable_ref;
251	/ parent bfq_data /
252	struct bfq_data *bfqd;
253
254	/ current ioprio and ioprio class /
255	unsigned short ioprio, ioprio_class;
256	/ next ioprio and ioprio class if a change is in progress /
257	unsigned short new_ioprio, new_ioprio_class;
258
259	/ last total-service-time sample, see bfq_update_inject_limit() /
260	u64 last_serv_time_ns;
261	/ limit for request injection /
262	unsigned int inject_limit;
263	/ last time the inject limit has been decreased, in jiffies /
264	unsigned long decrease_time_jif;
265
266	/*
267	* Shared bfq_queue if queue is cooperating with one or more
268	* other queues.
269	*/
270	struct bfq_queue *new_bfqq;
271	/ request-position tree member (see bfq_group's @rq_pos_tree) /
272	struct rb_node pos_node;
273	/ request-position tree root (see bfq_group's @rq_pos_tree) /
274	struct rb_root *pos_root;
275
276	/ sorted list of pending requests /
277	struct rb_root sort_list;
278	/ if fifo isn't expired, next request to serve /
279	struct request *next_rq;
280	/ number of sync and async requests queued /
281	int queued[`2`];
282	/ number of pending metadata requests /
283	int meta_pending;
284	/ fifo list of requests in sort_list /
285	struct list_head fifo;
286
287	/ entity representing this queue in the scheduler /
288	struct bfq_entity entity;
289
290	/ pointer to the weight counter associated with this entity /
291	struct bfq_weight_counter *weight_counter;
292
293	/ maximum budget allowed from the feedback mechanism /
294	int max_budget;
295	/ budget expiration (in jiffies) /
296	unsigned long budget_timeout;
297
298	/ number of requests on the dispatch list or inside driver /
299	int dispatched;
300
301	/ status flags /
302	unsigned long flags;
303
304	/ node for active/idle bfqq list inside parent bfqd /
305	struct list_head bfqq_list;
306
307	/ associated @bfq_ttime struct /
308	struct bfq_ttime ttime;
309
310	/ when bfqq started to do I/O within the last observation window /
311	u64 io_start_time;
312	/ how long bfqq has remained empty during the last observ. window /
313	u64 tot_idle_time;
314
315	/ bit vector: a 1 for each seeky requests in history /
316	u32 seek_history;
317
318	/ node for the device's burst list /
319	struct hlist_node burst_list_node;
320
321	/ position of the last request enqueued /
322	sector_t last_request_pos;
323
324	/ Number of consecutive pairs of request completion and*
325	* arrival, such that the queue becomes idle after the
326	* completion, but the next request arrives within an idle
327	* time slice; used only if the queue's IO_bound flag has been
328	* cleared.
329	*/
330	unsigned int requests_within_timer;
331
332	/ pid of the process owning the queue, used for logging purposes /
333	pid_t pid;
334
335	/*
336	* Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
337	* if the queue is shared.
338	*/
339	struct bfq_io_cq *bic;
340
341	/ current maximum weight-raising time for this queue /
342	unsigned long wr_cur_max_time;
343	/*
344	* Minimum time instant such that, only if a new request is
345	* enqueued after this time instant in an idle @bfq_queue with
346	* no outstanding requests, then the task associated with the
347	* queue it is deemed as soft real-time (see the comments on
348	* the function bfq_bfqq_softrt_next_start())
349	*/
350	unsigned long soft_rt_next_start;
351	/*
352	* Start time of the current weight-raising period if
353	* the @bfq-queue is being weight-raised, otherwise
354	* finish time of the last weight-raising period.
355	*/
356	unsigned long last_wr_start_finish;
357	/ factor by which the weight of this queue is multiplied /
358	unsigned int wr_coeff;
359	/*
360	* Time of the last transition of the @bfq_queue from idle to
361	* backlogged.
362	*/
363	unsigned long last_idle_bklogged;
364	/*
365	* Cumulative service received from the @bfq_queue since the
366	* last transition from idle to backlogged.
367	*/
368	unsigned long service_from_backlogged;
369	/*
370	* Cumulative service received from the @bfq_queue since its
371	* last transition to weight-raised state.
372	*/
373	unsigned long service_from_wr;
374
375	/*
376	* Value of wr start time when switching to soft rt
377	*/
378	unsigned long wr_start_at_switch_to_srt;
379
380	unsigned long split_time; / time of last split /
381
382	unsigned long first_IO_time; / time of first I/O for this queue /
383	unsigned long creation_time; / when this queue is created /
384
385	/*
386	* Pointer to the waker queue for this queue, i.e., to the
387	* queue Q such that this queue happens to get new I/O right
388	* after some I/O request of Q is completed. For details, see
389	* the comments on the choice of the queue for injection in
390	* bfq_select_queue().
391	*/
392	struct bfq_queue *waker_bfqq;
393	/ pointer to the curr. tentative waker queue, see bfq_check_waker() /
394	struct bfq_queue *tentative_waker_bfqq;
395	/ number of times the same tentative waker has been detected /
396	unsigned int num_waker_detections;
397	/ time when we started considering this waker /
398	u64 waker_detection_started;
399
400	/ node for woken_list, see below /
401	struct hlist_node woken_list_node;
402	/*
403	* Head of the list of the woken queues for this queue, i.e.,
404	* of the list of the queues for which this queue is a waker
405	* queue. This list is used to reset the waker_bfqq pointer in
406	* the woken queues when this queue exits.
407	*/
408	struct hlist_head woken_list;
409
410	/ index of the actuator this queue is associated with /
411	unsigned int actuator_idx;
412	};
413
414	/**
415	* struct bfq_data - bfqq data unique and persistent for associated bfq_io_cq
416	*/
417	struct bfq_iocq_bfqq_data {
418	/*
419	* Snapshot of the has_short_time flag before merging; taken
420	* to remember its values while the queue is merged, so as to
421	* be able to restore it in case of split.
422	*/
423	bool saved_has_short_ttime;
424	/*
425	* Same purpose as the previous two fields for the I/O bound
426	* classification of a queue.
427	*/
428	bool saved_IO_bound;
429
430	u64 saved_io_start_time;
431	u64 saved_tot_idle_time;
432
433	/*
434	* Same purpose as the previous fields for the values of the
435	* field keeping the queue's belonging to a large burst
436	*/
437	bool saved_in_large_burst;
438	/*
439	* True if the queue belonged to a burst list before its merge
440	* with another cooperating queue.
441	*/
442	bool was_in_burst_list;
443
444	/*
445	* Save the weight when a merge occurs, to be able
446	* to restore it in case of split. If the weight is not
447	* correctly resumed when the queue is recycled,
448	* then the weight of the recycled queue could differ
449	* from the weight of the original queue.
450	*/
451	unsigned int saved_weight;
452
453	/*
454	* Similar to previous fields: save wr information.
455	*/
456	unsigned long saved_wr_coeff;
457	unsigned long saved_last_wr_start_finish;
458	unsigned long saved_service_from_wr;
459	unsigned long saved_wr_start_at_switch_to_srt;
460	unsigned int saved_wr_cur_max_time;
461	struct bfq_ttime saved_ttime;
462
463	/ Save also injection state /
464	u64 saved_last_serv_time_ns;
465	unsigned int saved_inject_limit;
466	unsigned long saved_decrease_time_jif;
467
468	/ candidate queue for a stable merge (due to close creation time) /
469	struct bfq_queue *stable_merge_bfqq;
470
471	bool stably_merged; / non splittable if true /
472	};
473
474	/**
475	* struct bfq_io_cq - per (request_queue, io_context) structure.
476	*/
477	struct bfq_io_cq {
478	/ associated io_cq structure /
479	struct io_cq icq; / must be the first member /
480	/*
481	* Matrix of associated process queues: first row for async
482	* queues, second row sync queues. Each row contains one
483	* column for each actuator. An I/O request generated by the
484	* process is inserted into the queue pointed by bfqq[i][j] if
485	* the request is to be served by the j-th actuator of the
486	* drive, where i==0 or i==1, depending on whether the request
487	* is async or sync. So there is a distinct queue for each
488	* actuator.
489	*/
490	struct bfq_queue *bfqq[`2`][BFQ_MAX_ACTUATORS];
491	/ per (request_queue, blkcg) ioprio /
492	int ioprio;
493	#ifdef CONFIG_BFQ_GROUP_IOSCHED
494	uint64_t blkcg_serial_nr; / the current blkcg serial /
495	#endif
496
497	/*
498	* Persistent data for associated synchronous process queues
499	* (one queue per actuator, see field bfqq above). In
500	* particular, each of these queues may undergo a merge.
501	*/
502	struct bfq_iocq_bfqq_data bfqq_data[BFQ_MAX_ACTUATORS];
503
504	unsigned int requests; / Number of requests this process has in flight /
505	};
506
507	/**
508	* struct bfq_data - per-device data structure.
509	*
510	* All the fields are protected by @lock.
511	*/
512	struct bfq_data {
513	/ device request queue /
514	struct request_queue *queue;
515	/ dispatch queue /
516	struct list_head dispatch;
517
518	/ root bfq_group for the device /
519	struct bfq_group *root_group;
520
521	/*
522	* rbtree of weight counters of @bfq_queues, sorted by
523	* weight. Used to keep track of whether all @bfq_queues have
524	* the same weight. The tree contains one counter for each
525	* distinct weight associated to some active and not
526	* weight-raised @bfq_queue (see the comments to the functions
527	* bfq_weights_tree_[add\|remove] for further details).
528	*/
529	struct rb_root_cached queue_weights_tree;
530
531	#ifdef CONFIG_BFQ_GROUP_IOSCHED
532	/*
533	* Number of groups with at least one process that
534	* has at least one request waiting for completion. Note that
535	* this accounts for also requests already dispatched, but not
536	* yet completed. Therefore this number of groups may differ
537	* (be larger) than the number of active groups, as a group is
538	* considered active only if its corresponding entity has
539	* queues with at least one request queued. This
540	* number is used to decide whether a scenario is symmetric.
541	* For a detailed explanation see comments on the computation
542	* of the variable asymmetric_scenario in the function
543	* bfq_better_to_idle().
544	*
545	* However, it is hard to compute this number exactly, for
546	* groups with multiple processes. Consider a group
547	* that is inactive, i.e., that has no process with
548	* pending I/O inside BFQ queues. Then suppose that
549	* num_groups_with_pending_reqs is still accounting for this
550	* group, because the group has processes with some
551	* I/O request still in flight. num_groups_with_pending_reqs
552	* should be decremented when the in-flight request of the
553	* last process is finally completed (assuming that
554	* nothing else has changed for the group in the meantime, in
555	* terms of composition of the group and active/inactive state of child
556	* groups and processes). To accomplish this, an additional
557	* pending-request counter must be added to entities, and must
558	* be updated correctly. To avoid this additional field and operations,
559	* we resort to the following tradeoff between simplicity and
560	* accuracy: for an inactive group that is still counted in
561	* num_groups_with_pending_reqs, we decrement
562	* num_groups_with_pending_reqs when the first
563	* process of the group remains with no request waiting for
564	* completion.
565	*
566	* Even this simpler decrement strategy requires a little
567	* carefulness: to avoid multiple decrements, we flag a group,
568	* more precisely an entity representing a group, as still
569	* counted in num_groups_with_pending_reqs when it becomes
570	* inactive. Then, when the first queue of the
571	* entity remains with no request waiting for completion,
572	* num_groups_with_pending_reqs is decremented, and this flag
573	* is reset. After this flag is reset for the entity,
574	* num_groups_with_pending_reqs won't be decremented any
575	* longer in case a new queue of the entity remains
576	* with no request waiting for completion.
577	*/
578	unsigned int num_groups_with_pending_reqs;
579	#endif
580
581	/*
582	* Per-class (RT, BE, IDLE) number of bfq_queues containing
583	* requests (including the queue in service, even if it is
584	* idling).
585	*/
586	unsigned int busy_queues[`3`];
587	/ number of weight-raised busy @bfq_queues /
588	int wr_busy_queues;
589	/ number of queued requests /
590	int queued;
591	/ number of requests dispatched and waiting for completion /
592	int tot_rq_in_driver;
593	/*
594	* number of requests dispatched and waiting for completion
595	* for each actuator
596	*/
597	int rq_in_driver[BFQ_MAX_ACTUATORS];
598
599	/ true if the device is non rotational and performs queueing /
600	bool nonrot_with_queueing;
601
602	/*
603	* Maximum number of requests in driver in the last
604	* @hw_tag_samples completed requests.
605	*/
606	int max_rq_in_driver;
607	/ number of samples used to calculate hw_tag /
608	int hw_tag_samples;
609	/ flag set to one if the driver is showing a queueing behavior /
610	int hw_tag;
611
612	/ number of budgets assigned /
613	int budgets_assigned;
614
615	/*
616	* Timer set when idling (waiting) for the next request from
617	* the queue in service.
618	*/
619	struct hrtimer idle_slice_timer;
620
621	/ bfq_queue in service /
622	struct bfq_queue *in_service_queue;
623
624	/ on-disk position of the last served request /
625	sector_t last_position;
626
627	/ position of the last served request for the in-service queue /
628	sector_t in_serv_last_pos;
629
630	/ time of last request completion (ns) /
631	u64 last_completion;
632
633	/ bfqq owning the last completed rq /
634	struct bfq_queue *last_completed_rq_bfqq;
635
636	/ last bfqq created, among those in the root group /
637	struct bfq_queue *last_bfqq_created;
638
639	/ time of last transition from empty to non-empty (ns) /
640	u64 last_empty_occupied_ns;
641
642	/*
643	* Flag set to activate the sampling of the total service time
644	* of a just-arrived first I/O request (see
645	* bfq_update_inject_limit()). This will cause the setting of
646	* waited_rq when the request is finally dispatched.
647	*/
648	bool wait_dispatch;
649	/*
650	* If set, then bfq_update_inject_limit() is invoked when
651	* waited_rq is eventually completed.
652	*/
653	struct request *waited_rq;
654	/*
655	* True if some request has been injected during the last service hole.
656	*/
657	bool rqs_injected;
658
659	/ time of first rq dispatch in current observation interval (ns) /
660	u64 first_dispatch;
661	/ time of last rq dispatch in current observation interval (ns) /
662	u64 last_dispatch;
663
664	/ beginning of the last budget /
665	ktime_t last_budget_start;
666	/ beginning of the last idle slice /
667	ktime_t last_idling_start;
668	unsigned long last_idling_start_jiffies;
669
670	/ number of samples in current observation interval /
671	int peak_rate_samples;
672	/ num of samples of seq dispatches in current observation interval /
673	u32 sequential_samples;
674	/ total num of sectors transferred in current observation interval /
675	u64 tot_sectors_dispatched;
676	/ max rq size seen during current observation interval (sectors) /
677	u32 last_rq_max_size;
678	/ time elapsed from first dispatch in current observ. interval (us) /
679	u64 delta_from_first;
680	/*
681	* Current estimate of the device peak rate, measured in
682	* [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by
683	* BFQ_RATE_SHIFT is performed to increase precision in
684	* fixed-point calculations.
685	*/
686	u32 peak_rate;
687
688	/ maximum budget allotted to a bfq_queue before rescheduling /
689	int bfq_max_budget;
690
691	/*
692	* List of all the bfq_queues active for a specific actuator
693	* on the device. Keeping active queues separate on a
694	* per-actuator basis helps implementing per-actuator
695	* injection more efficiently.
696	*/
697	struct list_head active_list[BFQ_MAX_ACTUATORS];
698	/ list of all the bfq_queues idle on the device /
699	struct list_head idle_list;
700
701	/*
702	* Timeout for async/sync requests; when it fires, requests
703	* are served in fifo order.
704	*/
705	u64 bfq_fifo_expire[`2`];
706	/ weight of backward seeks wrt forward ones /
707	unsigned int bfq_back_penalty;
708	/ maximum allowed backward seek /
709	unsigned int bfq_back_max;
710	/ maximum idling time /
711	u32 bfq_slice_idle;
712
713	/ user-configured max budget value (0 for auto-tuning) /
714	int bfq_user_max_budget;
715	/*
716	* Timeout for bfq_queues to consume their budget; used to
717	* prevent seeky queues from imposing long latencies to
718	* sequential or quasi-sequential ones (this also implies that
719	* seeky queues cannot receive guarantees in the service
720	* domain; after a timeout they are charged for the time they
721	* have been in service, to preserve fairness among them, but
722	* without service-domain guarantees).
723	*/
724	unsigned int bfq_timeout;
725
726	/*
727	* Force device idling whenever needed to provide accurate
728	* service guarantees, without caring about throughput
729	* issues. CAVEAT: this may even increase latencies, in case
730	* of useless idling for processes that did stop doing I/O.
731	*/
732	bool strict_guarantees;
733
734	/*
735	* Last time at which a queue entered the current burst of
736	* queues being activated shortly after each other; for more
737	* details about this and the following parameters related to
738	* a burst of activations, see the comments on the function
739	* bfq_handle_burst.
740	*/
741	unsigned long last_ins_in_burst;
742	/*
743	* Reference time interval used to decide whether a queue has
744	* been activated shortly after @last_ins_in_burst.
745	*/
746	unsigned long bfq_burst_interval;
747	/ number of queues in the current burst of queue activations /
748	int burst_size;
749
750	/ common parent entity for the queues in the burst /
751	struct bfq_entity *burst_parent_entity;
752	/ Maximum burst size above which the current queue-activation*
753	* burst is deemed as 'large'.
754	*/
755	unsigned long bfq_large_burst_thresh;
756	/ true if a large queue-activation burst is in progress /
757	bool large_burst;
758	/*
759	* Head of the burst list (as for the above fields, more
760	* details in the comments on the function bfq_handle_burst).
761	*/
762	struct hlist_head burst_list;
763
764	/ if set to true, low-latency heuristics are enabled /
765	bool low_latency;
766	/*
767	* Maximum factor by which the weight of a weight-raised queue
768	* is multiplied.
769	*/
770	unsigned int bfq_wr_coeff;
771
772	/ Maximum weight-raising duration for soft real-time processes /
773	unsigned int bfq_wr_rt_max_time;
774	/*
775	* Minimum idle period after which weight-raising may be
776	* reactivated for a queue (in jiffies).
777	*/
778	unsigned int bfq_wr_min_idle_time;
779	/*
780	* Minimum period between request arrivals after which
781	* weight-raising may be reactivated for an already busy async
782	* queue (in jiffies).
783	*/
784	unsigned long bfq_wr_min_inter_arr_async;
785
786	/ Max service-rate for a soft real-time queue, in sectors/sec /
787	unsigned int bfq_wr_max_softrt_rate;
788	/*
789	* Cached value of the product ref_rate*ref_wr_duration, used
790	* for computing the maximum duration of weight raising
791	* automatically.
792	*/
793	u64 rate_dur_prod;
794
795	/ fallback dummy bfqq for extreme OOM conditions /
796	struct bfq_queue oom_bfqq;
797
798	spinlock_t lock;
799
800	/*
801	* bic associated with the task issuing current bio for
802	* merging. This and the next field are used as a support to
803	* be able to perform the bic lookup, needed by bio-merge
804	* functions, before the scheduler lock is taken, and thus
805	* avoid taking the request-queue lock while the scheduler
806	* lock is being held.
807	*/
808	struct bfq_io_cq *bio_bic;
809	/ bfqq associated with the task issuing current bio for merging /
810	struct bfq_queue *bio_bfqq;
811
812	/*
813	* Depth limits used in bfq_limit_depth (see comments on the
814	* function)
815	*/
816	unsigned int word_depths[`2`][`2`];
817	unsigned int full_depth_shift;
818
819	/*
820	* Number of independent actuators. This is equal to 1 in
821	* case of single-actuator drives.
822	*/
823	unsigned int num_actuators;
824	/*
825	* Disk independent access ranges for each actuator
826	* in this device.
827	*/
828	sector_t sector[BFQ_MAX_ACTUATORS];
829	sector_t nr_sectors[BFQ_MAX_ACTUATORS];
830	struct blk_independent_access_range ia_ranges[BFQ_MAX_ACTUATORS];
831
832	/*
833	* If the number of I/O requests queued in the device for a
834	* given actuator is below next threshold, then the actuator
835	* is deemed as underutilized. If this condition is found to
836	* hold for some actuator upon a dispatch, but (i) the
837	* in-service queue does not contain I/O for that actuator,
838	* while (ii) some other queue does contain I/O for that
839	* actuator, then the head I/O request of the latter queue is
840	* returned (injected), instead of the head request of the
841	* currently in-service queue.
842	*
843	* We set the threshold, empirically, to the minimum possible
844	* value for which an actuator is fully utilized, or close to
845	* be fully utilized. By doing so, injected I/O 'steals' as
846	* few drive-queue slots as possibile to the in-service
847	* queue. This reduces as much as possible the probability
848	* that the service of I/O from the in-service bfq_queue gets
849	* delayed because of slot exhaustion, i.e., because all the
850	* slots of the drive queue are filled with I/O injected from
851	* other queues (NCQ provides for 32 slots).
852	*/
853	unsigned int actuator_load_threshold;
854	};
855
/*
 * Per-queue state flags. For each enumerator BFQQF_<name> the header
 * also declares accessor prototypes through BFQ_BFQQ_FNS(<name>).
 */
enum bfqq_state_flags {
	BFQQF_just_created = 0,	/* queue just allocated */
	BFQQF_busy,		/* has requests or is in service */
	BFQQF_wait_request,	/* waiting for a request */
	BFQQF_non_blocking_wait_rq, /*
				     * waiting for a request
				     * without idling the device
				     */
	BFQQF_fifo_expire,	/* FIFO checked in this slice */
	BFQQF_has_short_ttime,	/* queue has a short think time */
	BFQQF_sync,		/* synchronous queue */
	BFQQF_IO_bound,		/*
				 * bfqq has timed-out at least once
				 * having consumed at most 2/10 of
				 * its budget
				 */
	BFQQF_in_large_burst,	/*
				 * bfqq activated in a large burst,
				 * see comments to bfq_handle_burst.
				 */
	BFQQF_softrt_update,	/*
				 * may need softrt-next-start
				 * update
				 */
	BFQQF_coop,		/* bfqq is shared */
	BFQQF_split_coop,	/* shared bfqq will be split */
};

/*
 * Declare the prototypes of the three per-flag helpers
 * bfq_mark_bfqq_<name>(), bfq_clear_bfqq_<name>() and
 * bfq_bfqq_<name>(); there is one invocation below for each
 * enumerator of enum bfqq_state_flags.
 *
 * The expansion deliberately lacks a trailing semicolon: every
 * invocation supplies its own, so no empty file-scope declaration
 * (a stray ';') is left behind.
 */
#define BFQ_BFQQ_FNS(name)						\
void bfq_mark_bfqq_##name(struct bfq_queue *bfqq);			\
void bfq_clear_bfqq_##name(struct bfq_queue *bfqq);			\
int bfq_bfqq_##name(const struct bfq_queue *bfqq)

BFQ_BFQQ_FNS(just_created);
BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(has_short_ttime);
BFQ_BFQQ_FNS(sync);
BFQ_BFQQ_FNS(IO_bound);
BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
#undef BFQ_BFQQ_FNS

/* Expiration reasons. */
enum bfqq_expiration {
	BFQQE_TOO_IDLE = 0,		/*
					 * queue has been idling for
					 * too long
					 */
	BFQQE_BUDGET_TIMEOUT,	/* budget took too long to be used */
	BFQQE_BUDGET_EXHAUSTED,	/* budget consumed */
	BFQQE_NO_MORE_REQUESTS,	/* the queue has no more requests */
	BFQQE_PREEMPTED		/* preemption in progress */
};

915	struct bfq_stat {
916	struct percpu_counter cpu_cnt;
917	atomic64_t aux_cnt;
918	};
919
920	struct bfqg_stats {
921	/ basic stats /
922	struct blkg_rwstat bytes;
923	struct blkg_rwstat ios;
924	#ifdef CONFIG_BFQ_CGROUP_DEBUG
925	/ number of ios merged /
926	struct blkg_rwstat merged;
927	/ total time spent on device in ns, may not be accurate w/ queueing /
928	struct blkg_rwstat service_time;
929	/ total time spent waiting in scheduler queue in ns /
930	struct blkg_rwstat wait_time;
931	/ number of IOs queued up /
932	struct blkg_rwstat queued;
933	/ total disk time and nr sectors dispatched by this group /
934	struct bfq_stat time;
935	/ sum of number of ios queued across all samples /
936	struct bfq_stat avg_queue_size_sum;
937	/ count of samples taken for average /
938	struct bfq_stat avg_queue_size_samples;
939	/ how many times this group has been removed from service tree /
940	struct bfq_stat dequeue;
941	/ total time spent waiting for it to be assigned a timeslice. /
942	struct bfq_stat group_wait_time;
943	/ time spent idling for this blkcg_gq /
944	struct bfq_stat idle_time;
945	/ total time with empty current active q with other requests queued /
946	struct bfq_stat empty_time;
947	/ fields after this shouldn't be cleared on stat reset /
948	u64 start_group_wait_time;
949	u64 start_idle_time;
950	u64 start_empty_time;
951	uint16_t flags;
952	#endif /* CONFIG_BFQ_CGROUP_DEBUG */
953	};
954
955	#ifdef CONFIG_BFQ_GROUP_IOSCHED
956
957	/*
958	* struct bfq_group_data - per-blkcg storage for the blkio subsystem.
959	*
960	* @ps: @blkcg_policy_storage that this structure inherits
961	* @weight: weight of the bfq_group
962	*/
963	struct bfq_group_data {
964	/ must be the first member /
965	struct blkcg_policy_data pd;
966
967	unsigned int weight;
968	};
969
970	/**
971	* struct bfq_group - per (device, cgroup) data structure.
972	* @entity: schedulable entity to insert into the parent group sched_data.
973	* @sched_data: own sched_data, to contain child entities (they may be
974	* both bfq_queues and bfq_groups).
975	* @bfqd: the bfq_data for the device this group acts upon.
976	* @async_bfqq: array of async queues for all the tasks belonging to
977	* the group, one queue per ioprio value per ioprio_class,
978	* except for the idle class that has only one queue.
979	* @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
980	* @my_entity: pointer to @entity, %NULL for the toplevel group; used
981	* to avoid too many special cases during group creation/
982	* migration.
983	* @stats: stats for this bfqg.
984	* @active_entities: number of active entities belonging to the group;
985	* unused for the root group. Used to know whether there
986	* are groups with more than one active @bfq_entity
987	* (see the comments to the function
988	* bfq_bfqq_may_idle()).
989	* @rq_pos_tree: rbtree sorted by next_request position, used when
990	* determining if two or more queues have interleaving
991	* requests (see bfq_find_close_cooperator()).
992	*
993	* Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
994	* there is a set of bfq_groups, each one collecting the lower-level
995	* entities belonging to the group that are acting on the same device.
996	*
997	* Locking works as follows:
998	* o @bfqd is protected by the queue lock, RCU is used to access it
999	* from the readers.
1000	* o All the other fields are protected by the @bfqd queue lock.
1001	*/
1002	struct bfq_group {
1003	/ must be the first member /
1004	struct blkg_policy_data pd;
1005
1006	/ cached path for this blkg (see comments in bfq_bic_update_cgroup) /
1007	char blkg_path[`128`];
1008
1009	/ reference counter (see comments in bfq_bic_update_cgroup) /
1010	refcount_t ref;
1011
1012	struct bfq_entity entity;
1013	struct bfq_sched_data sched_data;
1014
1015	struct bfq_data *bfqd;
1016
1017	struct bfq_queue *async_bfqq[`2`][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS];
1018	struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS];
1019
1020	struct bfq_entity *my_entity;
1021
1022	int active_entities;
1023	int num_queues_with_pending_reqs;
1024
1025	struct rb_root rq_pos_tree;
1026
1027	struct bfqg_stats stats;
1028	};
1029
1030	#else
1031	struct bfq_group {
1032	struct bfq_entity entity;
1033	struct bfq_sched_data sched_data;
1034
1035	struct bfq_queue *async_bfqq[`2`][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS];
1036	struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS];
1037
1038	struct rb_root rq_pos_tree;
1039	};
1040	#endif
1041
/* --------------- main algorithm interface ----------------- */

/*
 * Compound-literal initializer for an empty service tree: both rbtree
 * roots empty, both idle-entity pointers NULL, remaining members zero.
 * The initializer is positional, so it must follow the member order of
 * struct bfq_service_tree.
 */
#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

extern const int bfq_timeout;

struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync,
			      unsigned int actuator_idx);
void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync,
		  unsigned int actuator_idx);
struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_weights_tree_add(struct bfq_queue *bfqq);
void bfq_weights_tree_remove(struct bfq_queue *bfqq);
void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		     bool compensate, enum bfqq_expiration reason);
void bfq_put_queue(struct bfq_queue *bfqq);
void bfq_put_cooperator(struct bfq_queue *bfqq);
void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_schedule_dispatch(struct bfq_data *bfqd);
void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);

/* ------------ end of main algorithm interface -------------- */
1067
1068	/ ---------------- cgroups-support interface ---------------- /
1069
1070	void bfqg_stats_update_legacy_io(struct request_queue q, struct* request *rq);
1071	void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf);
1072	void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf);
1073	void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
1074	u64 io_start_time_ns, blk_opf_t opf);
1075	void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
1076	void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg);
1077	void bfq_bfqq_move(struct bfq_data bfqd, struct* bfq_queue *bfqq,
1078	struct bfq_group *bfqg);
1079
1080	#ifdef CONFIG_BFQ_CGROUP_DEBUG
1081	void bfqg_stats_update_io_add(struct bfq_group bfqg, struct* bfq_queue *bfqq,
1082	blk_opf_t opf);
1083	void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
1084	void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
1085	void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg);
1086	#endif
1087
1088	void bfq_init_entity(struct bfq_entity entity, struct* bfq_group *bfqg);
1089	void bfq_bic_update_cgroup(struct bfq_io_cq bic, struct* bio *bio);
1090	void bfq_end_wr_async(struct bfq_data *bfqd);
1091	struct bfq_group bfq_bio_bfqg(struct* bfq_data bfqd, struct* bio *bio);
1092	struct blkcg_gq bfqg_to_blkg(struct* bfq_group *bfqg);
1093	struct bfq_group bfqq_group(struct* bfq_queue *bfqq);
1094	struct bfq_group bfq_create_group_hierarchy(struct* bfq_data bfqd, int* node);
1095	void bfqg_and_blkg_put(struct bfq_group *bfqg);
1096
1097	#ifdef CONFIG_BFQ_GROUP_IOSCHED
1098	extern struct cftype bfq_blkcg_legacy_files[];
1099	extern struct cftype bfq_blkg_files[];
1100	extern struct blkcg_policy blkcg_policy_bfq;
1101	#endif
1102
1103	/ ------------- end of cgroups-support interface ------------- /
1104
/* - interface of the internal hierarchical B-WF2Q+ scheduler - */

#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* both next loops stop at one of the child entities of the root group */
#define for_each_entity(entity)	\
	for (; entity ; entity = entity->parent)

/*
 * For each iteration, compute parent in advance, so as to be safe if
 * entity is deallocated during the iteration. Such a deallocation may
 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
 * containing entity.
 */
#define for_each_entity_safe(entity, parent) \
	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)

#else /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Next two macros are fake loops when cgroups support is not
 * enabled. In fact, in such a case, there is only one level to go up
 * (to reach the root group). Both run their body at most once and
 * leave entity set to NULL afterwards.
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = NULL)

#define for_each_entity_safe(entity, parent) \
	for (parent = NULL; entity ; entity = parent)
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
1133
struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
struct bfq_entity *bfq_entity_of(struct rb_node *node);
unsigned short bfq_ioprio_to_weight(int ioprio);
void bfq_put_idle_entity(struct bfq_service_tree *st,
			 struct bfq_entity *entity);
struct bfq_service_tree *
__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
				struct bfq_entity *entity,
				bool update_class_too);
void bfq_bfqq_served(struct bfq_queue *bfqq, int served);
void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  unsigned long time_ms);
bool __bfq_deactivate_entity(struct bfq_entity *entity,
			     bool ins_into_idle_tree);
bool next_queue_may_preempt(struct bfq_data *bfqd);
struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd);
bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			 bool ins_into_idle_tree, bool expiration);
void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
		      bool expiration);
void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration);
void bfq_add_bfqq_busy(struct bfq_queue *bfqq);
void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);

/* --------------- end of interface of B-WF2Q+ ---------------- */
1164
1165	/ Logging facilities. /
1166	static inline void bfq_bfqq_name(struct bfq_queue bfqq, char* str, int* len)
1167	{
1168	char type = bfq_bfqq_sync(bfqq) ? `'S'` : `'A'`;
1169
1170	if (bfqq->pid != -`1`)
1171	snprintf(buf: str, size: len, fmt: "bfq%d%c", bfqq->pid, type);
1172	else
1173	snprintf(buf: str, size: len, fmt: "bfqSHARED-%c", type);
1174	}
1175
#ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_group *bfqq_group(struct bfq_queue *bfqq);

/*
 * Emit a trace message prefixed with the name of @bfqq (as built by
 * bfq_bfqq_name()), attributed to the cgroup of the group @bfqq belongs
 * to. Nothing is formatted when blk_trace_note_message_enabled() says
 * messages are off for the queue of @bfqd.
 */
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
	char pid_str[MAX_BFQQ_NAME_LENGTH];				\
	if (likely(!blk_trace_note_message_enabled((bfqd)->queue)))	\
		break;							\
	bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH);		\
	blk_add_cgroup_trace_msg((bfqd)->queue,				\
			&bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css,	\
			"%s " fmt, pid_str, ##args);			\
} while (0)

/* Emit a trace message attributed to the cgroup of the group @bfqg. */
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
	blk_add_cgroup_trace_msg((bfqd)->queue,				\
		&bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args);		\
} while (0)

#else /* CONFIG_BFQ_GROUP_IOSCHED */

/*
 * Without cgroups support the per-queue message carries only the queue
 * name prefix, and per-group logging expands to nothing.
 */
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do {	\
	char pid_str[MAX_BFQQ_NAME_LENGTH];	\
	if (likely(!blk_trace_note_message_enabled((bfqd)->queue)))	\
		break;							\
	bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH);		\
	blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args);	\
} while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)

#endif /* CONFIG_BFQ_GROUP_IOSCHED */
1206
/* Emit a scheduler-wide trace message, prefixed with "bfq ". */
#define bfq_log(bfqd, fmt, args...) \
	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
1209
1210	#endif /* _BFQ_H */
1211

source code of linux/block/bfq-iosched.h