tree.c source code [linux/kernel/rcu/tree.c]

1	// SPDX-License-Identifier: GPL-2.0+
2	/*
3	* Read-Copy Update mechanism for mutual exclusion (tree-based version)
4	*
5	* Copyright IBM Corporation, 2008
6	*
7	* Authors: Dipankar Sarma <dipankar@in.ibm.com>
8	* Manfred Spraul <manfred@colorfullife.com>
9	* Paul E. McKenney <paulmck@linux.ibm.com>
10	*
11	* Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
12	* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
13	*
14	* For detailed explanation of Read-Copy Update mechanism see -
15	* Documentation/RCU
16	*/
17
18	#define pr_fmt(fmt) "rcu: " fmt
19
20	#include <linux/types.h>
21	#include <linux/kernel.h>
22	#include <linux/init.h>
23	#include <linux/spinlock.h>
24	#include <linux/smp.h>
25	#include <linux/rcupdate_wait.h>
26	#include <linux/interrupt.h>
27	#include <linux/sched.h>
28	#include <linux/sched/debug.h>
29	#include <linux/nmi.h>
30	#include <linux/atomic.h>
31	#include <linux/bitops.h>
32	#include <linux/export.h>
33	#include <linux/completion.h>
34	#include <linux/kmemleak.h>
35	#include <linux/moduleparam.h>
36	#include <linux/panic.h>
37	#include <linux/panic_notifier.h>
38	#include <linux/percpu.h>
39	#include <linux/notifier.h>
40	#include <linux/cpu.h>
41	#include <linux/mutex.h>
42	#include <linux/time.h>
43	#include <linux/kernel_stat.h>
44	#include <linux/wait.h>
45	#include <linux/kthread.h>
46	#include <uapi/linux/sched/types.h>
47	#include <linux/prefetch.h>
48	#include <linux/delay.h>
49	#include <linux/random.h>
50	#include <linux/trace_events.h>
51	#include <linux/suspend.h>
52	#include <linux/ftrace.h>
53	#include <linux/tick.h>
54	#include <linux/sysrq.h>
55	#include <linux/kprobes.h>
56	#include <linux/gfp.h>
57	#include <linux/oom.h>
58	#include <linux/smpboot.h>
59	#include <linux/jiffies.h>
60	#include <linux/slab.h>
61	#include <linux/sched/isolation.h>
62	#include <linux/sched/clock.h>
63	#include <linux/vmalloc.h>
64	#include <linux/mm.h>
65	#include <linux/kasan.h>
66	#include <linux/context_tracking.h>
67	#include "../time/tick-internal.h"
68
69	#include "tree.h"
70	#include "rcu.h"
71
/*
 * Force the module-parameter prefix to "rcutree.", dropping any
 * definition of MODULE_PARAM_PREFIX that was made earlier.
 */
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcutree."
76
77	/ Data structures. /
78
79	static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
80	.gpwrap = true,
81	#ifdef CONFIG_RCU_NOCB_CPU
82	.cblist.flags = SEGCBLIST_RCU_CORE,
83	#endif
84	};
85	static struct rcu_state rcu_state = {
86	.level = { &rcu_state.node[`0`] },
87	.gp_state = RCU_GP_IDLE,
88	.gp_seq = (`0UL` - `300UL`) << RCU_SEQ_CTR_SHIFT,
89	.barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
90	.barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock),
91	.name = RCU_NAME,
92	.abbr = RCU_ABBR,
93	.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
94	.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
95	.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
96	};
97
98	/ Dump rcu_node combining tree at boot to verify correct setup. /
99	static bool dump_tree;
100	module_param(dump_tree, bool, `0444`);
101	/ By default, use RCU_SOFTIRQ instead of rcuc kthreads. /
102	static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT);
103	#ifndef CONFIG_PREEMPT_RT
104	module_param(use_softirq, bool, `0444`);
105	#endif
106	/ Control rcu_node-tree auto-balancing at boot time. /
107	static bool rcu_fanout_exact;
108	module_param(rcu_fanout_exact, bool, `0444`);
109	/ Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. /
110	static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
111	module_param(rcu_fanout_leaf, int, `0444`);
112	int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
113	/ Number of rcu_nodes at specified level. /
114	int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
115	int rcu_num_nodes __read_mostly = NUM_RCU_NODES; / Total # rcu_nodes in use. /
116
/*
 * The rcu_scheduler_active variable is initialized to the value
 * RCU_SCHEDULER_INACTIVE and transitions to RCU_SCHEDULER_INIT just
 * before the first task is spawned.  So when this variable is
 * RCU_SCHEDULER_INACTIVE, RCU can assume that there is but one task,
 * allowing RCU to (for example) optimize synchronize_rcu() to a simple
 * barrier().  When this variable is RCU_SCHEDULER_INIT, RCU must
 * actually do all the hard work required to detect real grace periods.
 * This variable is also used to suppress boot-time false positives from
 * lockdep-RCU error checking.  Finally, it transitions from
 * RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU is fully
 * initialized, including all of its kthreads having been spawned.
 */
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
131
/*
 * The rcu_scheduler_fully_active variable transitions from zero to one
 * during the early_initcall() processing, which is after the scheduler
 * is capable of creating new tasks.  So RCU processing (for example,
 * creating tasks for RCU priority boosting) must be delayed until after
 * rcu_scheduler_fully_active transitions from zero to one.  We also
 * currently delay invocation of any RCU callbacks until after this
 * point.
 *
 * It might later prove better for people registering RCU callbacks
 * during early boot to take responsibility for these callbacks, but one
 * step at a time.
 */
static int rcu_scheduler_fully_active __read_mostly;
145
146	static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
147	unsigned long gps, unsigned long flags);
148	static struct task_struct rcu_boost_task(struct* rcu_node *rnp);
149	static void invoke_rcu_core(void);
150	static void rcu_report_exp_rdp(struct rcu_data *rdp);
151	static void sync_sched_exp_online_cleanup(int cpu);
152	static void check_cb_ovld_locked(struct rcu_data rdp, struct* rcu_node *rnp);
153	static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
154	static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
155	static bool rcu_init_invoked(void);
156	static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
157	static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
158
159	/*
160	* rcuc/rcub/rcuop kthread realtime priority. The "rcuop"
161	* real-time priority(enabling/disabling) is controlled by
162	* the extra CONFIG_RCU_NOCB_CPU_CB_BOOST configuration.
163	*/
164	static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? `1` : `0`;
165	module_param(kthread_prio, int, `0444`);
166
167	/ Delay in jiffies for grace-period initialization delays, debug only. /
168
169	static int gp_preinit_delay;
170	module_param(gp_preinit_delay, int, `0444`);
171	static int gp_init_delay;
172	module_param(gp_init_delay, int, `0444`);
173	static int gp_cleanup_delay;
174	module_param(gp_cleanup_delay, int, `0444`);
175
// Add delay to rcu_read_unlock() for strict grace periods.
// Exposed as a module parameter only in CONFIG_RCU_STRICT_GRACE_PERIOD
// kernels.
static int rcu_unlock_delay;
#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD
module_param(rcu_unlock_delay, int, 0444);
#endif
181
182	/*
183	* This rcu parameter is runtime-read-only. It reflects
184	* a minimum allowed number of objects which can be cached
185	* per-CPU. Object size is equal to one page. This value
186	* can be changed at boot time.
187	*/
188	static int rcu_min_cached_objs = `5`;
189	module_param(rcu_min_cached_objs, int, `0444`);
190
191	// A page shrinker can ask for pages to be freed to make them
192	// available for other parts of the system. This usually happens
193	// under low memory conditions, and in that case we should also
194	// defer page-cache filling for a short time period.
195	//
196	// The default value is 5 seconds, which is long enough to reduce
197	// interference with the shrinker while it asks other systems to
198	// drain their caches.
199	static int rcu_delay_page_cache_fill_msec = `5000`;
200	module_param(rcu_delay_page_cache_fill_msec, int, `0444`);
201
202	/ Retrieve RCU kthreads priority for rcutorture /
203	int rcu_get_gp_kthreads_prio(void)
204	{
205	return kthread_prio;
206	}
207	EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
208
/*
 * Number of grace periods between delays, normalized by the duration of
 * the delay.  The longer the delay, the more grace periods elapse
 * between each delay.  The reason for this normalization is that it
 * means that, for non-zero delays, the overall slowdown of grace periods
 * is constant regardless of the duration of the delay.  This arrangement
 * balances the need for long delays to increase some race probabilities
 * with the need for fast grace periods to increase other race
 * probabilities.
 */
#define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays for debugging. */
219
220	/*
221	* Return true if an RCU grace period is in progress. The READ_ONCE()s
222	* permit this function to be invoked without holding the root rcu_node
223	* structure's ->lock, but of course results can be subject to change.
224	*/
225	static int rcu_gp_in_progress(void)
226	{
227	return rcu_seq_state(s: rcu_seq_current(sp: &rcu_state.gp_seq));
228	}
229
230	/*
231	* Return the number of callbacks queued on the specified CPU.
232	* Handles both the nocbs and normal cases.
233	*/
234	static long rcu_get_n_cbs_cpu(int cpu)
235	{
236	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
237
238	if (rcu_segcblist_is_enabled(rsclp: &rdp->cblist))
239	return rcu_segcblist_n_cbs(rsclp: &rdp->cblist);
240	return `0`;
241	}
242
243	void rcu_softirq_qs(void)
244	{
245	rcu_qs();
246	rcu_preempt_deferred_qs(current);
247	rcu_tasks_qs(current, false);
248	}
249
250	/*
251	* Reset the current CPU's ->dynticks counter to indicate that the
252	* newly onlined CPU is no longer in an extended quiescent state.
253	* This will either leave the counter unchanged, or increment it
254	* to the next non-quiescent value.
255	*
256	* The non-atomic test/increment sequence works because the upper bits
257	* of the ->dynticks counter are manipulated only by the corresponding CPU,
258	* or when the corresponding CPU is offline.
259	*/
260	static void rcu_dynticks_eqs_online(void)
261	{
262	if (ct_dynticks() & RCU_DYNTICKS_IDX)
263	return;
264	ct_state_inc(RCU_DYNTICKS_IDX);
265	}
266
/*
 * Take a fully ordered snapshot of the specified CPU's ->dynticks
 * counter, so that the value can be compared reliably against earlier
 * and later snapshots.
 */
static int rcu_dynticks_snap(int cpu)
{
	int snap;

	smp_mb(); // Fundamental RCU ordering guarantee.
	snap = ct_dynticks_cpu_acquire(cpu);
	return snap;
}
276
277	/*
278	* Return true if the snapshot returned from rcu_dynticks_snap()
279	* indicates that RCU is in an extended quiescent state.
280	*/
281	static bool rcu_dynticks_in_eqs(int snap)
282	{
283	return !(snap & RCU_DYNTICKS_IDX);
284	}
285
286	/*
287	* Return true if the CPU corresponding to the specified rcu_data
288	* structure has spent some time in an extended quiescent state since
289	* rcu_dynticks_snap() returned the specified snapshot.
290	*/
291	static bool rcu_dynticks_in_eqs_since(struct rcu_data rdp, int* snap)
292	{
293	return snap != rcu_dynticks_snap(cpu: rdp->cpu);
294	}
295
296	/*
297	* Return true if the referenced integer is zero while the specified
298	* CPU remains within a single extended quiescent state.
299	*/
300	bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
301	{
302	int snap;
303
304	// If not quiescent, force back to earlier extended quiescent state.
305	snap = ct_dynticks_cpu(cpu) & ~RCU_DYNTICKS_IDX;
306	smp_rmb(); // Order ->dynticks and vp reads.*
307	if (READ_ONCE(*vp))
308	return false; // Non-zero, so report failure;
309	smp_rmb(); // Order vp read and ->dynticks re-read.*
310
311	// If still in the same extended quiescent state, we are good!
312	return snap == ct_dynticks_cpu(cpu);
313	}
314
315	/*
316	* Let the RCU core know that this CPU has gone through the scheduler,
317	* which is a quiescent state. This is called when the need for a
318	* quiescent state is urgent, so we burn an atomic operation and full
319	* memory barriers to let the RCU core know about it, regardless of what
320	* this CPU might (or might not) do in the near future.
321	*
322	* We inform the RCU core by emulating a zero-duration dyntick-idle period.
323	*
324	* The caller must have disabled interrupts and must not be idle.
325	*/
326	notrace void rcu_momentary_dyntick_idle(void)
327	{
328	int seq;
329
330	raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
331	seq = ct_state_inc(incby: `2` * RCU_DYNTICKS_IDX);
332	/ It is illegal to call this from idle state. /
333	WARN_ON_ONCE(!(seq & RCU_DYNTICKS_IDX));
334	rcu_preempt_deferred_qs(current);
335	}
336	EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
337
338	/**
339	* rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle
340	*
341	* If the current CPU is idle and running at a first-level (not nested)
342	* interrupt, or directly, from idle, return true.
343	*
344	* The caller must have at least disabled IRQs.
345	*/
346	static int rcu_is_cpu_rrupt_from_idle(void)
347	{
348	long nesting;
349
350	/*
351	* Usually called from the tick; but also used from smp_function_call()
352	* for expedited grace periods. This latter can result in running from
353	* the idle task, instead of an actual IPI.
354	*/
355	lockdep_assert_irqs_disabled();
356
357	/ Check for counter underflows /
358	RCU_LOCKDEP_WARN(ct_dynticks_nesting() < `0`,
359	"RCU dynticks_nesting counter underflow!");
360	RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() <= `0`,
361	"RCU dynticks_nmi_nesting counter underflow/zero!");
362
363	/ Are we at first interrupt nesting level? /
364	nesting = ct_dynticks_nmi_nesting();
365	if (nesting > `1`)
366	return false;
367
368	/*
369	* If we're not in an interrupt, we must be in the idle task!
370	*/
371	WARN_ON_ONCE(!nesting && !is_idle_task(current));
372
373	/ Does CPU appear to be idle from an RCU standpoint? /
374	return ct_dynticks_nesting() == `0`;
375	}
376
377	#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10)
378	// Maximum callbacks per rcu_do_batch ...
379	#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood.
380	static long blimit = DEFAULT_RCU_BLIMIT;
381	#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit.
382	static long qhimark = DEFAULT_RCU_QHIMARK;
383	#define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit.
384	static long qlowmark = DEFAULT_RCU_QLOMARK;
385	#define DEFAULT_RCU_QOVLD_MULT 2
386	#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
387	static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS.
388	static long qovld_calc = -`1`; // No pre-initialization lock acquisitions!
389
390	module_param(blimit, long, `0444`);
391	module_param(qhimark, long, `0444`);
392	module_param(qlowmark, long, `0444`);
393	module_param(qovld, long, `0444`);
394
395	static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? `0` : ULONG_MAX;
396	static ulong jiffies_till_next_fqs = ULONG_MAX;
397	static bool rcu_kick_kthreads;
398	static int rcu_divisor = `7`;
399	module_param(rcu_divisor, int, `0644`);
400
401	/ Force an exit from rcu_do_batch() after 3 milliseconds. /
402	static long rcu_resched_ns = `3` * NSEC_PER_MSEC;
403	module_param(rcu_resched_ns, long, `0644`);
404
405	/*
406	* How long the grace period must be before we start recruiting
407	* quiescent-state help from rcu_note_context_switch().
408	*/
409	static ulong jiffies_till_sched_qs = ULONG_MAX;
410	module_param(jiffies_till_sched_qs, ulong, `0444`);
411	static ulong jiffies_to_sched_qs; / See adjust_jiffies_till_sched_qs(). /
412	module_param(jiffies_to_sched_qs, ulong, `0444`); / Display only! /
413
414	/*
415	* Make sure that we give the grace-period kthread time to detect any
416	* idle CPUs before taking active measures to force quiescent states.
417	* However, don't go below 100 milliseconds, adjusted upwards for really
418	* large systems.
419	*/
420	static void adjust_jiffies_till_sched_qs(void)
421	{
422	unsigned long j;
423
424	/ If jiffies_till_sched_qs was specified, respect the request. /
425	if (jiffies_till_sched_qs != ULONG_MAX) {
426	WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
427	return;
428	}
429	/ Otherwise, set to third fqs scan, but bound below on large system. /
430	j = READ_ONCE(jiffies_till_first_fqs) +
431	`2` * READ_ONCE(jiffies_till_next_fqs);
432	if (j < HZ / `10` + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
433	j = HZ / `10` + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
434	pr_info("RCU calculated value of scheduler-enlistment delay is %ld jiffies.\n", j);
435	WRITE_ONCE(jiffies_to_sched_qs, j);
436	}
437
438	static int param_set_first_fqs_jiffies(const char val, const* struct kernel_param *kp)
439	{
440	ulong j;
441	int ret = kstrtoul(s: val, base: `0`, res: &j);
442
443	if (!ret) {
444	WRITE_ONCE((ulong )kp->arg, (j > HZ) ? HZ : j);
445	adjust_jiffies_till_sched_qs();
446	}
447	return ret;
448	}
449
450	static int param_set_next_fqs_jiffies(const char val, const* struct kernel_param *kp)
451	{
452	ulong j;
453	int ret = kstrtoul(s: val, base: `0`, res: &j);
454
455	if (!ret) {
456	WRITE_ONCE((ulong )kp->arg, (j > HZ) ? HZ : (j ?: `1`));
457	adjust_jiffies_till_sched_qs();
458	}
459	return ret;
460	}
461
462	static const struct kernel_param_ops first_fqs_jiffies_ops = {
463	.set = param_set_first_fqs_jiffies,
464	.get = param_get_ulong,
465	};
466
467	static const struct kernel_param_ops next_fqs_jiffies_ops = {
468	.set = param_set_next_fqs_jiffies,
469	.get = param_get_ulong,
470	};
471
472	module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, `0644`);
473	module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, `0644`);
474	module_param(rcu_kick_kthreads, bool, `0644`);
475
/* Forward declarations; force_qs_rnp() takes a per-rcu_data callback. */
static void force_qs_rnp(int (*f)(struct rcu_data *rdp));
static int rcu_pending(int user);
478
479	/*
480	* Return the number of RCU GPs completed thus far for debug & stats.
481	*/
482	unsigned long rcu_get_gp_seq(void)
483	{
484	return READ_ONCE(rcu_state.gp_seq);
485	}
486	EXPORT_SYMBOL_GPL(rcu_get_gp_seq);
487
488	/*
489	* Return the number of RCU expedited batches completed thus far for
490	* debug & stats. Odd numbers mean that a batch is in progress, even
491	* numbers mean idle. The value returned will thus be roughly double
492	* the cumulative batches since boot.
493	*/
494	unsigned long rcu_exp_batches_completed(void)
495	{
496	return rcu_state.expedited_sequence;
497	}
498	EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
499
500	/*
501	* Return the root node of the rcu_state structure.
502	*/
503	static struct rcu_node rcu_get_root(void*)
504	{
505	return &rcu_state.node[`0`];
506	}
507
508	/*
509	* Send along grace-period-related data for rcutorture diagnostics.
510	*/
511	void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
512	unsigned long *gp_seq)
513	{
514	switch (test_type) {
515	case RCU_FLAVOR:
516	*flags = READ_ONCE(rcu_state.gp_flags);
517	*gp_seq = rcu_seq_current(sp: &rcu_state.gp_seq);
518	break;
519	default:
520	break;
521	}
522	}
523	EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
524
#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
/*
 * An empty function that will trigger a reschedule on
 * IRQ tail once IRQs get re-enabled on userspace/guest resume.
 */
static void late_wakeup_func(struct irq_work *work)
{
}

static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) =
	IRQ_WORK_INIT(late_wakeup_func);

/*
 * If either:
 *
 * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work
 * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry.
 *
 * In these cases the late RCU wake ups aren't supported in the resched loops and our
 * last resort is to fire a local irq_work that will trigger a reschedule once IRQs
 * get re-enabled again.
 */
noinstr void rcu_irq_work_resched(void)
{
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);

	/* Generic entry covers the non-vCPU (user-mode) case. */
	if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU))
		return;

	/* KVM generic work covers the vCPU (guest-mode) case. */
	if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU))
		return;

	instrumentation_begin();
	if (do_nocb_deferred_wakeup(rdp) && need_resched()) {
		irq_work_queue(this_cpu_ptr(&late_wakeup_work));
	}
	instrumentation_end();
}
#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */
564
#ifdef CONFIG_PROVE_RCU
/**
 * rcu_irq_exit_check_preempt - Validate that scheduling is possible
 *
 * Asserts that interrupts are disabled, then complains via
 * RCU_LOCKDEP_WARN() if the dynticks nesting counter is not positive,
 * if the NMI nesting counter differs from DYNTICK_IRQ_NONIDLE, or if
 * the current CPU is in an extended quiescent state.
 */
void rcu_irq_exit_check_preempt(void)
{
	lockdep_assert_irqs_disabled();

	RCU_LOCKDEP_WARN(ct_dynticks_nesting() <= 0,
			 "RCU dynticks_nesting counter underflow/zero!");
	RCU_LOCKDEP_WARN(ct_dynticks_nmi_nesting() !=
			 DYNTICK_IRQ_NONIDLE,
			 "Bad RCU dynticks_nmi_nesting counter\n");
	RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
			 "RCU in extended quiescent state!");
}
#endif /* #ifdef CONFIG_PROVE_RCU */
582
#ifdef CONFIG_NO_HZ_FULL
/**
 * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it.
 *
 * The scheduler tick is not normally enabled when CPUs enter the kernel
 * from nohz_full userspace execution.  After all, nohz_full userspace
 * execution is an RCU quiescent state and the time executing in the kernel
 * is quite short.  Except of course when it isn't.  And it is not hard to
 * cause a large system to spend tens of seconds or even minutes looping
 * in the kernel, which can cause a number of problems, including RCU CPU
 * stall warnings.
 *
 * Therefore, if a nohz_full CPU fails to report a quiescent state
 * in a timely manner, the RCU grace-period kthread sets that CPU's
 * ->rcu_urgent_qs flag with the expectation that the next interrupt or
 * exception will invoke this function, which will turn on the scheduler
 * tick, which will enable RCU to detect that CPU's quiescent states,
 * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels.
 * The tick will be disabled once a quiescent state is reported for
 * this CPU.
 *
 * Of course, in carefully tuned systems, there might never be an
 * interrupt or exception.  In that case, the RCU grace-period kthread
 * will eventually cause one to happen.  However, in less carefully
 * controlled environments, this function allows RCU to get what it
 * needs without creating otherwise useless interruptions.
 */
void __rcu_irq_enter_check_tick(void)
{
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);

	// If we're here from NMI there's nothing to do.
	if (in_nmi())
		return;

	RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
			 "Illegal rcu_irq_enter_check_tick() from extended quiescent state");

	if (!tick_nohz_full_cpu(rdp->cpu) ||
	    !READ_ONCE(rdp->rcu_urgent_qs) ||
	    READ_ONCE(rdp->rcu_forced_tick)) {
		// RCU doesn't need nohz_full help from this CPU, or it is
		// already getting that help.
		return;
	}

	// We get here only when not in an extended quiescent state and
	// from interrupts (as opposed to NMIs).  Therefore, (1) RCU is
	// already watching and (2) The fact that we are in an interrupt
	// handler and that the rcu_node lock is an irq-disabled lock
	// prevents self-deadlock.  So we can safely recheck under the lock.
	// Note that the nohz_full state currently cannot change.
	raw_spin_lock_rcu_node(rdp->mynode);
	if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
		// A nohz_full CPU is in the kernel and RCU needs a
		// quiescent state.  Turn on the tick!
		WRITE_ONCE(rdp->rcu_forced_tick, true);
		tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
	}
	raw_spin_unlock_rcu_node(rdp->mynode);
}
NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick);
#endif /* CONFIG_NO_HZ_FULL */
646
647	/*
648	* Check to see if any future non-offloaded RCU-related work will need
649	* to be done by the current CPU, even if none need be done immediately,
650	* returning 1 if so. This function is part of the RCU implementation;
651	* it is -not- an exported member of the RCU API. This is used by
652	* the idle-entry code to figure out whether it is safe to disable the
653	* scheduler-clock interrupt.
654	*
655	* Just check whether or not this CPU has non-offloaded RCU callbacks
656	* queued.
657	*/
658	int rcu_needs_cpu(void)
659	{
660	return !rcu_segcblist_empty(rsclp: &this_cpu_ptr(&rcu_data)->cblist) &&
661	!rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
662	}
663
664	/*
665	* If any sort of urgency was applied to the current CPU (for example,
666	* the scheduler-clock interrupt was enabled on a nohz_full CPU) in order
667	* to get to a quiescent state, disable it.
668	*/
669	static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
670	{
671	raw_lockdep_assert_held_rcu_node(rdp->mynode);
672	WRITE_ONCE(rdp->rcu_urgent_qs, false);
673	WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
674	if (tick_nohz_full_cpu(cpu: rdp->cpu) && rdp->rcu_forced_tick) {
675	tick_dep_clear_cpu(cpu: rdp->cpu, bit: TICK_DEP_BIT_RCU);
676	WRITE_ONCE(rdp->rcu_forced_tick, false);
677	}
678	}
679
680	/**
681	* rcu_is_watching - RCU read-side critical sections permitted on current CPU?
682	*
683	* Return @true if RCU is watching the running CPU and @false otherwise.
684	* An @true return means that this CPU can safely enter RCU read-side
685	* critical sections.
686	*
687	* Although calls to rcu_is_watching() from most parts of the kernel
688	* will return @true, there are important exceptions. For example, if the
689	* current CPU is deep within its idle loop, in kernel entry/exit code,
690	* or offline, rcu_is_watching() will return @false.
691	*
692	* Make notrace because it can be called by the internal functions of
693	* ftrace, and making this notrace removes unnecessary recursion calls.
694	*/
695	notrace bool rcu_is_watching(void)
696	{
697	bool ret;
698
699	preempt_disable_notrace();
700	ret = !rcu_dynticks_curr_cpu_in_eqs();
701	preempt_enable_notrace();
702	return ret;
703	}
704	EXPORT_SYMBOL_GPL(rcu_is_watching);
705
706	/*
707	* If a holdout task is actually running, request an urgent quiescent
708	* state from its CPU. This is unsynchronized, so migrations can cause
709	* the request to go to the wrong CPU. Which is OK, all that will happen
710	* is that the CPU's next context switch will be a bit slower and next
711	* time around this task will generate another request.
712	*/
713	void rcu_request_urgent_qs_task(struct task_struct *t)
714	{
715	int cpu;
716
717	barrier();
718	cpu = task_cpu(p: t);
719	if (!task_curr(p: t))
720	return; / This task is not running on that CPU. /
721	smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
722	}
723
724	/*
725	* When trying to report a quiescent state on behalf of some other CPU,
726	* it is our responsibility to check for and handle potential overflow
727	* of the rcu_node ->gp_seq counter with respect to the rcu_data counters.
728	* After all, the CPU might be in deep idle state, and thus executing no
729	* code whatsoever.
730	*/
731	static void rcu_gpnum_ovf(struct rcu_node rnp, struct* rcu_data *rdp)
732	{
733	raw_lockdep_assert_held_rcu_node(rnp);
734	if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / `4`,
735	rnp->gp_seq))
736	WRITE_ONCE(rdp->gpwrap, true);
737	if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / `4`, rnp->gp_seq))
738	rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / `4`;
739	}
740
741	/*
742	* Snapshot the specified CPU's dynticks counter so that we can later
743	* credit them with an implicit quiescent state. Return 1 if this CPU
744	* is in dynticks idle mode, which is an extended quiescent state.
745	*/
746	static int dyntick_save_progress_counter(struct rcu_data *rdp)
747	{
748	rdp->dynticks_snap = rcu_dynticks_snap(cpu: rdp->cpu);
749	if (rcu_dynticks_in_eqs(snap: rdp->dynticks_snap)) {
750	trace_rcu_fqs(rcuname: rcu_state.name, gp_seq: rdp->gp_seq, cpu: rdp->cpu, TPS("dti"));
751	rcu_gpnum_ovf(rnp: rdp->mynode, rdp);
752	return `1`;
753	}
754	return `0`;
755	}
756
757	/*
758	* Returns positive if the specified CPU has passed through a quiescent state
759	* by virtue of being in or having passed through an dynticks idle state since
760	* the last call to dyntick_save_progress_counter() for this same CPU, or by
761	* virtue of having been offline.
762	*
763	* Returns negative if the specified CPU needs a force resched.
764	*
765	* Returns zero otherwise.
766	*/
767	static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
768	{
769	unsigned long jtsq;
770	int ret = `0`;
771	struct rcu_node *rnp = rdp->mynode;
772
773	/*
774	* If the CPU passed through or entered a dynticks idle phase with
775	* no active irq/NMI handlers, then we can safely pretend that the CPU
776	* already acknowledged the request to pass through a quiescent
777	* state. Either way, that CPU cannot possibly be in an RCU
778	* read-side critical section that started before the beginning
779	* of the current RCU grace period.
780	*/
781	if (rcu_dynticks_in_eqs_since(rdp, snap: rdp->dynticks_snap)) {
782	trace_rcu_fqs(rcuname: rcu_state.name, gp_seq: rdp->gp_seq, cpu: rdp->cpu, TPS("dti"));
783	rcu_gpnum_ovf(rnp, rdp);
784	return `1`;
785	}
786
787	/*
788	* Complain if a CPU that is considered to be offline from RCU's
789	* perspective has not yet reported a quiescent state. After all,
790	* the offline CPU should have reported a quiescent state during
791	* the CPU-offline process, or, failing that, by rcu_gp_init()
792	* if it ran concurrently with either the CPU going offline or the
793	* last task on a leaf rcu_node structure exiting its RCU read-side
794	* critical section while all CPUs corresponding to that structure
795	* are offline. This added warning detects bugs in any of these
796	* code paths.
797	*
798	* The rcu_node structure's ->lock is held here, which excludes
799	* the relevant portions the CPU-hotplug code, the grace-period
800	* initialization code, and the rcu_read_unlock() code paths.
801	*
802	* For more detail, please refer to the "Hotplug CPU" section
803	* of RCU's Requirements documentation.
804	*/
805	if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) {
806	struct rcu_node *rnp1;
807
808	pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
809	__func__, rnp->grplo, rnp->grphi, rnp->level,
810	(long)rnp->gp_seq, (long)rnp->completedqs);
811	for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
812	pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
813	__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
814	pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
815	__func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
816	(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
817	(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
818	return `1`; / Break things loose after complaining. /
819	}
820
821	/*
822	* A CPU running for an extended time within the kernel can
823	* delay RCU grace periods: (1) At age jiffies_to_sched_qs,
824	* set .rcu_urgent_qs, (2) At age 2*jiffies_to_sched_qs, set
825	* both .rcu_need_heavy_qs and .rcu_urgent_qs. Note that the
826	* unsynchronized assignments to the per-CPU rcu_need_heavy_qs
827	* variable are safe because the assignments are repeated if this
828	* CPU failed to pass through a quiescent state. This code
829	* also checks .jiffies_resched in case jiffies_to_sched_qs
830	* is set way high.
831	*/
832	jtsq = READ_ONCE(jiffies_to_sched_qs);
833	if (!READ_ONCE(rdp->rcu_need_heavy_qs) &&
834	(time_after(jiffies, rcu_state.gp_start + jtsq * `2`) \|\|
835	time_after(jiffies, rcu_state.jiffies_resched) \|\|
836	rcu_state.cbovld)) {
837	WRITE_ONCE(rdp->rcu_need_heavy_qs, true);
838	/ Store rcu_need_heavy_qs before rcu_urgent_qs. /
839	smp_store_release(&rdp->rcu_urgent_qs, true);
840	} else if (time_after(jiffies, rcu_state.gp_start + jtsq)) {
841	WRITE_ONCE(rdp->rcu_urgent_qs, true);
842	}
843
844	/*
845	* NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq!
846	* The above code handles this, but only for straight cond_resched().
847	* And some in-kernel loops check need_resched() before calling
848	* cond_resched(), which defeats the above code for CPUs that are
849	* running in-kernel with scheduling-clock interrupts disabled.
850	* So hit them over the head with the resched_cpu() hammer!
851	*/
852	if (tick_nohz_full_cpu(cpu: rdp->cpu) &&
853	(time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * `3`) \|\|
854	rcu_state.cbovld)) {
855	WRITE_ONCE(rdp->rcu_urgent_qs, true);
856	WRITE_ONCE(rdp->last_fqs_resched, jiffies);
857	ret = -`1`;
858	}
859
860	/*
861	* If more than halfway to RCU CPU stall-warning time, invoke
862	* resched_cpu() more frequently to try to loosen things up a bit.
863	* Also check to see if the CPU is getting hammered with interrupts,
864	* but only once per grace period, just to keep the IPIs down to
865	* a dull roar.
866	*/
867	if (time_after(jiffies, rcu_state.jiffies_resched)) {
868	if (time_after(jiffies,
869	READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
870	WRITE_ONCE(rdp->last_fqs_resched, jiffies);
871	ret = -`1`;
872	}
873	if (IS_ENABLED(CONFIG_IRQ_WORK) &&
874	!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
875	(rnp->ffmask & rdp->grpmask)) {
876	rdp->rcu_iw_pending = true;
877	rdp->rcu_iw_gp_seq = rnp->gp_seq;
878	irq_work_queue_on(work: &rdp->rcu_iw, cpu: rdp->cpu);
879	}
880
881	if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) {
882	int cpu = rdp->cpu;
883	struct rcu_snap_record *rsrp;
884	struct kernel_cpustat *kcsp;
885
886	kcsp = &kcpustat_cpu(cpu);
887
888	rsrp = &rdp->snap_record;
889	rsrp->cputime_irq = kcpustat_field(kcpustat: kcsp, usage: CPUTIME_IRQ, cpu);
890	rsrp->cputime_softirq = kcpustat_field(kcpustat: kcsp, usage: CPUTIME_SOFTIRQ, cpu);
891	rsrp->cputime_system = kcpustat_field(kcpustat: kcsp, usage: CPUTIME_SYSTEM, cpu);
892	rsrp->nr_hardirqs = kstat_cpu_irqs_sum(cpu: rdp->cpu);
893	rsrp->nr_softirqs = kstat_cpu_softirqs_sum(cpu: rdp->cpu);
894	rsrp->nr_csw = nr_context_switches_cpu(cpu: rdp->cpu);
895	rsrp->jiffies = jiffies;
896	rsrp->gp_seq = rdp->gp_seq;
897	}
898	}
899
900	return ret;
901	}
902
903	/ Trace-event wrapper function for trace_rcu_future_grace_period. /
904	static void trace_rcu_this_gp(struct rcu_node rnp, struct* rcu_data *rdp,
905	unsigned long gp_seq_req, const char *s)
906	{
907	trace_rcu_future_grace_period(rcuname: rcu_state.name, READ_ONCE(rnp->gp_seq),
908	gp_seq_req, level: rnp->level,
909	grplo: rnp->grplo, grphi: rnp->grphi, gpevent: s);
910	}
911
912	/*
913	* rcu_start_this_gp - Request the start of a particular grace period
914	* @rnp_start: The leaf node of the CPU from which to start.
915	* @rdp: The rcu_data corresponding to the CPU from which to start.
916	* @gp_seq_req: The gp_seq of the grace period to start.
917	*
918	* Start the specified grace period, as needed to handle newly arrived
919	* callbacks. The required future grace periods are recorded in each
920	* rcu_node structure's ->gp_seq_needed field. Returns true if there
921	* is reason to awaken the grace-period kthread.
922	*
923	* The caller must hold the specified rcu_node structure's ->lock, which
924	* is why the caller is responsible for waking the grace-period kthread.
925	*
926	* Returns true if the GP thread needs to be awakened else false.
927	*/
928	static bool rcu_start_this_gp(struct rcu_node rnp_start, struct* rcu_data *rdp,
929	unsigned long gp_seq_req)
930	{
931	bool ret = false;
932	struct rcu_node *rnp;
933
934	/*
935	* Use funnel locking to either acquire the root rcu_node
936	* structure's lock or bail out if the need for this grace period
937	* has already been recorded -- or if that grace period has in
938	* fact already started. If there is already a grace period in
939	* progress in a non-leaf node, no recording is needed because the
940	* end of the grace period will scan the leaf rcu_node structures.
941	* Note that rnp_start->lock must not be released.
942	*/
943	raw_lockdep_assert_held_rcu_node(rnp_start);
944	trace_rcu_this_gp(rnp: rnp_start, rdp, gp_seq_req, TPS("Startleaf"));
945	for (rnp = rnp_start; `1`; rnp = rnp->parent) {
946	if (rnp != rnp_start)
947	raw_spin_lock_rcu_node(rnp);
948	if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) \|\|
949	rcu_seq_started(sp: &rnp->gp_seq, s: gp_seq_req) \|\|
950	(rnp != rnp_start &&
951	rcu_seq_state(s: rcu_seq_current(sp: &rnp->gp_seq)))) {
952	trace_rcu_this_gp(rnp, rdp, gp_seq_req,
953	TPS("Prestarted"));
954	goto unlock_out;
955	}
956	WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req);
957	if (rcu_seq_state(s: rcu_seq_current(sp: &rnp->gp_seq))) {
958	/*
959	* We just marked the leaf or internal node, and a
960	* grace period is in progress, which means that
961	* rcu_gp_cleanup() will see the marking. Bail to
962	* reduce contention.
963	*/
964	trace_rcu_this_gp(rnp: rnp_start, rdp, gp_seq_req,
965	TPS("Startedleaf"));
966	goto unlock_out;
967	}
968	if (rnp != rnp_start && rnp->parent != NULL)
969	raw_spin_unlock_rcu_node(rnp);
970	if (!rnp->parent)
971	break; / At root, and perhaps also leaf. /
972	}
973
974	/ If GP already in progress, just leave, otherwise start one. /
975	if (rcu_gp_in_progress()) {
976	trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot"));
977	goto unlock_out;
978	}
979	trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot"));
980	WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags \| RCU_GP_FLAG_INIT);
981	WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
982	if (!READ_ONCE(rcu_state.gp_kthread)) {
983	trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread"));
984	goto unlock_out;
985	}
986	trace_rcu_grace_period(rcuname: rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq"));
987	ret = true; / Caller must wake GP kthread. /
988	unlock_out:
989	/ Push furthest requested GP to leaf node and rcu_data structure. /
990	if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) {
991	WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed);
992	WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
993	}
994	if (rnp != rnp_start)
995	raw_spin_unlock_rcu_node(rnp);
996	return ret;
997	}
998
999	/*
1000	* Clean up any old requests for the just-ended grace period. Also return
1001	* whether any additional grace periods have been requested.
1002	*/
1003	static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
1004	{
1005	bool needmore;
1006	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1007
1008	needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed);
1009	if (!needmore)
1010	rnp->gp_seq_needed = rnp->gp_seq; / Avoid counter wrap. /
1011	trace_rcu_this_gp(rnp, rdp, gp_seq_req: rnp->gp_seq,
1012	s: needmore ? TPS("CleanupMore") : TPS("Cleanup"));
1013	return needmore;
1014	}
1015
/*
 * Cross-CPU callback used by swake_up_one_online(): @arg is the
 * swait_queue_head on which to perform the wakeup.
 */
static void swake_up_one_online_ipi(void *arg)
{
	struct swait_queue_head *wqh = arg;

	swake_up_one(wqh);
}
1022
1023	static void swake_up_one_online(struct swait_queue_head *wqh)
1024	{
1025	int cpu = get_cpu();
1026
1027	/*
1028	* If called from rcutree_report_cpu_starting(), wake up
1029	* is dangerous that late in the CPU-down hotplug process. The
1030	* scheduler might queue an ignored hrtimer. Defer the wake up
1031	* to an online CPU instead.
1032	*/
1033	if (unlikely(cpu_is_offline(cpu))) {
1034	int target;
1035
1036	target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU),
1037	cpu_online_mask);
1038
1039	smp_call_function_single(cpuid: target, func: swake_up_one_online_ipi,
1040	info: wqh, wait: `0`);
1041	put_cpu();
1042	} else {
1043	put_cpu();
1044	swake_up_one(q: wqh);
1045	}
1046	}
1047
1048	/*
1049	* Awaken the grace-period kthread. Don't do a self-awaken (unless in an
1050	* interrupt or softirq handler, in which case we just might immediately
1051	* sleep upon return, resulting in a grace-period hang), and don't bother
1052	* awakening when there is nothing for the grace-period kthread to do
1053	* (as in several CPUs raced to awaken, we lost), and finally don't try
1054	* to awaken a kthread that has not yet been created. If all those checks
1055	* are passed, track some debug information and awaken.
1056	*
1057	* So why do the self-wakeup when in an interrupt or softirq handler
1058	* in the grace-period kthread's context? Because the kthread might have
1059	* been interrupted just as it was going to sleep, and just after the final
1060	* pre-sleep check of the awaken condition. In this case, a wakeup really
1061	* is required, and is therefore supplied.
1062	*/
1063	static void rcu_gp_kthread_wake(void)
1064	{
1065	struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
1066
1067	if ((current == t && !in_hardirq() && !in_serving_softirq()) \|\|
1068	!READ_ONCE(rcu_state.gp_flags) \|\| !t)
1069	return;
1070	WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
1071	WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
1072	swake_up_one_online(wqh: &rcu_state.gp_wq);
1073	}
1074
1075	/*
1076	* If there is room, assign a ->gp_seq number to any callbacks on this
1077	* CPU that have not already been assigned. Also accelerate any callbacks
1078	* that were previously assigned a ->gp_seq number that has since proven
1079	* to be too conservative, which can happen if callbacks get assigned a
1080	* ->gp_seq number while RCU is idle, but with reference to a non-root
1081	* rcu_node structure. This function is idempotent, so it does not hurt
1082	* to call it repeatedly. Returns an flag saying that we should awaken
1083	* the RCU grace-period kthread.
1084	*
1085	* The caller must hold rnp->lock with interrupts disabled.
1086	*/
1087	static bool rcu_accelerate_cbs(struct rcu_node rnp, struct* rcu_data *rdp)
1088	{
1089	unsigned long gp_seq_req;
1090	bool ret = false;
1091
1092	rcu_lockdep_assert_cblist_protected(rdp);
1093	raw_lockdep_assert_held_rcu_node(rnp);
1094
1095	/ If no pending (not yet ready to invoke) callbacks, nothing to do. /
1096	if (!rcu_segcblist_pend_cbs(rsclp: &rdp->cblist))
1097	return false;
1098
1099	trace_rcu_segcb_stats(rs: &rdp->cblist, TPS("SegCbPreAcc"));
1100
1101	/*
1102	* Callbacks are often registered with incomplete grace-period
1103	* information. Something about the fact that getting exact
1104	* information requires acquiring a global lock... RCU therefore
1105	* makes a conservative estimate of the grace period number at which
1106	* a given callback will become ready to invoke. The following
1107	* code checks this estimate and improves it when possible, thus
1108	* accelerating callback invocation to an earlier grace-period
1109	* number.
1110	*/
1111	gp_seq_req = rcu_seq_snap(sp: &rcu_state.gp_seq);
1112	if (rcu_segcblist_accelerate(rsclp: &rdp->cblist, seq: gp_seq_req))
1113	ret = rcu_start_this_gp(rnp_start: rnp, rdp, gp_seq_req);
1114
1115	/ Trace depending on how much we were able to accelerate. /
1116	if (rcu_segcblist_restempty(rsclp: &rdp->cblist, RCU_WAIT_TAIL))
1117	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: gp_seq_req, TPS("AccWaitCB"));
1118	else
1119	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: gp_seq_req, TPS("AccReadyCB"));
1120
1121	trace_rcu_segcb_stats(rs: &rdp->cblist, TPS("SegCbPostAcc"));
1122
1123	return ret;
1124	}
1125
1126	/*
1127	* Similar to rcu_accelerate_cbs(), but does not require that the leaf
1128	* rcu_node structure's ->lock be held. It consults the cached value
1129	* of ->gp_seq_needed in the rcu_data structure, and if that indicates
1130	* that a new grace-period request be made, invokes rcu_accelerate_cbs()
1131	* while holding the leaf rcu_node structure's ->lock.
1132	*/
1133	static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
1134	struct rcu_data *rdp)
1135	{
1136	unsigned long c;
1137	bool needwake;
1138
1139	rcu_lockdep_assert_cblist_protected(rdp);
1140	c = rcu_seq_snap(sp: &rcu_state.gp_seq);
1141	if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
1142	/ Old request still live, so mark recent callbacks. /
1143	(void)rcu_segcblist_accelerate(rsclp: &rdp->cblist, seq: c);
1144	return;
1145	}
1146	raw_spin_lock_rcu_node(rnp); / irqs already disabled. /
1147	needwake = rcu_accelerate_cbs(rnp, rdp);
1148	raw_spin_unlock_rcu_node(rnp); / irqs remain disabled. /
1149	if (needwake)
1150	rcu_gp_kthread_wake();
1151	}
1152
1153	/*
1154	* Move any callbacks whose grace period has completed to the
1155	* RCU_DONE_TAIL sublist, then compact the remaining sublists and
1156	* assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL
1157	* sublist. This function is idempotent, so it does not hurt to
1158	* invoke it repeatedly. As long as it is not invoked -too- often...
1159	* Returns true if the RCU grace-period kthread needs to be awakened.
1160	*
1161	* The caller must hold rnp->lock with interrupts disabled.
1162	*/
1163	static bool rcu_advance_cbs(struct rcu_node rnp, struct* rcu_data *rdp)
1164	{
1165	rcu_lockdep_assert_cblist_protected(rdp);
1166	raw_lockdep_assert_held_rcu_node(rnp);
1167
1168	/ If no pending (not yet ready to invoke) callbacks, nothing to do. /
1169	if (!rcu_segcblist_pend_cbs(rsclp: &rdp->cblist))
1170	return false;
1171
1172	/*
1173	* Find all callbacks whose ->gp_seq numbers indicate that they
1174	* are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
1175	*/
1176	rcu_segcblist_advance(rsclp: &rdp->cblist, seq: rnp->gp_seq);
1177
1178	/ Classify any remaining callbacks. /
1179	return rcu_accelerate_cbs(rnp, rdp);
1180	}
1181
1182	/*
1183	* Move and classify callbacks, but only if doing so won't require
1184	* that the RCU grace-period kthread be awakened.
1185	*/
1186	static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
1187	struct rcu_data *rdp)
1188	{
1189	rcu_lockdep_assert_cblist_protected(rdp);
1190	if (!rcu_seq_state(s: rcu_seq_current(sp: &rnp->gp_seq)) \|\| !raw_spin_trylock_rcu_node(rnp))
1191	return;
1192	// The grace period cannot end while we hold the rcu_node lock.
1193	if (rcu_seq_state(s: rcu_seq_current(sp: &rnp->gp_seq)))
1194	WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
1195	raw_spin_unlock_rcu_node(rnp);
1196	}
1197
1198	/*
1199	* In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a
1200	* quiescent state. This is intended to be invoked when the CPU notices
1201	* a new grace period.
1202	*/
1203	static void rcu_strict_gp_check_qs(void)
1204	{
1205	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
1206	rcu_read_lock();
1207	rcu_read_unlock();
1208	}
1209	}
1210
1211	/*
1212	* Update CPU-local rcu_data state to record the beginnings and ends of
1213	* grace periods. The caller must hold the ->lock of the leaf rcu_node
1214	* structure corresponding to the current CPU, and must have irqs disabled.
1215	* Returns true if the grace-period kthread needs to be awakened.
1216	*/
1217	static bool __note_gp_changes(struct rcu_node rnp, struct* rcu_data *rdp)
1218	{
1219	bool ret = false;
1220	bool need_qs;
1221	const bool offloaded = rcu_rdp_is_offloaded(rdp);
1222
1223	raw_lockdep_assert_held_rcu_node(rnp);
1224
1225	if (rdp->gp_seq == rnp->gp_seq)
1226	return false; / Nothing to do. /
1227
1228	/ Handle the ends of any preceding grace periods first. /
1229	if (rcu_seq_completed_gp(old: rdp->gp_seq, new: rnp->gp_seq) \|\|
1230	unlikely(READ_ONCE(rdp->gpwrap))) {
1231	if (!offloaded)
1232	ret = rcu_advance_cbs(rnp, rdp); / Advance CBs. /
1233	rdp->core_needs_qs = false;
1234	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rdp->gp_seq, TPS("cpuend"));
1235	} else {
1236	if (!offloaded)
1237	ret = rcu_accelerate_cbs(rnp, rdp); / Recent CBs. /
1238	if (rdp->core_needs_qs)
1239	rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
1240	}
1241
1242	/ Now handle the beginnings of any new-to-this-CPU grace periods. /
1243	if (rcu_seq_new_gp(old: rdp->gp_seq, new: rnp->gp_seq) \|\|
1244	unlikely(READ_ONCE(rdp->gpwrap))) {
1245	/*
1246	* If the current grace period is waiting for this CPU,
1247	* set up to detect a quiescent state, otherwise don't
1248	* go looking for one.
1249	*/
1250	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rnp->gp_seq, TPS("cpustart"));
1251	need_qs = !!(rnp->qsmask & rdp->grpmask);
1252	rdp->cpu_no_qs.b.norm = need_qs;
1253	rdp->core_needs_qs = need_qs;
1254	zero_cpu_stall_ticks(rdp);
1255	}
1256	rdp->gp_seq = rnp->gp_seq; / Remember new grace-period state. /
1257	if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) \|\| rdp->gpwrap)
1258	WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
1259	if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap))
1260	WRITE_ONCE(rdp->last_sched_clock, jiffies);
1261	WRITE_ONCE(rdp->gpwrap, false);
1262	rcu_gpnum_ovf(rnp, rdp);
1263	return ret;
1264	}
1265
1266	static void note_gp_changes(struct rcu_data *rdp)
1267	{
1268	unsigned long flags;
1269	bool needwake;
1270	struct rcu_node *rnp;
1271
1272	local_irq_save(flags);
1273	rnp = rdp->mynode;
1274	if ((rdp->gp_seq == rcu_seq_current(sp: &rnp->gp_seq) &&
1275	!unlikely(READ_ONCE(rdp->gpwrap))) \|\| / w/out lock. /
1276	!raw_spin_trylock_rcu_node(rnp)) { / irqs already off, so later. /
1277	local_irq_restore(flags);
1278	return;
1279	}
1280	needwake = __note_gp_changes(rnp, rdp);
1281	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1282	rcu_strict_gp_check_qs();
1283	if (needwake)
1284	rcu_gp_kthread_wake();
1285	}
1286
1287	static atomic_t *rcu_gp_slow_suppress;
1288
1289	/ Register a counter to suppress debugging grace-period delays. /
1290	void rcu_gp_slow_register(atomic_t *rgssp)
1291	{
1292	WARN_ON_ONCE(rcu_gp_slow_suppress);
1293
1294	WRITE_ONCE(rcu_gp_slow_suppress, rgssp);
1295	}
1296	EXPORT_SYMBOL_GPL(rcu_gp_slow_register);
1297
1298	/ Unregister a counter, with NULL for not caring which. /
1299	void rcu_gp_slow_unregister(atomic_t *rgssp)
1300	{
1301	WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL);
1302
1303	WRITE_ONCE(rcu_gp_slow_suppress, NULL);
1304	}
1305	EXPORT_SYMBOL_GPL(rcu_gp_slow_unregister);
1306
1307	static bool rcu_gp_slow_is_suppressed(void)
1308	{
1309	atomic_t *rgssp = READ_ONCE(rcu_gp_slow_suppress);
1310
1311	return rgssp && atomic_read(v: rgssp);
1312	}
1313
1314	static void rcu_gp_slow(int delay)
1315	{
1316	if (!rcu_gp_slow_is_suppressed() && delay > `0` &&
1317	!(rcu_seq_ctr(s: rcu_state.gp_seq) % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
1318	schedule_timeout_idle(timeout: delay);
1319	}
1320
1321	static unsigned long sleep_duration;
1322
1323	/ Allow rcutorture to stall the grace-period kthread. /
1324	void rcu_gp_set_torture_wait(int duration)
1325	{
1326	if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > `0`)
1327	WRITE_ONCE(sleep_duration, duration);
1328	}
1329	EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait);
1330
1331	/ Actually implement the aforementioned wait. /
1332	static void rcu_gp_torture_wait(void)
1333	{
1334	unsigned long duration;
1335
1336	if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST))
1337	return;
1338	duration = xchg(&sleep_duration, `0UL`);
1339	if (duration > `0`) {
1340	pr_alert("%s: Waiting %lu jiffies\n", __func__, duration);
1341	schedule_timeout_idle(timeout: duration);
1342	pr_alert("%s: Wait complete\n", __func__);
1343	}
1344	}
1345
/*
 * on_each_cpu() callback: invoke RCU core processing on whichever CPU
 * runs this handler.  The argument is ignored.
 */
static void rcu_strict_gp_boundary(void *unused)
{
	invoke_rcu_core();
}
1354
1355	// Make the polled API aware of the beginning of a grace period.
1356	static void rcu_poll_gp_seq_start(unsigned long *snap)
1357	{
1358	struct rcu_node *rnp = rcu_get_root();
1359
1360	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
1361	raw_lockdep_assert_held_rcu_node(rnp);
1362
1363	// If RCU was idle, note beginning of GP.
1364	if (!rcu_seq_state(s: rcu_state.gp_seq_polled))
1365	rcu_seq_start(sp: &rcu_state.gp_seq_polled);
1366
1367	// Either way, record current state.
1368	*snap = rcu_state.gp_seq_polled;
1369	}
1370
1371	// Make the polled API aware of the end of a grace period.
1372	static void rcu_poll_gp_seq_end(unsigned long *snap)
1373	{
1374	struct rcu_node *rnp = rcu_get_root();
1375
1376	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
1377	raw_lockdep_assert_held_rcu_node(rnp);
1378
1379	// If the previously noted GP is still in effect, record the
1380	// end of that GP. Either way, zero counter to avoid counter-wrap
1381	// problems.
1382	if (snap && snap == rcu_state.gp_seq_polled) {
1383	rcu_seq_end(sp: &rcu_state.gp_seq_polled);
1384	rcu_state.gp_seq_polled_snap = `0`;
1385	rcu_state.gp_seq_polled_exp_snap = `0`;
1386	} else {
1387	*snap = `0`;
1388	}
1389	}
1390
1391	// Make the polled API aware of the beginning of a grace period, but
1392	// where caller does not hold the root rcu_node structure's lock.
1393	static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
1394	{
1395	unsigned long flags;
1396	struct rcu_node *rnp = rcu_get_root();
1397
1398	if (rcu_init_invoked()) {
1399	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
1400	lockdep_assert_irqs_enabled();
1401	raw_spin_lock_irqsave_rcu_node(rnp, flags);
1402	}
1403	rcu_poll_gp_seq_start(snap);
1404	if (rcu_init_invoked())
1405	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1406	}
1407
1408	// Make the polled API aware of the end of a grace period, but where
1409	// caller does not hold the root rcu_node structure's lock.
1410	static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
1411	{
1412	unsigned long flags;
1413	struct rcu_node *rnp = rcu_get_root();
1414
1415	if (rcu_init_invoked()) {
1416	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
1417	lockdep_assert_irqs_enabled();
1418	raw_spin_lock_irqsave_rcu_node(rnp, flags);
1419	}
1420	rcu_poll_gp_seq_end(snap);
1421	if (rcu_init_invoked())
1422	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1423	}
1424
1425	/*
1426	* Initialize a new grace period. Return false if no grace period required.
1427	*/
1428	static noinline_for_stack bool rcu_gp_init(void)
1429	{
1430	unsigned long flags;
1431	unsigned long oldmask;
1432	unsigned long mask;
1433	struct rcu_data *rdp;
1434	struct rcu_node *rnp = rcu_get_root();
1435
1436	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1437	raw_spin_lock_irq_rcu_node(rnp);
1438	if (!READ_ONCE(rcu_state.gp_flags)) {
1439	/ Spurious wakeup, tell caller to go back to sleep. /
1440	raw_spin_unlock_irq_rcu_node(rnp);
1441	return false;
1442	}
1443	WRITE_ONCE(rcu_state.gp_flags, `0`); / Clear all flags: New GP. /
1444
1445	if (WARN_ON_ONCE(rcu_gp_in_progress())) {
1446	/*
1447	* Grace period already in progress, don't start another.
1448	* Not supposed to be able to happen.
1449	*/
1450	raw_spin_unlock_irq_rcu_node(rnp);
1451	return false;
1452	}
1453
1454	/ Advance to a new grace period and initialize state. /
1455	record_gp_stall_check_time();
1456	/ Record GP times before starting GP, hence rcu_seq_start(). /
1457	rcu_seq_start(sp: &rcu_state.gp_seq);
1458	ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
1459	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq, TPS("start"));
1460	rcu_poll_gp_seq_start(snap: &rcu_state.gp_seq_polled_snap);
1461	raw_spin_unlock_irq_rcu_node(rnp);
1462
1463	/*
1464	* Apply per-leaf buffered online and offline operations to
1465	* the rcu_node tree. Note that this new grace period need not
1466	* wait for subsequent online CPUs, and that RCU hooks in the CPU
1467	* offlining path, when combined with checks in this function,
1468	* will handle CPUs that are currently going offline or that will
1469	* go offline later. Please also refer to "Hotplug CPU" section
1470	* of RCU's Requirements documentation.
1471	*/
1472	WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
1473	/ Exclude CPU hotplug operations. /
1474	rcu_for_each_leaf_node(rnp) {
1475	local_irq_save(flags);
1476	arch_spin_lock(&rcu_state.ofl_lock);
1477	raw_spin_lock_rcu_node(rnp);
1478	if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
1479	!rnp->wait_blkd_tasks) {
1480	/ Nothing to do on this leaf rcu_node structure. /
1481	raw_spin_unlock_rcu_node(rnp);
1482	arch_spin_unlock(&rcu_state.ofl_lock);
1483	local_irq_restore(flags);
1484	continue;
1485	}
1486
1487	/ Record old state, apply changes to ->qsmaskinit field. /
1488	oldmask = rnp->qsmaskinit;
1489	rnp->qsmaskinit = rnp->qsmaskinitnext;
1490
1491	/ If zero-ness of ->qsmaskinit changed, propagate up tree. /
1492	if (!oldmask != !rnp->qsmaskinit) {
1493	if (!oldmask) { / First online CPU for rcu_node. /
1494	if (!rnp->wait_blkd_tasks) / Ever offline? /
1495	rcu_init_new_rnp(rnp_leaf: rnp);
1496	} else if (rcu_preempt_has_tasks(rnp)) {
1497	rnp->wait_blkd_tasks = true; / blocked tasks /
1498	} else { / Last offline CPU and can propagate. /
1499	rcu_cleanup_dead_rnp(rnp_leaf: rnp);
1500	}
1501	}
1502
1503	/*
1504	* If all waited-on tasks from prior grace period are
1505	* done, and if all this rcu_node structure's CPUs are
1506	* still offline, propagate up the rcu_node tree and
1507	* clear ->wait_blkd_tasks. Otherwise, if one of this
1508	* rcu_node structure's CPUs has since come back online,
1509	* simply clear ->wait_blkd_tasks.
1510	*/
1511	if (rnp->wait_blkd_tasks &&
1512	(!rcu_preempt_has_tasks(rnp) \|\| rnp->qsmaskinit)) {
1513	rnp->wait_blkd_tasks = false;
1514	if (!rnp->qsmaskinit)
1515	rcu_cleanup_dead_rnp(rnp_leaf: rnp);
1516	}
1517
1518	raw_spin_unlock_rcu_node(rnp);
1519	arch_spin_unlock(&rcu_state.ofl_lock);
1520	local_irq_restore(flags);
1521	}
1522	rcu_gp_slow(delay: gp_preinit_delay); / Races with CPU hotplug. /
1523
1524	/*
1525	* Set the quiescent-state-needed bits in all the rcu_node
1526	* structures for all currently online CPUs in breadth-first
1527	* order, starting from the root rcu_node structure, relying on the
1528	* layout of the tree within the rcu_state.node[] array. Note that
1529	* other CPUs will access only the leaves of the hierarchy, thus
1530	* seeing that no grace period is in progress, at least until the
1531	* corresponding leaf node has been initialized.
1532	*
1533	* The grace period cannot complete until the initialization
1534	* process finishes, because this kthread handles both.
1535	*/
1536	WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT);
1537	rcu_for_each_node_breadth_first(rnp) {
1538	rcu_gp_slow(delay: gp_init_delay);
1539	raw_spin_lock_irqsave_rcu_node(rnp, flags);
1540	rdp = this_cpu_ptr(&rcu_data);
1541	rcu_preempt_check_blocked_tasks(rnp);
1542	rnp->qsmask = rnp->qsmaskinit;
1543	WRITE_ONCE(rnp->gp_seq, rcu_state.gp_seq);
1544	if (rnp == rdp->mynode)
1545	(void)__note_gp_changes(rnp, rdp);
1546	rcu_preempt_boost_start_gp(rnp);
1547	trace_rcu_grace_period_init(rcuname: rcu_state.name, gp_seq: rnp->gp_seq,
1548	level: rnp->level, grplo: rnp->grplo,
1549	grphi: rnp->grphi, qsmask: rnp->qsmask);
1550	/ Quiescent states for tasks on any now-offline CPUs. /
1551	mask = rnp->qsmask & ~rnp->qsmaskinitnext;
1552	rnp->rcu_gp_init_mask = mask;
1553	if ((mask \|\| rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp))
1554	rcu_report_qs_rnp(mask, rnp, gps: rnp->gp_seq, flags);
1555	else
1556	raw_spin_unlock_irq_rcu_node(rnp);
1557	cond_resched_tasks_rcu_qs();
1558	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1559	}
1560
1561	// If strict, make all CPUs aware of new grace period.
1562	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
1563	on_each_cpu(func: rcu_strict_gp_boundary, NULL, wait: `0`);
1564
1565	return true;
1566	}
1567
1568	/*
1569	* Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
1570	* time.
1571	*/
1572	static bool rcu_gp_fqs_check_wake(int *gfp)
1573	{
1574	struct rcu_node *rnp = rcu_get_root();
1575
1576	// If under overload conditions, force an immediate FQS scan.
1577	if (*gfp & RCU_GP_FLAG_OVLD)
1578	return true;
1579
1580	// Someone like call_rcu() requested a force-quiescent-state scan.
1581	*gfp = READ_ONCE(rcu_state.gp_flags);
1582	if (*gfp & RCU_GP_FLAG_FQS)
1583	return true;
1584
1585	// The current grace period has completed.
1586	if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
1587	return true;
1588
1589	return false;
1590	}
1591
1592	/*
1593	* Do one round of quiescent-state forcing.
1594	*/
1595	static void rcu_gp_fqs(bool first_time)
1596	{
1597	int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall);
1598	struct rcu_node *rnp = rcu_get_root();
1599
1600	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1601	WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + `1`);
1602
1603	WARN_ON_ONCE(nr_fqs > `3`);
1604	/ Only countdown nr_fqs for stall purposes if jiffies moves. /
1605	if (nr_fqs) {
1606	if (nr_fqs == `1`) {
1607	WRITE_ONCE(rcu_state.jiffies_stall,
1608	jiffies + rcu_jiffies_till_stall_check());
1609	}
1610	WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs);
1611	}
1612
1613	if (first_time) {
1614	/ Collect dyntick-idle snapshots. /
1615	force_qs_rnp(f: dyntick_save_progress_counter);
1616	} else {
1617	/ Handle dyntick-idle and offline CPUs. /
1618	force_qs_rnp(f: rcu_implicit_dynticks_qs);
1619	}
1620	/ Clear flag to prevent immediate re-entry. /
1621	if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
1622	raw_spin_lock_irq_rcu_node(rnp);
1623	WRITE_ONCE(rcu_state.gp_flags,
1624	READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS);
1625	raw_spin_unlock_irq_rcu_node(rnp);
1626	}
1627	}
1628
1629	/*
1630	* Loop doing repeated quiescent-state forcing until the grace period ends.
1631	*/
1632	static noinline_for_stack void rcu_gp_fqs_loop(void)
1633	{
1634	bool first_gp_fqs = true;
1635	int gf = `0`;
1636	unsigned long j;
1637	int ret;
1638	struct rcu_node *rnp = rcu_get_root();
1639
1640	j = READ_ONCE(jiffies_till_first_fqs);
1641	if (rcu_state.cbovld)
1642	gf = RCU_GP_FLAG_OVLD;
1643	ret = `0`;
1644	for (;;) {
1645	if (rcu_state.cbovld) {
1646	j = (j + `2`) / `3`;
1647	if (j <= `0`)
1648	j = `1`;
1649	}
1650	if (!ret \|\| time_before(jiffies + j, rcu_state.jiffies_force_qs)) {
1651	WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
1652	/*
1653	* jiffies_force_qs before RCU_GP_WAIT_FQS state
1654	* update; required for stall checks.
1655	*/
1656	smp_wmb();
1657	WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
1658	jiffies + (j ? `3` * j : `2`));
1659	}
1660	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq,
1661	TPS("fqswait"));
1662	WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
1663	(void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq,
1664	rcu_gp_fqs_check_wake(&gf), j);
1665	rcu_gp_torture_wait();
1666	WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
1667	/ Locking provides needed memory barriers. /
1668	/*
1669	* Exit the loop if the root rcu_node structure indicates that the grace period
1670	* has ended, leave the loop. The rcu_preempt_blocked_readers_cgp(rnp) check
1671	* is required only for single-node rcu_node trees because readers blocking
1672	* the current grace period are queued only on leaf rcu_node structures.
1673	* For multi-node trees, checking the root node's ->qsmask suffices, because a
1674	* given root node's ->qsmask bit is cleared only when all CPUs and tasks from
1675	* the corresponding leaf nodes have passed through their quiescent state.
1676	*/
1677	if (!READ_ONCE(rnp->qsmask) &&
1678	!rcu_preempt_blocked_readers_cgp(rnp))
1679	break;
1680	/ If time for quiescent-state forcing, do it. /
1681	if (!time_after(rcu_state.jiffies_force_qs, jiffies) \|\|
1682	(gf & (RCU_GP_FLAG_FQS \| RCU_GP_FLAG_OVLD))) {
1683	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq,
1684	TPS("fqsstart"));
1685	rcu_gp_fqs(first_time: first_gp_fqs);
1686	gf = `0`;
1687	if (first_gp_fqs) {
1688	first_gp_fqs = false;
1689	gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : `0`;
1690	}
1691	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq,
1692	TPS("fqsend"));
1693	cond_resched_tasks_rcu_qs();
1694	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1695	ret = `0`; / Force full wait till next FQS. /
1696	j = READ_ONCE(jiffies_till_next_fqs);
1697	} else {
1698	/ Deal with stray signal. /
1699	cond_resched_tasks_rcu_qs();
1700	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1701	WARN_ON(signal_pending(current));
1702	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq,
1703	TPS("fqswaitsig"));
1704	ret = `1`; / Keep old FQS timing. /
1705	j = jiffies;
1706	if (time_after(jiffies, rcu_state.jiffies_force_qs))
1707	j = `1`;
1708	else
1709	j = rcu_state.jiffies_force_qs - j;
1710	gf = `0`;
1711	}
1712	}
1713	}
1714
1715	/*
1716	* Clean up after the old grace period.
1717	*/
1718	static noinline void rcu_gp_cleanup(void)
1719	{
1720	int cpu;
1721	bool needgp = false;
1722	unsigned long gp_duration;
1723	unsigned long new_gp_seq;
1724	bool offloaded;
1725	struct rcu_data *rdp;
1726	struct rcu_node *rnp = rcu_get_root();
1727	struct swait_queue_head *sq;
1728
1729	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1730	raw_spin_lock_irq_rcu_node(rnp);
1731	rcu_state.gp_end = jiffies;
1732	gp_duration = rcu_state.gp_end - rcu_state.gp_start;
1733	if (gp_duration > rcu_state.gp_max)
1734	rcu_state.gp_max = gp_duration;
1735
1736	/*
1737	* We know the grace period is complete, but to everyone else
1738	* it appears to still be ongoing. But it is also the case
1739	* that to everyone else it looks like there is nothing that
1740	* they can do to advance the grace period. It is therefore
1741	* safe for us to drop the lock in order to mark the grace
1742	* period as completed in all of the rcu_node structures.
1743	*/
1744	rcu_poll_gp_seq_end(snap: &rcu_state.gp_seq_polled_snap);
1745	raw_spin_unlock_irq_rcu_node(rnp);
1746
1747	/*
1748	* Propagate new ->gp_seq value to rcu_node structures so that
1749	* other CPUs don't have to wait until the start of the next grace
1750	* period to process their callbacks. This also avoids some nasty
1751	* RCU grace-period initialization races by forcing the end of
1752	* the current grace period to be completely recorded in all of
1753	* the rcu_node structures before the beginning of the next grace
1754	* period is recorded in any of the rcu_node structures.
1755	*/
1756	new_gp_seq = rcu_state.gp_seq;
1757	rcu_seq_end(sp: &new_gp_seq);
1758	rcu_for_each_node_breadth_first(rnp) {
1759	raw_spin_lock_irq_rcu_node(rnp);
1760	if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
1761	dump_blkd_tasks(rnp, ncheck: `10`);
1762	WARN_ON_ONCE(rnp->qsmask);
1763	WRITE_ONCE(rnp->gp_seq, new_gp_seq);
1764	if (!rnp->parent)
1765	smp_mb(); // Order against failing poll_state_synchronize_rcu_full().
1766	rdp = this_cpu_ptr(&rcu_data);
1767	if (rnp == rdp->mynode)
1768	needgp = __note_gp_changes(rnp, rdp) \|\| needgp;
1769	/ smp_mb() provided by prior unlock-lock pair. /
1770	needgp = rcu_future_gp_cleanup(rnp) \|\| needgp;
1771	// Reset overload indication for CPUs no longer overloaded
1772	if (rcu_is_leaf_node(rnp))
1773	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) {
1774	rdp = per_cpu_ptr(&rcu_data, cpu);
1775	check_cb_ovld_locked(rdp, rnp);
1776	}
1777	sq = rcu_nocb_gp_get(rnp);
1778	raw_spin_unlock_irq_rcu_node(rnp);
1779	rcu_nocb_gp_cleanup(sq);
1780	cond_resched_tasks_rcu_qs();
1781	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1782	rcu_gp_slow(delay: gp_cleanup_delay);
1783	}
1784	rnp = rcu_get_root();
1785	raw_spin_lock_irq_rcu_node(rnp); / GP before ->gp_seq update. /
1786
1787	/ Declare grace period done, trace first to use old GP number. /
1788	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq, TPS("end"));
1789	rcu_seq_end(sp: &rcu_state.gp_seq);
1790	ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
1791	WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE);
1792	/ Check for GP requests since above loop. /
1793	rdp = this_cpu_ptr(&rcu_data);
1794	if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) {
1795	trace_rcu_this_gp(rnp, rdp, gp_seq_req: rnp->gp_seq_needed,
1796	TPS("CleanupMore"));
1797	needgp = true;
1798	}
1799	/ Advance CBs to reduce false positives below. /
1800	offloaded = rcu_rdp_is_offloaded(rdp);
1801	if ((offloaded \|\| !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
1802
1803	// We get here if a grace period was needed (“needgp”)
1804	// and the above call to rcu_accelerate_cbs() did not set
1805	// the RCU_GP_FLAG_INIT bit in ->gp_state (which records
1806	// the need for another grace period). The purpose
1807	// of the “offloaded” check is to avoid invoking
1808	// rcu_accelerate_cbs() on an offloaded CPU because we do not
1809	// hold the ->nocb_lock needed to safely access an offloaded
1810	// ->cblist. We do not want to acquire that lock because
1811	// it can be heavily contended during callback floods.
1812
1813	WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
1814	WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
1815	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq, TPS("newreq"));
1816	} else {
1817
1818	// We get here either if there is no need for an
1819	// additional grace period or if rcu_accelerate_cbs() has
1820	// already set the RCU_GP_FLAG_INIT bit in ->gp_flags.
1821	// So all we need to do is to clear all of the other
1822	// ->gp_flags bits.
1823
1824	WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & RCU_GP_FLAG_INIT);
1825	}
1826	raw_spin_unlock_irq_rcu_node(rnp);
1827
1828	// If strict, make all CPUs aware of the end of the old grace period.
1829	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
1830	on_each_cpu(func: rcu_strict_gp_boundary, NULL, wait: `0`);
1831	}
1832
1833	/*
1834	* Body of kthread that handles grace periods.
1835	*/
1836	static int __noreturn rcu_gp_kthread(void *unused)
1837	{
1838	rcu_bind_gp_kthread();
1839	for (;;) {
1840
1841	/ Handle grace-period start. /
1842	for (;;) {
1843	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq,
1844	TPS("reqwait"));
1845	WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS);
1846	swait_event_idle_exclusive(rcu_state.gp_wq,
1847	READ_ONCE(rcu_state.gp_flags) &
1848	RCU_GP_FLAG_INIT);
1849	rcu_gp_torture_wait();
1850	WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS);
1851	/ Locking provides needed memory barrier. /
1852	if (rcu_gp_init())
1853	break;
1854	cond_resched_tasks_rcu_qs();
1855	WRITE_ONCE(rcu_state.gp_activity, jiffies);
1856	WARN_ON(signal_pending(current));
1857	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rcu_state.gp_seq,
1858	TPS("reqwaitsig"));
1859	}
1860
1861	/ Handle quiescent-state forcing. /
1862	rcu_gp_fqs_loop();
1863
1864	/ Handle grace-period end. /
1865	WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP);
1866	rcu_gp_cleanup();
1867	WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED);
1868	}
1869	}
1870
1871	/*
1872	* Report a full set of quiescent states to the rcu_state data structure.
1873	* Invoke rcu_gp_kthread_wake() to awaken the grace-period kthread if
1874	* another grace period is required. Whether we wake the grace-period
1875	* kthread or it awakens itself for the next round of quiescent-state
1876	* forcing, that kthread will clean up after the just-completed grace
1877	* period. Note that the caller must hold rnp->lock, which is released
1878	* before return.
1879	*/
1880	static void rcu_report_qs_rsp(unsigned long flags)
1881	__releases(rcu_get_root()->lock)
1882	{
1883	raw_lockdep_assert_held_rcu_node(rcu_get_root());
1884	WARN_ON_ONCE(!rcu_gp_in_progress());
1885	WRITE_ONCE(rcu_state.gp_flags,
1886	READ_ONCE(rcu_state.gp_flags) \| RCU_GP_FLAG_FQS);
1887	raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags);
1888	rcu_gp_kthread_wake();
1889	}
1890
1891	/*
1892	* Similar to rcu_report_qs_rdp(), for which it is a helper function.
1893	* Allows quiescent states for a group of CPUs to be reported at one go
1894	* to the specified rcu_node structure, though all the CPUs in the group
1895	* must be represented by the same rcu_node structure (which need not be a
1896	* leaf rcu_node structure, though it often will be). The gps parameter
1897	* is the grace-period snapshot, which means that the quiescent states
1898	* are valid only if rnp->gp_seq is equal to gps. That structure's lock
1899	* must be held upon entry, and it is released before return.
1900	*
1901	* As a special case, if mask is zero, the bit-already-cleared check is
1902	* disabled. This allows propagating quiescent state due to resumed tasks
1903	* during grace-period initialization.
1904	*/
1905	static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
1906	unsigned long gps, unsigned long flags)
1907	__releases(rnp->lock)
1908	{
1909	unsigned long oldmask = `0`;
1910	struct rcu_node *rnp_c;
1911
1912	raw_lockdep_assert_held_rcu_node(rnp);
1913
1914	/ Walk up the rcu_node hierarchy. /
1915	for (;;) {
1916	if ((!(rnp->qsmask & mask) && mask) \|\| rnp->gp_seq != gps) {
1917
1918	/*
1919	* Our bit has already been cleared, or the
1920	* relevant grace period is already over, so done.
1921	*/
1922	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1923	return;
1924	}
1925	WARN_ON_ONCE(oldmask); / Any child must be all zeroed! /
1926	WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
1927	rcu_preempt_blocked_readers_cgp(rnp));
1928	WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask);
1929	trace_rcu_quiescent_state_report(rcuname: rcu_state.name, gp_seq: rnp->gp_seq,
1930	mask, qsmask: rnp->qsmask, level: rnp->level,
1931	grplo: rnp->grplo, grphi: rnp->grphi,
1932	gp_tasks: !!rnp->gp_tasks);
1933	if (rnp->qsmask != `0` \|\| rcu_preempt_blocked_readers_cgp(rnp)) {
1934
1935	/ Other bits still set at this level, so done. /
1936	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1937	return;
1938	}
1939	rnp->completedqs = rnp->gp_seq;
1940	mask = rnp->grpmask;
1941	if (rnp->parent == NULL) {
1942
1943	/ No more levels. Exit loop holding root lock. /
1944
1945	break;
1946	}
1947	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1948	rnp_c = rnp;
1949	rnp = rnp->parent;
1950	raw_spin_lock_irqsave_rcu_node(rnp, flags);
1951	oldmask = READ_ONCE(rnp_c->qsmask);
1952	}
1953
1954	/*
1955	* Get here if we are the last CPU to pass through a quiescent
1956	* state for this grace period. Invoke rcu_report_qs_rsp()
1957	* to clean up and start the next grace period if one is needed.
1958	*/
1959	rcu_report_qs_rsp(flags); / releases rnp->lock. /
1960	}
1961
1962	/*
1963	* Record a quiescent state for all tasks that were previously queued
1964	* on the specified rcu_node structure and that were blocking the current
1965	* RCU grace period. The caller must hold the corresponding rnp->lock with
1966	* irqs disabled, and this lock is released upon return, but irqs remain
1967	* disabled.
1968	*/
1969	static void __maybe_unused
1970	rcu_report_unblock_qs_rnp(struct rcu_node rnp, unsigned* long flags)
1971	__releases(rnp->lock)
1972	{
1973	unsigned long gps;
1974	unsigned long mask;
1975	struct rcu_node *rnp_p;
1976
1977	raw_lockdep_assert_held_rcu_node(rnp);
1978	if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) \|\|
1979	WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) \|\|
1980	rnp->qsmask != `0`) {
1981	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1982	return; / Still need more quiescent states! /
1983	}
1984
1985	rnp->completedqs = rnp->gp_seq;
1986	rnp_p = rnp->parent;
1987	if (rnp_p == NULL) {
1988	/*
1989	* Only one rcu_node structure in the tree, so don't
1990	* try to report up to its nonexistent parent!
1991	*/
1992	rcu_report_qs_rsp(flags);
1993	return;
1994	}
1995
1996	/ Report up the rest of the hierarchy, tracking current ->gp_seq. /
1997	gps = rnp->gp_seq;
1998	mask = rnp->grpmask;
1999	raw_spin_unlock_rcu_node(rnp); / irqs remain disabled. /
2000	raw_spin_lock_rcu_node(rnp_p); / irqs already disabled. /
2001	rcu_report_qs_rnp(mask, rnp: rnp_p, gps, flags);
2002	}
2003
2004	/*
2005	* Record a quiescent state for the specified CPU to that CPU's rcu_data
2006	* structure. This must be called from the specified CPU.
2007	*/
2008	static void
2009	rcu_report_qs_rdp(struct rcu_data *rdp)
2010	{
2011	unsigned long flags;
2012	unsigned long mask;
2013	bool needacc = false;
2014	struct rcu_node *rnp;
2015
2016	WARN_ON_ONCE(rdp->cpu != smp_processor_id());
2017	rnp = rdp->mynode;
2018	raw_spin_lock_irqsave_rcu_node(rnp, flags);
2019	if (rdp->cpu_no_qs.b.norm \|\| rdp->gp_seq != rnp->gp_seq \|\|
2020	rdp->gpwrap) {
2021
2022	/*
2023	* The grace period in which this quiescent state was
2024	* recorded has ended, so don't report it upwards.
2025	* We will instead need a new quiescent state that lies
2026	* within the current grace period.
2027	*/
2028	rdp->cpu_no_qs.b.norm = true; / need qs for new gp. /
2029	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2030	return;
2031	}
2032	mask = rdp->grpmask;
2033	rdp->core_needs_qs = false;
2034	if ((rnp->qsmask & mask) == `0`) {
2035	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2036	} else {
2037	/*
2038	* This GP can't end until cpu checks in, so all of our
2039	* callbacks can be processed during the next GP.
2040	*
2041	* NOCB kthreads have their own way to deal with that...
2042	*/
2043	if (!rcu_rdp_is_offloaded(rdp)) {
2044	/*
2045	* The current GP has not yet ended, so it
2046	* should not be possible for rcu_accelerate_cbs()
2047	* to return true. So complain, but don't awaken.
2048	*/
2049	WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp));
2050	} else if (!rcu_segcblist_completely_offloaded(rsclp: &rdp->cblist)) {
2051	/*
2052	* ...but NOCB kthreads may miss or delay callbacks acceleration
2053	* if in the middle of a (de-)offloading process.
2054	*/
2055	needacc = true;
2056	}
2057
2058	rcu_disable_urgency_upon_qs(rdp);
2059	rcu_report_qs_rnp(mask, rnp, gps: rnp->gp_seq, flags);
2060	/ ^^^ Released rnp->lock /
2061
2062	if (needacc) {
2063	rcu_nocb_lock_irqsave(rdp, flags);
2064	rcu_accelerate_cbs_unlocked(rnp, rdp);
2065	rcu_nocb_unlock_irqrestore(rdp, flags);
2066	}
2067	}
2068	}
2069
2070	/*
2071	* Check to see if there is a new grace period of which this CPU
2072	* is not yet aware, and if so, set up local rcu_data state for it.
2073	* Otherwise, see if this CPU has just passed through its first
2074	* quiescent state for this grace period, and record that fact if so.
2075	*/
2076	static void
2077	rcu_check_quiescent_state(struct rcu_data *rdp)
2078	{
2079	/ Check for grace-period ends and beginnings. /
2080	note_gp_changes(rdp);
2081
2082	/*
2083	* Does this CPU still need to do its part for current grace period?
2084	* If no, return and let the other CPUs do their part as well.
2085	*/
2086	if (!rdp->core_needs_qs)
2087	return;
2088
2089	/*
2090	* Was there a quiescent state since the beginning of the grace
2091	* period? If no, then exit and wait for the next call.
2092	*/
2093	if (rdp->cpu_no_qs.b.norm)
2094	return;
2095
2096	/*
2097	* Tell RCU we are done (but rcu_report_qs_rdp() will be the
2098	* judge of that).
2099	*/
2100	rcu_report_qs_rdp(rdp);
2101	}
2102
2103	/ Return true if callback-invocation time limit exceeded. /
2104	static bool rcu_do_batch_check_time(long count, long tlimit,
2105	bool jlimit_check, unsigned long jlimit)
2106	{
2107	// Invoke local_clock() only once per 32 consecutive callbacks.
2108	return unlikely(tlimit) &&
2109	(!likely(count & `31`) \|\|
2110	(IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) &&
2111	jlimit_check && time_after(jiffies, jlimit))) &&
2112	local_clock() >= tlimit;
2113	}
2114
2115	/*
2116	* Invoke any RCU callbacks that have made it to the end of their grace
2117	* period. Throttle as specified by rdp->blimit.
2118	*/
2119	static void rcu_do_batch(struct rcu_data *rdp)
2120	{
2121	long bl;
2122	long count = `0`;
2123	int div;
2124	bool __maybe_unused empty;
2125	unsigned long flags;
2126	unsigned long jlimit;
2127	bool jlimit_check = false;
2128	long pending;
2129	struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
2130	struct rcu_head *rhp;
2131	long tlimit = `0`;
2132
2133	/ If no callbacks are ready, just return. /
2134	if (!rcu_segcblist_ready_cbs(rsclp: &rdp->cblist)) {
2135	trace_rcu_batch_start(rcuname: rcu_state.name,
2136	qlen: rcu_segcblist_n_cbs(rsclp: &rdp->cblist), blimit: `0`);
2137	trace_rcu_batch_end(rcuname: rcu_state.name, callbacks_invoked: `0`,
2138	cb: !rcu_segcblist_empty(rsclp: &rdp->cblist),
2139	nr: need_resched(), iit: is_idle_task(current),
2140	risk: rcu_is_callbacks_kthread(rdp));
2141	return;
2142	}
2143
2144	/*
2145	* Extract the list of ready callbacks, disabling IRQs to prevent
2146	* races with call_rcu() from interrupt handlers. Leave the
2147	* callback counts, as rcu_barrier() needs to be conservative.
2148	*
2149	* Callbacks execution is fully ordered against preceding grace period
2150	* completion (materialized by rnp->gp_seq update) thanks to the
2151	* smp_mb__after_unlock_lock() upon node locking required for callbacks
2152	* advancing. In NOCB mode this ordering is then further relayed through
2153	* the nocb locking that protects both callbacks advancing and extraction.
2154	*/
2155	rcu_nocb_lock_irqsave(rdp, flags);
2156	WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
2157	pending = rcu_segcblist_get_seglen(rsclp: &rdp->cblist, RCU_DONE_TAIL);
2158	div = READ_ONCE(rcu_divisor);
2159	div = div < `0` ? `7` : div > sizeof(long) * `8` - `2` ? sizeof(long) * `8` - `2` : div;
2160	bl = max(rdp->blimit, pending >> div);
2161	if ((in_serving_softirq() \|\| rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING) &&
2162	(IS_ENABLED(CONFIG_RCU_DOUBLE_CHECK_CB_TIME) \|\| unlikely(bl > `100`))) {
2163	const long npj = NSEC_PER_SEC / HZ;
2164	long rrn = READ_ONCE(rcu_resched_ns);
2165
2166	rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn;
2167	tlimit = local_clock() + rrn;
2168	jlimit = jiffies + (rrn + npj + `1`) / npj;
2169	jlimit_check = true;
2170	}
2171	trace_rcu_batch_start(rcuname: rcu_state.name,
2172	qlen: rcu_segcblist_n_cbs(rsclp: &rdp->cblist), blimit: bl);
2173	rcu_segcblist_extract_done_cbs(rsclp: &rdp->cblist, rclp: &rcl);
2174	if (rcu_rdp_is_offloaded(rdp))
2175	rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(rsclp: &rdp->cblist);
2176
2177	trace_rcu_segcb_stats(rs: &rdp->cblist, TPS("SegCbDequeued"));
2178	rcu_nocb_unlock_irqrestore(rdp, flags);
2179
2180	/ Invoke callbacks. /
2181	tick_dep_set_task(current, bit: TICK_DEP_BIT_RCU);
2182	rhp = rcu_cblist_dequeue(rclp: &rcl);
2183
2184	for (; rhp; rhp = rcu_cblist_dequeue(rclp: &rcl)) {
2185	rcu_callback_t f;
2186
2187	count++;
2188	debug_rcu_head_unqueue(head: rhp);
2189
2190	rcu_lock_acquire(map: &rcu_callback_map);
2191	trace_rcu_invoke_callback(rcuname: rcu_state.name, rhp);
2192
2193	f = rhp->func;
2194	debug_rcu_head_callback(rhp);
2195	WRITE_ONCE(rhp->func, (rcu_callback_t)`0L`);
2196	f(rhp);
2197
2198	rcu_lock_release(map: &rcu_callback_map);
2199
2200	/*
2201	* Stop only if limit reached and CPU has something to do.
2202	*/
2203	if (in_serving_softirq()) {
2204	if (count >= bl && (need_resched() \|\| !is_idle_task(current)))
2205	break;
2206	/*
2207	* Make sure we don't spend too much time here and deprive other
2208	* softirq vectors of CPU cycles.
2209	*/
2210	if (rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit))
2211	break;
2212	} else {
2213	// In rcuc/rcuoc context, so no worries about
2214	// depriving other softirq vectors of CPU cycles.
2215	local_bh_enable();
2216	lockdep_assert_irqs_enabled();
2217	cond_resched_tasks_rcu_qs();
2218	lockdep_assert_irqs_enabled();
2219	local_bh_disable();
2220	// But rcuc kthreads can delay quiescent-state
2221	// reporting, so check time limits for them.
2222	if (rdp->rcu_cpu_kthread_status == RCU_KTHREAD_RUNNING &&
2223	rcu_do_batch_check_time(count, tlimit, jlimit_check, jlimit)) {
2224	rdp->rcu_cpu_has_work = `1`;
2225	break;
2226	}
2227	}
2228	}
2229
2230	rcu_nocb_lock_irqsave(rdp, flags);
2231	rdp->n_cbs_invoked += count;
2232	trace_rcu_batch_end(rcuname: rcu_state.name, callbacks_invoked: count, cb: !!rcl.head, nr: need_resched(),
2233	iit: is_idle_task(current), risk: rcu_is_callbacks_kthread(rdp));
2234
2235	/ Update counts and requeue any remaining callbacks. /
2236	rcu_segcblist_insert_done_cbs(rsclp: &rdp->cblist, rclp: &rcl);
2237	rcu_segcblist_add_len(rsclp: &rdp->cblist, v: -count);
2238
2239	/ Reinstate batch limit if we have worked down the excess. /
2240	count = rcu_segcblist_n_cbs(rsclp: &rdp->cblist);
2241	if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark)
2242	rdp->blimit = blimit;
2243
2244	/ Reset ->qlen_last_fqs_check trigger if enough CBs have drained. /
2245	if (count == `0` && rdp->qlen_last_fqs_check != `0`) {
2246	rdp->qlen_last_fqs_check = `0`;
2247	rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
2248	} else if (count < rdp->qlen_last_fqs_check - qhimark)
2249	rdp->qlen_last_fqs_check = count;
2250
2251	/*
2252	* The following usually indicates a double call_rcu(). To track
2253	* this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
2254	*/
2255	empty = rcu_segcblist_empty(rsclp: &rdp->cblist);
2256	WARN_ON_ONCE(count == `0` && !empty);
2257	WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2258	count != `0` && empty);
2259	WARN_ON_ONCE(count == `0` && rcu_segcblist_n_segment_cbs(&rdp->cblist) != `0`);
2260	WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == `0`);
2261
2262	rcu_nocb_unlock_irqrestore(rdp, flags);
2263
2264	tick_dep_clear_task(current, bit: TICK_DEP_BIT_RCU);
2265	}
2266
2267	/*
2268	* This function is invoked from each scheduling-clock interrupt,
2269	* and checks to see if this CPU is in a non-context-switch quiescent
2270	* state, for example, user mode or idle loop. It also schedules RCU
2271	* core processing. If the current grace period has gone on too long,
2272	* it will ask the scheduler to manufacture a context switch for the sole
2273	* purpose of providing the needed quiescent state.
2274	*/
2275	void rcu_sched_clock_irq(int user)
2276	{
2277	unsigned long j;
2278
2279	if (IS_ENABLED(CONFIG_PROVE_RCU)) {
2280	j = jiffies;
2281	WARN_ON_ONCE(time_before(j, __this_cpu_read(rcu_data.last_sched_clock)));
2282	__this_cpu_write(rcu_data.last_sched_clock, j);
2283	}
2284	trace_rcu_utilization(TPS("Start scheduler-tick"));
2285	lockdep_assert_irqs_disabled();
2286	raw_cpu_inc(rcu_data.ticks_this_gp);
2287	/ The load-acquire pairs with the store-release setting to true. /
2288	if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
2289	/ Idle and userspace execution already are quiescent states. /
2290	if (!rcu_is_cpu_rrupt_from_idle() && !user) {
2291	set_tsk_need_resched(current);
2292	set_preempt_need_resched();
2293	}
2294	__this_cpu_write(rcu_data.rcu_urgent_qs, false);
2295	}
2296	rcu_flavor_sched_clock_irq(user);
2297	if (rcu_pending(user))
2298	invoke_rcu_core();
2299	if (user \|\| rcu_is_cpu_rrupt_from_idle())
2300	rcu_note_voluntary_context_switch(current);
2301	lockdep_assert_irqs_disabled();
2302
2303	trace_rcu_utilization(TPS("End scheduler-tick"));
2304	}
2305
2306	/*
2307	* Scan the leaf rcu_node structures. For each structure on which all
2308	* CPUs have reported a quiescent state and on which there are tasks
2309	* blocking the current grace period, initiate RCU priority boosting.
2310	* Otherwise, invoke the specified function to check dyntick state for
2311	* each CPU that has not yet reported a quiescent state.
2312	*/
2313	static void force_qs_rnp(int (f)(struct* rcu_data *rdp))
2314	{
2315	int cpu;
2316	unsigned long flags;
2317	struct rcu_node *rnp;
2318
2319	rcu_state.cbovld = rcu_state.cbovldnext;
2320	rcu_state.cbovldnext = false;
2321	rcu_for_each_leaf_node(rnp) {
2322	unsigned long mask = `0`;
2323	unsigned long rsmask = `0`;
2324
2325	cond_resched_tasks_rcu_qs();
2326	raw_spin_lock_irqsave_rcu_node(rnp, flags);
2327	rcu_state.cbovldnext \|= !!rnp->cbovldmask;
2328	if (rnp->qsmask == `0`) {
2329	if (rcu_preempt_blocked_readers_cgp(rnp)) {
2330	/*
2331	* No point in scanning bits because they
2332	* are all zero. But we might need to
2333	* priority-boost blocked readers.
2334	*/
2335	rcu_initiate_boost(rnp, flags);
2336	/ rcu_initiate_boost() releases rnp->lock /
2337	continue;
2338	}
2339	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2340	continue;
2341	}
2342	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
2343	struct rcu_data *rdp;
2344	int ret;
2345
2346	rdp = per_cpu_ptr(&rcu_data, cpu);
2347	ret = f(rdp);
2348	if (ret > `0`) {
2349	mask \|= rdp->grpmask;
2350	rcu_disable_urgency_upon_qs(rdp);
2351	}
2352	if (ret < `0`)
2353	rsmask \|= rdp->grpmask;
2354	}
2355	if (mask != `0`) {
2356	/ Idle/offline CPUs, report (releases rnp->lock). /
2357	rcu_report_qs_rnp(mask, rnp, gps: rnp->gp_seq, flags);
2358	} else {
2359	/ Nothing to do here, so just drop the lock. /
2360	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
2361	}
2362
2363	for_each_leaf_node_cpu_mask(rnp, cpu, rsmask)
2364	resched_cpu(cpu);
2365	}
2366	}
2367
2368	/*
2369	* Force quiescent states on reluctant CPUs, and also detect which
2370	* CPUs are in dyntick-idle mode.
2371	*/
2372	void rcu_force_quiescent_state(void)
2373	{
2374	unsigned long flags;
2375	bool ret;
2376	struct rcu_node *rnp;
2377	struct rcu_node *rnp_old = NULL;
2378
2379	if (!rcu_gp_in_progress())
2380	return;
2381	/ Funnel through hierarchy to reduce memory contention. /
2382	rnp = raw_cpu_read(rcu_data.mynode);
2383	for (; rnp != NULL; rnp = rnp->parent) {
2384	ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) \|\|
2385	!raw_spin_trylock(&rnp->fqslock);
2386	if (rnp_old != NULL)
2387	raw_spin_unlock(&rnp_old->fqslock);
2388	if (ret)
2389	return;
2390	rnp_old = rnp;
2391	}
2392	/ rnp_old == rcu_get_root(), rnp == NULL. /
2393
2394	/ Reached the root of the rcu_node tree, acquire lock. /
2395	raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
2396	raw_spin_unlock(&rnp_old->fqslock);
2397	if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
2398	raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2399	return; / Someone beat us to it. /
2400	}
2401	WRITE_ONCE(rcu_state.gp_flags,
2402	READ_ONCE(rcu_state.gp_flags) \| RCU_GP_FLAG_FQS);
2403	raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
2404	rcu_gp_kthread_wake();
2405	}
2406	EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
2407
// Workqueue handler for an RCU reader for kernels enforcing strict RCU
// grace periods.  Runs an empty RCU read-side critical section.
static void strict_work_handler(struct work_struct *work)
{
	rcu_read_lock();
	rcu_read_unlock();
}
2415
2416	/ Perform RCU core processing work for the current CPU. /
2417	static __latent_entropy void rcu_core(void)
2418	{
2419	unsigned long flags;
2420	struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
2421	struct rcu_node *rnp = rdp->mynode;
2422	/*
2423	* On RT rcu_core() can be preempted when IRQs aren't disabled.
2424	* Therefore this function can race with concurrent NOCB (de-)offloading
2425	* on this CPU and the below condition must be considered volatile.
2426	* However if we race with:
2427	*
2428	* _ Offloading: In the worst case we accelerate or process callbacks
2429	* concurrently with NOCB kthreads. We are guaranteed to
2430	* call rcu_nocb_lock() if that happens.
2431	*
2432	* _ Deoffloading: In the worst case we miss callbacks acceleration or
2433	* processing. This is fine because the early stage
2434	* of deoffloading invokes rcu_core() after setting
2435	* SEGCBLIST_RCU_CORE. So we guarantee that we'll process
2436	* what could have been dismissed without the need to wait
2437	* for the next rcu_pending() check in the next jiffy.
2438	*/
2439	const bool do_batch = !rcu_segcblist_completely_offloaded(rsclp: &rdp->cblist);
2440
2441	if (cpu_is_offline(smp_processor_id()))
2442	return;
2443	trace_rcu_utilization(TPS("Start RCU core"));
2444	WARN_ON_ONCE(!rdp->beenonline);
2445
2446	/ Report any deferred quiescent states if preemption enabled. /
2447	if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) {
2448	rcu_preempt_deferred_qs(current);
2449	} else if (rcu_preempt_need_deferred_qs(current)) {
2450	set_tsk_need_resched(current);
2451	set_preempt_need_resched();
2452	}
2453
2454	/ Update RCU state based on any recent quiescent states. /
2455	rcu_check_quiescent_state(rdp);
2456
2457	/ No grace period and unregistered callbacks? /
2458	if (!rcu_gp_in_progress() &&
2459	rcu_segcblist_is_enabled(rsclp: &rdp->cblist) && do_batch) {
2460	rcu_nocb_lock_irqsave(rdp, flags);
2461	if (!rcu_segcblist_restempty(rsclp: &rdp->cblist, RCU_NEXT_READY_TAIL))
2462	rcu_accelerate_cbs_unlocked(rnp, rdp);
2463	rcu_nocb_unlock_irqrestore(rdp, flags);
2464	}
2465
2466	rcu_check_gp_start_stall(rnp, rdp, gpssdelay: rcu_jiffies_till_stall_check());
2467
2468	/ If there are callbacks ready, invoke them. /
2469	if (do_batch && rcu_segcblist_ready_cbs(rsclp: &rdp->cblist) &&
2470	likely(READ_ONCE(rcu_scheduler_fully_active))) {
2471	rcu_do_batch(rdp);
2472	/ Re-invoke RCU core processing if there are callbacks remaining. /
2473	if (rcu_segcblist_ready_cbs(rsclp: &rdp->cblist))
2474	invoke_rcu_core();
2475	}
2476
2477	/ Do any needed deferred wakeups of rcuo kthreads. /
2478	do_nocb_deferred_wakeup(rdp);
2479	trace_rcu_utilization(TPS("End RCU core"));
2480
2481	// If strict GPs, schedule an RCU reader in a clean environment.
2482	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
2483	queue_work_on(cpu: rdp->cpu, wq: rcu_gp_wq, work: &rdp->strict_work);
2484	}
2485
/* Softirq-action wrapper around rcu_core(); the @h argument is unused. */
static void rcu_core_si(struct softirq_action *h)
{
	rcu_core();
}
2490
2491	static void rcu_wake_cond(struct task_struct t, int* status)
2492	{
2493	/*
2494	* If the thread is yielding, only wake it when this
2495	* is invoked from idle
2496	*/
2497	if (t && (status != RCU_KTHREAD_YIELDING \|\| is_idle_task(current)))
2498	wake_up_process(tsk: t);
2499	}
2500
2501	static void invoke_rcu_core_kthread(void)
2502	{
2503	struct task_struct *t;
2504	unsigned long flags;
2505
2506	local_irq_save(flags);
2507	__this_cpu_write(rcu_data.rcu_cpu_has_work, `1`);
2508	t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
2509	if (t != NULL && t != current)
2510	rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
2511	local_irq_restore(flags);
2512	}
2513
2514	/*
2515	* Wake up this CPU's rcuc kthread to do RCU core processing.
2516	*/
2517	static void invoke_rcu_core(void)
2518	{
2519	if (!cpu_online(smp_processor_id()))
2520	return;
2521	if (use_softirq)
2522	raise_softirq(nr: RCU_SOFTIRQ);
2523	else
2524	invoke_rcu_core_kthread();
2525	}
2526
2527	static void rcu_cpu_kthread_park(unsigned int cpu)
2528	{
2529	per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
2530	}
2531
2532	static int rcu_cpu_kthread_should_run(unsigned int cpu)
2533	{
2534	return __this_cpu_read(rcu_data.rcu_cpu_has_work);
2535	}
2536
2537	/*
2538	* Per-CPU kernel thread that invokes RCU callbacks. This replaces
2539	* the RCU softirq used in configurations of RCU that do not support RCU
2540	* priority boosting.
2541	*/
2542	static void rcu_cpu_kthread(unsigned int cpu)
2543	{
2544	unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
2545	char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
2546	unsigned long *j = this_cpu_ptr(&rcu_data.rcuc_activity);
2547	int spincnt;
2548
2549	trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
2550	for (spincnt = `0`; spincnt < `10`; spincnt++) {
2551	WRITE_ONCE(*j, jiffies);
2552	local_bh_disable();
2553	*statusp = RCU_KTHREAD_RUNNING;
2554	local_irq_disable();
2555	work = *workp;
2556	WRITE_ONCE(*workp, `0`);
2557	local_irq_enable();
2558	if (work)
2559	rcu_core();
2560	local_bh_enable();
2561	if (!READ_ONCE(*workp)) {
2562	trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
2563	*statusp = RCU_KTHREAD_WAITING;
2564	return;
2565	}
2566	}
2567	*statusp = RCU_KTHREAD_YIELDING;
2568	trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
2569	schedule_timeout_idle(timeout: `2`);
2570	trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
2571	*statusp = RCU_KTHREAD_WAITING;
2572	WRITE_ONCE(*j, jiffies);
2573	}
2574
/*
 * Descriptor handed to smpboot_register_percpu_thread() by
 * rcu_spawn_core_kthreads(): names the per-CPU "rcuc/%u" threads, says
 * where each task pointer is stored, and wires up the should-run, main,
 * setup and park callbacks.
 */
static struct smp_hotplug_thread rcu_cpu_thread_spec = {
	.store			= &rcu_data.rcu_cpu_kthread_task,
	.thread_should_run	= rcu_cpu_kthread_should_run,
	.thread_fn		= rcu_cpu_kthread,
	.thread_comm		= "rcuc/%u",
	.setup			= rcu_cpu_kthread_setup,
	.park			= rcu_cpu_kthread_park,
};
2583
2584	/*
2585	* Spawn per-CPU RCU core processing kthreads.
2586	*/
2587	static int __init rcu_spawn_core_kthreads(void)
2588	{
2589	int cpu;
2590
2591	for_each_possible_cpu(cpu)
2592	per_cpu(rcu_data.rcu_cpu_has_work, cpu) = `0`;
2593	if (use_softirq)
2594	return `0`;
2595	WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
2596	"%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
2597	return `0`;
2598	}
2599
2600	static void rcutree_enqueue(struct rcu_data rdp, struct* rcu_head *head, rcu_callback_t func)
2601	{
2602	rcu_segcblist_enqueue(rsclp: &rdp->cblist, rhp: head);
2603	if (__is_kvfree_rcu_offset((unsigned long)func))
2604	trace_rcu_kvfree_callback(rcuname: rcu_state.name, rhp: head,
2605	offset: (unsigned long)func,
2606	qlen: rcu_segcblist_n_cbs(rsclp: &rdp->cblist));
2607	else
2608	trace_rcu_callback(rcuname: rcu_state.name, rhp: head,
2609	qlen: rcu_segcblist_n_cbs(rsclp: &rdp->cblist));
2610	trace_rcu_segcb_stats(rs: &rdp->cblist, TPS("SegCBQueued"));
2611	}
2612
2613	/*
2614	* Handle any core-RCU processing required by a call_rcu() invocation.
2615	*/
2616	static void call_rcu_core(struct rcu_data rdp, struct* rcu_head *head,
2617	rcu_callback_t func, unsigned long flags)
2618	{
2619	rcutree_enqueue(rdp, head, func);
2620	/*
2621	* If called from an extended quiescent state, invoke the RCU
2622	* core in order to force a re-evaluation of RCU's idleness.
2623	*/
2624	if (!rcu_is_watching())
2625	invoke_rcu_core();
2626
2627	/ If interrupts were disabled or CPU offline, don't invoke RCU core. /
2628	if (irqs_disabled_flags(flags) \|\| cpu_is_offline(smp_processor_id()))
2629	return;
2630
2631	/*
2632	* Force the grace period if too many callbacks or too long waiting.
2633	* Enforce hysteresis, and don't invoke rcu_force_quiescent_state()
2634	* if some other CPU has recently done so. Also, don't bother
2635	* invoking rcu_force_quiescent_state() if the newly enqueued callback
2636	* is the only one waiting for a grace period to complete.
2637	*/
2638	if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
2639	rdp->qlen_last_fqs_check + qhimark)) {
2640
2641	/ Are we ignoring a completed grace period? /
2642	note_gp_changes(rdp);
2643
2644	/ Start a new grace period if one not already started. /
2645	if (!rcu_gp_in_progress()) {
2646	rcu_accelerate_cbs_unlocked(rnp: rdp->mynode, rdp);
2647	} else {
2648	/ Give the grace period a kick. /
2649	rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
2650	if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
2651	rcu_segcblist_first_pend_cb(rsclp: &rdp->cblist) != head)
2652	rcu_force_quiescent_state();
2653	rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
2654	rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(rsclp: &rdp->cblist);
2655	}
2656	}
2657	}
2658
/*
 * RCU callback function to leak a callback.  __call_rcu_common() installs
 * it as ->func when debug_rcu_head_queue() reports a probable double
 * call_rcu().
 */
static void rcu_leak_callback(struct rcu_head *rhp)
{
	/* Intentionally empty: @rhp is neither freed nor otherwise touched. */
}
2665
2666	/*
2667	* Check and if necessary update the leaf rcu_node structure's
2668	* ->cbovldmask bit corresponding to the current CPU based on that CPU's
2669	* number of queued RCU callbacks. The caller must hold the leaf rcu_node
2670	* structure's ->lock.
2671	*/
2672	static void check_cb_ovld_locked(struct rcu_data rdp, struct* rcu_node *rnp)
2673	{
2674	raw_lockdep_assert_held_rcu_node(rnp);
2675	if (qovld_calc <= `0`)
2676	return; // Early boot and wildcard value set.
2677	if (rcu_segcblist_n_cbs(rsclp: &rdp->cblist) >= qovld_calc)
2678	WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask \| rdp->grpmask);
2679	else
2680	WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask);
2681	}
2682
2683	/*
2684	* Check and if necessary update the leaf rcu_node structure's
2685	* ->cbovldmask bit corresponding to the current CPU based on that CPU's
2686	* number of queued RCU callbacks. No locks need be held, but the
2687	* caller must have disabled interrupts.
2688	*
2689	* Note that this function ignores the possibility that there are a lot
2690	* of callbacks all of which have already seen the end of their respective
2691	* grace periods. This omission is due to the need for no-CBs CPUs to
2692	* be holding ->nocb_lock to do this check, which is too heavy for a
2693	* common-case operation.
2694	*/
2695	static void check_cb_ovld(struct rcu_data *rdp)
2696	{
2697	struct rcu_node *const rnp = rdp->mynode;
2698
2699	if (qovld_calc <= `0` \|\|
2700	((rcu_segcblist_n_cbs(rsclp: &rdp->cblist) >= qovld_calc) ==
2701	!!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask)))
2702	return; // Early boot wildcard value or already set correctly.
2703	raw_spin_lock_rcu_node(rnp);
2704	check_cb_ovld_locked(rdp, rnp);
2705	raw_spin_unlock_rcu_node(rnp);
2706	}
2707
2708	static void
2709	__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
2710	{
2711	static atomic_t doublefrees;
2712	unsigned long flags;
2713	bool lazy;
2714	struct rcu_data *rdp;
2715
2716	/ Misaligned rcu_head! /
2717	WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - `1`));
2718
2719	if (debug_rcu_head_queue(head)) {
2720	/*
2721	* Probable double call_rcu(), so leak the callback.
2722	* Use rcu:rcu_callback trace event to find the previous
2723	* time callback was passed to call_rcu().
2724	*/
2725	if (atomic_inc_return(v: &doublefrees) < `4`) {
2726	pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
2727	mem_dump_obj(object: head);
2728	}
2729	WRITE_ONCE(head->func, rcu_leak_callback);
2730	return;
2731	}
2732	head->func = func;
2733	head->next = NULL;
2734	kasan_record_aux_stack_noalloc(ptr: head);
2735	local_irq_save(flags);
2736	rdp = this_cpu_ptr(&rcu_data);
2737	lazy = lazy_in && !rcu_async_should_hurry();
2738
2739	/ Add the callback to our list. /
2740	if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
2741	// This can trigger due to call_rcu() from offline CPU:
2742	WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);
2743	WARN_ON_ONCE(!rcu_is_watching());
2744	// Very early boot, before rcu_init(). Initialize if needed
2745	// and then drop through to queue the callback.
2746	if (rcu_segcblist_empty(rsclp: &rdp->cblist))
2747	rcu_segcblist_init(rsclp: &rdp->cblist);
2748	}
2749
2750	check_cb_ovld(rdp);
2751
2752	if (unlikely(rcu_rdp_is_offloaded(rdp)))
2753	call_rcu_nocb(rdp, head, func, flags, lazy);
2754	else
2755	call_rcu_core(rdp, head, func, flags);
2756	local_irq_restore(flags);
2757	}
2758
2759	#ifdef CONFIG_RCU_LAZY
2760	static bool enable_rcu_lazy __read_mostly = !IS_ENABLED(CONFIG_RCU_LAZY_DEFAULT_OFF);
2761	module_param(enable_rcu_lazy, bool, `0444`);
2762
2763	/**
2764	* call_rcu_hurry() - Queue RCU callback for invocation after grace period, and
2765	* flush all lazy callbacks (including the new one) to the main ->cblist while
2766	* doing so.
2767	*
2768	* @head: structure to be used for queueing the RCU updates.
2769	* @func: actual callback function to be invoked after the grace period
2770	*
2771	* The callback function will be invoked some time after a full grace
2772	* period elapses, in other words after all pre-existing RCU read-side
2773	* critical sections have completed.
2774	*
2775	* Use this API instead of call_rcu() if you don't want the callback to be
2776	* invoked after very long periods of time, which can happen on systems without
2777	* memory pressure and on systems which are lightly loaded or mostly idle.
2778	* This function will cause callbacks to be invoked sooner than later at the
2779	* expense of extra power. Other than that, this function is identical to, and
2780	* reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory
2781	* ordering and other functionality.
2782	*/
2783	void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
2784	{
2785	__call_rcu_common(head, func, lazy_in: false);
2786	}
2787	EXPORT_SYMBOL_GPL(call_rcu_hurry);
2788	#else
2789	#define enable_rcu_lazy false
2790	#endif
2791
2792	/**
2793	* call_rcu() - Queue an RCU callback for invocation after a grace period.
2794	* By default the callbacks are 'lazy' and are kept hidden from the main
2795	* ->cblist to prevent starting of grace periods too soon.
2796	* If you desire grace periods to start very soon, use call_rcu_hurry().
2797	*
2798	* @head: structure to be used for queueing the RCU updates.
2799	* @func: actual callback function to be invoked after the grace period
2800	*
2801	* The callback function will be invoked some time after a full grace
2802	* period elapses, in other words after all pre-existing RCU read-side
2803	* critical sections have completed. However, the callback function
2804	* might well execute concurrently with RCU read-side critical sections
2805	* that started after call_rcu() was invoked.
2806	*
2807	* RCU read-side critical sections are delimited by rcu_read_lock()
2808	* and rcu_read_unlock(), and may be nested. In addition, but only in
2809	* v5.0 and later, regions of code across which interrupts, preemption,
2810	* or softirqs have been disabled also serve as RCU read-side critical
2811	* sections. This includes hardware interrupt handlers, softirq handlers,
2812	* and NMI handlers.
2813	*
2814	* Note that all CPUs must agree that the grace period extended beyond
2815	* all pre-existing RCU read-side critical section. On systems with more
2816	* than one CPU, this means that when "func()" is invoked, each CPU is
2817	* guaranteed to have executed a full memory barrier since the end of its
2818	* last RCU read-side critical section whose beginning preceded the call
2819	* to call_rcu(). It also means that each CPU executing an RCU read-side
2820	* critical section that continues beyond the start of "func()" must have
2821	* executed a memory barrier after the call_rcu() but before the beginning
2822	* of that RCU read-side critical section. Note that these guarantees
2823	* include CPUs that are offline, idle, or executing in user mode, as
2824	* well as CPUs that are executing in the kernel.
2825	*
2826	* Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
2827	* resulting RCU callback function "func()", then both CPU A and CPU B are
2828	* guaranteed to execute a full memory barrier during the time interval
2829	* between the call to call_rcu() and the invocation of "func()" -- even
2830	* if CPU A and CPU B are the same CPU (but again only if the system has
2831	* more than one CPU).
2832	*
2833	* Implementation of these memory-ordering guarantees is described here:
2834	* Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
2835	*/
2836	void call_rcu(struct rcu_head *head, rcu_callback_t func)
2837	{
2838	__call_rcu_common(head, func, lazy_in: enable_rcu_lazy);
2839	}
2840	EXPORT_SYMBOL_GPL(call_rcu);
2841
/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (5 * HZ)
/* Number of in-flight batches per CPU (size of kfree_rcu_cpu.krw_arr[]). */
#define KFREE_N_BATCHES 2
/* Number of bulk channels (size of the bulk_head[]/bulk_head_free[] arrays). */
#define FREE_N_CHANNELS 2
2846
/**
 * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
 * @list: List node. All blocks of one channel are linked through it
 * @gp_snap: Snapshot of RCU state for objects placed in this block
 * @nr_records: Number of active pointers in the array
 * @records: Array of the kvfree_rcu() pointers (flexible array; its capacity
 *	is KVFREE_BULK_MAX_ENTR entries)
 */
struct kvfree_rcu_bulk_data {
	struct list_head list;
	struct rcu_gp_oldstate gp_snap;
	unsigned long nr_records;
	void *records[];
};
2860
/*
 * This macro defines how many entries the "records" array
 * will contain. It is chosen so that a kvfree_rcu_bulk_data
 * structure together with its records occupies exactly one
 * page: whatever is left of PAGE_SIZE after the fixed header
 * is divided into pointer-sized slots.
 */
#define KVFREE_BULK_MAX_ENTR \
	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
2868
/**
 * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
 * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
 * @head_free: List of kfree_rcu() objects waiting for a grace period
 * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
 * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period,
 *	one list per channel
 * @krcp: Pointer to the owning @kfree_rcu_cpu structure
 */
struct kfree_rcu_cpu_work {
	struct rcu_work rcu_work;
	struct rcu_head *head_free;
	struct rcu_gp_oldstate head_free_gp_snap;
	struct list_head bulk_head_free[FREE_N_CHANNELS];
	struct kfree_rcu_cpu *krcp;
};
2885
/**
 * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
 * @head: List of kfree_rcu() objects not yet waiting for a grace period
 * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
 * @head_count: Number of objects in rcu_head singular list
 * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
 * @bulk_count: Number of objects in bulk-list, per channel
 * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
 * @lock: Synchronize access to this structure
 * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
 * @initialized: The @rcu_work fields have been initialized
 * @page_cache_work: A work to refill the cache when it is empty
 * @backoff_page_cache_fill: Delay cache refills
 * @work_in_progress: Indicates that page_cache_work is running
 * @hrtimer: A hrtimer for scheduling a page_cache_work
 * @bkvcache:
 *	A simple cache list that contains objects for reuse purpose.
 *	In order to save some per-cpu space the list is singular.
 *	Even though the list type is lockless, access to it has to be
 *	protected by the per-cpu @lock.
 * @nr_bkv_objs: number of allocated objects at @bkvcache.
 *
 * This is a per-CPU structure.  The reason that it is not included in
 * the rcu_data structure is to permit this code to be extracted from
 * the RCU files.  Such extraction could allow further optimization of
 * the interactions with the slab allocators.
 */
struct kfree_rcu_cpu {
	// Objects queued on a linked list
	// through their rcu_head structures.
	struct rcu_head *head;
	unsigned long head_gp_snap;
	atomic_t head_count;

	// Objects queued on a bulk-list.
	struct list_head bulk_head[FREE_N_CHANNELS];
	atomic_t bulk_count[FREE_N_CHANNELS];

	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
	raw_spinlock_t lock;
	struct delayed_work monitor_work;
	bool initialized;

	struct delayed_work page_cache_work;
	atomic_t backoff_page_cache_fill;
	atomic_t work_in_progress;
	struct hrtimer hrtimer;

	struct llist_head bkvcache;
	int nr_bkv_objs;
};
2937
/* Per-CPU kfree_rcu() batching state; only ->lock is statically initialized here. */
static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
	.lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
};
2941
2942	static __always_inline void
2943	debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
2944	{
2945	#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
2946	int i;
2947
2948	for (i = `0`; i < bhead->nr_records; i++)
2949	debug_rcu_head_unqueue(head: (struct rcu_head *)(bhead->records[i]));
2950	#endif
2951	}
2952
2953	static inline struct kfree_rcu_cpu *
2954	krc_this_cpu_lock(unsigned long *flags)
2955	{
2956	struct kfree_rcu_cpu *krcp;
2957
2958	local_irq_save(flags); // For safely calling this_cpu_ptr().*
2959	krcp = this_cpu_ptr(&krc);
2960	raw_spin_lock(&krcp->lock);
2961
2962	return krcp;
2963	}
2964
2965	static inline void
2966	krc_this_cpu_unlock(struct kfree_rcu_cpu krcp, unsigned* long flags)
2967	{
2968	raw_spin_unlock_irqrestore(&krcp->lock, flags);
2969	}
2970
2971	static inline struct kvfree_rcu_bulk_data *
2972	get_cached_bnode(struct kfree_rcu_cpu *krcp)
2973	{
2974	if (!krcp->nr_bkv_objs)
2975	return NULL;
2976
2977	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - `1`);
2978	return (struct kvfree_rcu_bulk_data *)
2979	llist_del_first(head: &krcp->bkvcache);
2980	}
2981
2982	static inline bool
2983	put_cached_bnode(struct kfree_rcu_cpu *krcp,
2984	struct kvfree_rcu_bulk_data *bnode)
2985	{
2986	// Check the limit.
2987	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
2988	return false;
2989
2990	llist_add(new: (struct llist_node *) bnode, head: &krcp->bkvcache);
2991	WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + `1`);
2992	return true;
2993	}
2994
2995	static int
2996	drain_page_cache(struct kfree_rcu_cpu *krcp)
2997	{
2998	unsigned long flags;
2999	struct llist_node page_list, pos, *n;
3000	int freed = `0`;
3001
3002	if (!rcu_min_cached_objs)
3003	return `0`;
3004
3005	raw_spin_lock_irqsave(&krcp->lock, flags);
3006	page_list = llist_del_all(head: &krcp->bkvcache);
3007	WRITE_ONCE(krcp->nr_bkv_objs, `0`);
3008	raw_spin_unlock_irqrestore(&krcp->lock, flags);
3009
3010	llist_for_each_safe(pos, n, page_list) {
3011	free_page((unsigned long)pos);
3012	freed++;
3013	}
3014
3015	return freed;
3016	}
3017
3018	static void
3019	kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
3020	struct kvfree_rcu_bulk_data bnode, int* idx)
3021	{
3022	unsigned long flags;
3023	int i;
3024
3025	if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
3026	debug_rcu_bhead_unqueue(bhead: bnode);
3027	rcu_lock_acquire(map: &rcu_callback_map);
3028	if (idx == `0`) { // kmalloc() / kfree().
3029	trace_rcu_invoke_kfree_bulk_callback(
3030	rcuname: rcu_state.name, nr_records: bnode->nr_records,
3031	p: bnode->records);
3032
3033	kfree_bulk(size: bnode->nr_records, p: bnode->records);
3034	} else { // vmalloc() / vfree().
3035	for (i = `0`; i < bnode->nr_records; i++) {
3036	trace_rcu_invoke_kvfree_callback(
3037	rcuname: rcu_state.name, rhp: bnode->records[i], offset: `0`);
3038
3039	vfree(addr: bnode->records[i]);
3040	}
3041	}
3042	rcu_lock_release(map: &rcu_callback_map);
3043	}
3044
3045	raw_spin_lock_irqsave(&krcp->lock, flags);
3046	if (put_cached_bnode(krcp, bnode))
3047	bnode = NULL;
3048	raw_spin_unlock_irqrestore(&krcp->lock, flags);
3049
3050	if (bnode)
3051	free_page((unsigned long) bnode);
3052
3053	cond_resched_tasks_rcu_qs();
3054	}
3055
3056	static void
3057	kvfree_rcu_list(struct rcu_head *head)
3058	{
3059	struct rcu_head *next;
3060
3061	for (; head; head = next) {
3062	void ptr = (void* *) head->func;
3063	unsigned long offset = (void *) head - ptr;
3064
3065	next = head->next;
3066	debug_rcu_head_unqueue(head: (struct rcu_head *)ptr);
3067	rcu_lock_acquire(map: &rcu_callback_map);
3068	trace_rcu_invoke_kvfree_callback(rcuname: rcu_state.name, rhp: head, offset);
3069
3070	if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
3071	kvfree(addr: ptr);
3072
3073	rcu_lock_release(map: &rcu_callback_map);
3074	cond_resched_tasks_rcu_qs();
3075	}
3076	}
3077
3078	/*
3079	* This function is invoked in workqueue context after a grace period.
3080	* It frees all the objects queued on ->bulk_head_free or ->head_free.
3081	*/
3082	static void kfree_rcu_work(struct work_struct *work)
3083	{
3084	unsigned long flags;
3085	struct kvfree_rcu_bulk_data bnode, n;
3086	struct list_head bulk_head[FREE_N_CHANNELS];
3087	struct rcu_head *head;
3088	struct kfree_rcu_cpu *krcp;
3089	struct kfree_rcu_cpu_work *krwp;
3090	struct rcu_gp_oldstate head_gp_snap;
3091	int i;
3092
3093	krwp = container_of(to_rcu_work(work),
3094	struct kfree_rcu_cpu_work, rcu_work);
3095	krcp = krwp->krcp;
3096
3097	raw_spin_lock_irqsave(&krcp->lock, flags);
3098	// Channels 1 and 2.
3099	for (i = `0`; i < FREE_N_CHANNELS; i++)
3100	list_replace_init(old: &krwp->bulk_head_free[i], new: &bulk_head[i]);
3101
3102	// Channel 3.
3103	head = krwp->head_free;
3104	krwp->head_free = NULL;
3105	head_gp_snap = krwp->head_free_gp_snap;
3106	raw_spin_unlock_irqrestore(&krcp->lock, flags);
3107
3108	// Handle the first two channels.
3109	for (i = `0`; i < FREE_N_CHANNELS; i++) {
3110	// Start from the tail page, so a GP is likely passed for it.
3111	list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
3112	kvfree_rcu_bulk(krcp, bnode, idx: i);
3113	}
3114
3115	/*
3116	* This is used when the "bulk" path can not be used for the
3117	* double-argument of kvfree_rcu(). This happens when the
3118	* page-cache is empty, which means that objects are instead
3119	* queued on a linked list through their rcu_head structures.
3120	* This list is named "Channel 3".
3121	*/
3122	if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
3123	kvfree_rcu_list(head);
3124	}
3125
3126	static bool
3127	need_offload_krc(struct kfree_rcu_cpu *krcp)
3128	{
3129	int i;
3130
3131	for (i = `0`; i < FREE_N_CHANNELS; i++)
3132	if (!list_empty(head: &krcp->bulk_head[i]))
3133	return true;
3134
3135	return !!READ_ONCE(krcp->head);
3136	}
3137
3138	static bool
3139	need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
3140	{
3141	int i;
3142
3143	for (i = `0`; i < FREE_N_CHANNELS; i++)
3144	if (!list_empty(head: &krwp->bulk_head_free[i]))
3145	return true;
3146
3147	return !!krwp->head_free;
3148	}
3149
3150	static int krc_count(struct kfree_rcu_cpu *krcp)
3151	{
3152	int sum = atomic_read(v: &krcp->head_count);
3153	int i;
3154
3155	for (i = `0`; i < FREE_N_CHANNELS; i++)
3156	sum += atomic_read(v: &krcp->bulk_count[i]);
3157
3158	return sum;
3159	}
3160
3161	static void
3162	schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
3163	{
3164	long delay, delay_left;
3165
3166	delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? `1`:KFREE_DRAIN_JIFFIES;
3167	if (delayed_work_pending(&krcp->monitor_work)) {
3168	delay_left = krcp->monitor_work.timer.expires - jiffies;
3169	if (delay < delay_left)
3170	mod_delayed_work(wq: system_wq, dwork: &krcp->monitor_work, delay);
3171	return;
3172	}
3173	queue_delayed_work(wq: system_wq, dwork: &krcp->monitor_work, delay);
3174	}
3175
3176	static void
3177	kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
3178	{
3179	struct list_head bulk_ready[FREE_N_CHANNELS];
3180	struct kvfree_rcu_bulk_data bnode, n;
3181	struct rcu_head *head_ready = NULL;
3182	unsigned long flags;
3183	int i;
3184
3185	raw_spin_lock_irqsave(&krcp->lock, flags);
3186	for (i = `0`; i < FREE_N_CHANNELS; i++) {
3187	INIT_LIST_HEAD(list: &bulk_ready[i]);
3188
3189	list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
3190	if (!poll_state_synchronize_rcu_full(rgosp: &bnode->gp_snap))
3191	break;
3192
3193	atomic_sub(i: bnode->nr_records, v: &krcp->bulk_count[i]);
3194	list_move(list: &bnode->list, head: &bulk_ready[i]);
3195	}
3196	}
3197
3198	if (krcp->head && poll_state_synchronize_rcu(oldstate: krcp->head_gp_snap)) {
3199	head_ready = krcp->head;
3200	atomic_set(v: &krcp->head_count, i: `0`);
3201	WRITE_ONCE(krcp->head, NULL);
3202	}
3203	raw_spin_unlock_irqrestore(&krcp->lock, flags);
3204
3205	for (i = `0`; i < FREE_N_CHANNELS; i++) {
3206	list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
3207	kvfree_rcu_bulk(krcp, bnode, idx: i);
3208	}
3209
3210	if (head_ready)
3211	kvfree_rcu_list(head: head_ready);
3212	}
3213
3214	/*
3215	* This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
3216	*/
3217	static void kfree_rcu_monitor(struct work_struct *work)
3218	{
3219	struct kfree_rcu_cpu *krcp = container_of(work,
3220	struct kfree_rcu_cpu, monitor_work.work);
3221	unsigned long flags;
3222	int i, j;
3223
3224	// Drain ready for reclaim.
3225	kvfree_rcu_drain_ready(krcp);
3226
3227	raw_spin_lock_irqsave(&krcp->lock, flags);
3228
3229	// Attempt to start a new batch.
3230	for (i = `0`; i < KFREE_N_BATCHES; i++) {
3231	struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
3232
3233	// Try to detach bulk_head or head and attach it, only when
3234	// all channels are free. Any channel is not free means at krwp
3235	// there is on-going rcu work to handle krwp's free business.
3236	if (need_wait_for_krwp_work(krwp))
3237	continue;
3238
3239	// kvfree_rcu_drain_ready() might handle this krcp, if so give up.
3240	if (need_offload_krc(krcp)) {
3241	// Channel 1 corresponds to the SLAB-pointer bulk path.
3242	// Channel 2 corresponds to vmalloc-pointer bulk path.
3243	for (j = `0`; j < FREE_N_CHANNELS; j++) {
3244	if (list_empty(head: &krwp->bulk_head_free[j])) {
3245	atomic_set(v: &krcp->bulk_count[j], i: `0`);
3246	list_replace_init(old: &krcp->bulk_head[j],
3247	new: &krwp->bulk_head_free[j]);
3248	}
3249	}
3250
3251	// Channel 3 corresponds to both SLAB and vmalloc
3252	// objects queued on the linked list.
3253	if (!krwp->head_free) {
3254	krwp->head_free = krcp->head;
3255	get_state_synchronize_rcu_full(rgosp: &krwp->head_free_gp_snap);
3256	atomic_set(v: &krcp->head_count, i: `0`);
3257	WRITE_ONCE(krcp->head, NULL);
3258	}
3259
3260	// One work is per one batch, so there are three
3261	// "free channels", the batch can handle. It can
3262	// be that the work is in the pending state when
3263	// channels have been detached following by each
3264	// other.
3265	queue_rcu_work(wq: system_wq, rwork: &krwp->rcu_work);
3266	}
3267	}
3268
3269	raw_spin_unlock_irqrestore(&krcp->lock, flags);
3270
3271	// If there is nothing to detach, it means that our job is
3272	// successfully done here. In case of having at least one
3273	// of the channels that is still busy we should rearm the
3274	// work to repeat an attempt. Because previous batches are
3275	// still in progress.
3276	if (need_offload_krc(krcp))
3277	schedule_delayed_monitor_work(krcp);
3278	}
3279
3280	static enum hrtimer_restart
3281	schedule_page_work_fn(struct hrtimer *t)
3282	{
3283	struct kfree_rcu_cpu *krcp =
3284	container_of(t, struct kfree_rcu_cpu, hrtimer);
3285
3286	queue_delayed_work(wq: system_highpri_wq, dwork: &krcp->page_cache_work, delay: `0`);
3287	return HRTIMER_NORESTART;
3288	}
3289
3290	static void fill_page_cache_func(struct work_struct *work)
3291	{
3292	struct kvfree_rcu_bulk_data *bnode;
3293	struct kfree_rcu_cpu *krcp =
3294	container_of(work, struct kfree_rcu_cpu,
3295	page_cache_work.work);
3296	unsigned long flags;
3297	int nr_pages;
3298	bool pushed;
3299	int i;
3300
3301	nr_pages = atomic_read(v: &krcp->backoff_page_cache_fill) ?
3302	`1` : rcu_min_cached_objs;
3303
3304	for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
3305	bnode = (struct kvfree_rcu_bulk_data *)
3306	__get_free_page(GFP_KERNEL \| __GFP_NORETRY \| __GFP_NOMEMALLOC \| __GFP_NOWARN);
3307
3308	if (!bnode)
3309	break;
3310
3311	raw_spin_lock_irqsave(&krcp->lock, flags);
3312	pushed = put_cached_bnode(krcp, bnode);
3313	raw_spin_unlock_irqrestore(&krcp->lock, flags);
3314
3315	if (!pushed) {
3316	free_page((unsigned long) bnode);
3317	break;
3318	}
3319	}
3320
3321	atomic_set(v: &krcp->work_in_progress, i: `0`);
3322	atomic_set(v: &krcp->backoff_page_cache_fill, i: `0`);
3323	}
3324
3325	static void
3326	run_page_cache_worker(struct kfree_rcu_cpu *krcp)
3327	{
3328	// If cache disabled, bail out.
3329	if (!rcu_min_cached_objs)
3330	return;
3331
3332	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
3333	!atomic_xchg(v: &krcp->work_in_progress, new: `1`)) {
3334	if (atomic_read(v: &krcp->backoff_page_cache_fill)) {
3335	queue_delayed_work(wq: system_wq,
3336	dwork: &krcp->page_cache_work,
3337	delay: msecs_to_jiffies(m: rcu_delay_page_cache_fill_msec));
3338	} else {
3339	hrtimer_init(timer: &krcp->hrtimer, CLOCK_MONOTONIC, mode: HRTIMER_MODE_REL);
3340	krcp->hrtimer.function = schedule_page_work_fn;
3341	hrtimer_start(timer: &krcp->hrtimer, tim: `0`, mode: HRTIMER_MODE_REL);
3342	}
3343	}
3344	}
3345
3346	// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
3347	// state specified by flags. If can_alloc is true, the caller must
3348	// be schedulable and not be holding any locks or mutexes that might be
3349	// acquired by the memory allocator or anything that it might invoke.
3350	// Returns true if ptr was successfully recorded, else the caller must
3351	// use a fallback.
3352	static inline bool
3353	add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
3354	unsigned long flags, void* *ptr, bool can_alloc)
3355	{
3356	struct kvfree_rcu_bulk_data *bnode;
3357	int idx;
3358
3359	*krcp = krc_this_cpu_lock(flags);
3360	if (unlikely(!(*krcp)->initialized))
3361	return false;
3362
3363	idx = !!is_vmalloc_addr(x: ptr);
3364	bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
3365	struct kvfree_rcu_bulk_data, list);
3366
3367	/ Check if a new block is required. /
3368	if (!bnode \|\| bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
3369	bnode = get_cached_bnode(krcp: *krcp);
3370	if (!bnode && can_alloc) {
3371	krc_this_cpu_unlock(krcp: krcp, flags: flags);
3372
3373	// __GFP_NORETRY - allows a light-weight direct reclaim
3374	// what is OK from minimizing of fallback hitting point of
3375	// view. Apart of that it forbids any OOM invoking what is
3376	// also beneficial since we are about to release memory soon.
3377	//
3378	// __GFP_NOMEMALLOC - prevents from consuming of all the
3379	// memory reserves. Please note we have a fallback path.
3380	//
3381	// __GFP_NOWARN - it is supposed that an allocation can
3382	// be failed under low memory or high memory pressure
3383	// scenarios.
3384	bnode = (struct kvfree_rcu_bulk_data *)
3385	__get_free_page(GFP_KERNEL \| __GFP_NORETRY \| __GFP_NOMEMALLOC \| __GFP_NOWARN);
3386	raw_spin_lock_irqsave(&(krcp)->lock, flags);
3387	}
3388
3389	if (!bnode)
3390	return false;
3391
3392	// Initialize the new block and attach it.
3393	bnode->nr_records = `0`;
3394	list_add(new: &bnode->list, head: &(*krcp)->bulk_head[idx]);
3395	}
3396
3397	// Finally insert and update the GP for this page.
3398	bnode->records[bnode->nr_records++] = ptr;
3399	get_state_synchronize_rcu_full(rgosp: &bnode->gp_snap);
3400	atomic_inc(v: &(*krcp)->bulk_count[idx]);
3401
3402	return true;
3403	}
3404
3405	/*
3406	* Queue a request for lazy invocation of the appropriate free routine
3407	* after a grace period. Please note that three paths are maintained,
3408	* two for the common case using arrays of pointers and a third one that
3409	* is used only when the main paths cannot be used, for example, due to
3410	* memory pressure.
3411	*
3412	* Each kvfree_call_rcu() request is added to a batch. The batch will be drained
3413	* every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
3414	* be free'd in workqueue context. This allows us to: batch requests together to
3415	* reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
3416	*/
3417	void kvfree_call_rcu(struct rcu_head head, void* *ptr)
3418	{
3419	unsigned long flags;
3420	struct kfree_rcu_cpu *krcp;
3421	bool success;
3422
3423	/*
3424	* Please note there is a limitation for the head-less
3425	* variant, that is why there is a clear rule for such
3426	* objects: it can be used from might_sleep() context
3427	* only. For other places please embed an rcu_head to
3428	* your data.
3429	*/
3430	if (!head)
3431	might_sleep();
3432
3433	// Queue the object but don't yet schedule the batch.
3434	if (debug_rcu_head_queue(head: ptr)) {
3435	// Probable double kfree_rcu(), just leak.
3436	WARN_ONCE(`1`, "%s(): Double-freed call. rcu_head %p\n",
3437	__func__, head);
3438
3439	// Mark as success and leave.
3440	return;
3441	}
3442
3443	kasan_record_aux_stack_noalloc(ptr);
3444	success = add_ptr_to_bulk_krc_lock(krcp: &krcp, flags: &flags, ptr, can_alloc: !head);
3445	if (!success) {
3446	run_page_cache_worker(krcp);
3447
3448	if (head == NULL)
3449	// Inline if kvfree_rcu(one_arg) call.
3450	goto unlock_return;
3451
3452	head->func = ptr;
3453	head->next = krcp->head;
3454	WRITE_ONCE(krcp->head, head);
3455	atomic_inc(v: &krcp->head_count);
3456
3457	// Take a snapshot for this krcp.
3458	krcp->head_gp_snap = get_state_synchronize_rcu();
3459	success = true;
3460	}
3461
3462	/*
3463	* The kvfree_rcu() caller considers the pointer freed at this point
3464	* and likely removes any references to it. Since the actual slab
3465	* freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
3466	* this object (no scanning or false positives reporting).
3467	*/
3468	kmemleak_ignore(ptr);
3469
3470	// Set timer to drain after KFREE_DRAIN_JIFFIES.
3471	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
3472	schedule_delayed_monitor_work(krcp);
3473
3474	unlock_return:
3475	krc_this_cpu_unlock(krcp, flags);
3476
3477	/*
3478	* Inline kvfree() after synchronize_rcu(). We can do
3479	* it from might_sleep() context only, so the current
3480	* CPU can pass the QS state.
3481	*/
3482	if (!success) {
3483	debug_rcu_head_unqueue(head: (struct rcu_head *) ptr);
3484	synchronize_rcu();
3485	kvfree(addr: ptr);
3486	}
3487	}
3488	EXPORT_SYMBOL_GPL(kvfree_call_rcu);
3489
3490	static unsigned long
3491	kfree_rcu_shrink_count(struct shrinker shrink, struct* shrink_control *sc)
3492	{
3493	int cpu;
3494	unsigned long count = `0`;
3495
3496	/ Snapshot count of all CPUs /
3497	for_each_possible_cpu(cpu) {
3498	struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
3499
3500	count += krc_count(krcp);
3501	count += READ_ONCE(krcp->nr_bkv_objs);
3502	atomic_set(v: &krcp->backoff_page_cache_fill, i: `1`);
3503	}
3504
3505	return count == `0` ? SHRINK_EMPTY : count;
3506	}
3507
3508	static unsigned long
3509	kfree_rcu_shrink_scan(struct shrinker shrink, struct* shrink_control *sc)
3510	{
3511	int cpu, freed = `0`;
3512
3513	for_each_possible_cpu(cpu) {
3514	int count;
3515	struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
3516
3517	count = krc_count(krcp);
3518	count += drain_page_cache(krcp);
3519	kfree_rcu_monitor(work: &krcp->monitor_work.work);
3520
3521	sc->nr_to_scan -= count;
3522	freed += count;
3523
3524	if (sc->nr_to_scan <= `0`)
3525	break;
3526	}
3527
3528	return freed == `0` ? SHRINK_STOP : freed;
3529	}
3530
3531	void __init kfree_rcu_scheduler_running(void)
3532	{
3533	int cpu;
3534
3535	for_each_possible_cpu(cpu) {
3536	struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
3537
3538	if (need_offload_krc(krcp))
3539	schedule_delayed_monitor_work(krcp);
3540	}
3541	}
3542
3543	/*
3544	* During early boot, any blocking grace-period wait automatically
3545	* implies a grace period.
3546	*
3547	* Later on, this could in theory be the case for kernels built with
3548	* CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this
3549	* is not a common case. Furthermore, this optimization would cause
3550	* the rcu_gp_oldstate structure to expand by 50%, so this potential
3551	* grace-period optimization is ignored once the scheduler is running.
3552	*/
3553	static int rcu_blocking_is_gp(void)
3554	{
3555	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) {
3556	might_sleep();
3557	return false;
3558	}
3559	return true;
3560	}
3561
3562	/**
3563	* synchronize_rcu - wait until a grace period has elapsed.
3564	*
3565	* Control will return to the caller some time after a full grace
3566	* period has elapsed, in other words after all currently executing RCU
3567	* read-side critical sections have completed. Note, however, that
3568	* upon return from synchronize_rcu(), the caller might well be executing
3569	* concurrently with new RCU read-side critical sections that began while
3570	* synchronize_rcu() was waiting.
3571	*
3572	* RCU read-side critical sections are delimited by rcu_read_lock()
3573	* and rcu_read_unlock(), and may be nested. In addition, but only in
3574	* v5.0 and later, regions of code across which interrupts, preemption,
3575	* or softirqs have been disabled also serve as RCU read-side critical
3576	* sections. This includes hardware interrupt handlers, softirq handlers,
3577	* and NMI handlers.
3578	*
3579	* Note that this guarantee implies further memory-ordering guarantees.
3580	* On systems with more than one CPU, when synchronize_rcu() returns,
3581	* each CPU is guaranteed to have executed a full memory barrier since
3582	* the end of its last RCU read-side critical section whose beginning
3583	* preceded the call to synchronize_rcu(). In addition, each CPU having
3584	* an RCU read-side critical section that extends beyond the return from
3585	* synchronize_rcu() is guaranteed to have executed a full memory barrier
3586	* after the beginning of synchronize_rcu() and before the beginning of
3587	* that RCU read-side critical section. Note that these guarantees include
3588	* CPUs that are offline, idle, or executing in user mode, as well as CPUs
3589	* that are executing in the kernel.
3590	*
3591	* Furthermore, if CPU A invoked synchronize_rcu(), which returned
3592	* to its caller on CPU B, then both CPU A and CPU B are guaranteed
3593	* to have executed a full memory barrier during the execution of
3594	* synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but
3595	* again only if the system has more than one CPU).
3596	*
3597	* Implementation of these memory-ordering guarantees is described here:
3598	* Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
3599	*/
3600	void synchronize_rcu(void)
3601	{
3602	unsigned long flags;
3603	struct rcu_node *rnp;
3604
3605	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) \|\|
3606	lock_is_held(&rcu_lock_map) \|\|
3607	lock_is_held(&rcu_sched_lock_map),
3608	"Illegal synchronize_rcu() in RCU read-side critical section");
3609	if (!rcu_blocking_is_gp()) {
3610	if (rcu_gp_is_expedited())
3611	synchronize_rcu_expedited();
3612	else
3613	wait_rcu_gp(call_rcu_hurry);
3614	return;
3615	}
3616
3617	// Context allows vacuous grace periods.
3618	// Note well that this code runs with !PREEMPT && !SMP.
3619	// In addition, all code that advances grace periods runs at
3620	// process level. Therefore, this normal GP overlaps with other
3621	// normal GPs only by being fully nested within them, which allows
3622	// reuse of ->gp_seq_polled_snap.
3623	rcu_poll_gp_seq_start_unlocked(snap: &rcu_state.gp_seq_polled_snap);
3624	rcu_poll_gp_seq_end_unlocked(snap: &rcu_state.gp_seq_polled_snap);
3625
3626	// Update the normal grace-period counters to record
3627	// this grace period, but only those used by the boot CPU.
3628	// The rcu_scheduler_starting() will take care of the rest of
3629	// these counters.
3630	local_irq_save(flags);
3631	WARN_ON_ONCE(num_online_cpus() > `1`);
3632	rcu_state.gp_seq += (`1` << RCU_SEQ_CTR_SHIFT);
3633	for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent)
3634	rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
3635	local_irq_restore(flags);
3636	}
3637	EXPORT_SYMBOL_GPL(synchronize_rcu);
3638
3639	/**
3640	* get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie
3641	* @rgosp: Place to put state cookie
3642	*
3643	* Stores into @rgosp a value that will always be treated by functions
3644	* like poll_state_synchronize_rcu_full() as a cookie whose grace period
3645	* has already completed.
3646	*/
3647	void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
3648	{
3649	rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
3650	rgosp->rgos_exp = RCU_GET_STATE_COMPLETED;
3651	}
3652	EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
3653
3654	/**
3655	* get_state_synchronize_rcu - Snapshot current RCU state
3656	*
3657	* Returns a cookie that is used by a later call to cond_synchronize_rcu()
3658	* or poll_state_synchronize_rcu() to determine whether or not a full
3659	* grace period has elapsed in the meantime.
3660	*/
3661	unsigned long get_state_synchronize_rcu(void)
3662	{
3663	/*
3664	* Any prior manipulation of RCU-protected data must happen
3665	* before the load from ->gp_seq.
3666	*/
3667	smp_mb(); / ^^^ /
3668	return rcu_seq_snap(sp: &rcu_state.gp_seq_polled);
3669	}
3670	EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
3671
3672	/**
3673	* get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited
3674	* @rgosp: location to place combined normal/expedited grace-period state
3675	*
3676	* Places the normal and expedited grace-period states in @rgosp. This
3677	* state value can be passed to a later call to cond_synchronize_rcu_full()
3678	* or poll_state_synchronize_rcu_full() to determine whether or not a
3679	* grace period (whether normal or expedited) has elapsed in the meantime.
3680	* The rcu_gp_oldstate structure takes up twice the memory of an unsigned
3681	* long, but is guaranteed to see all grace periods. In contrast, the
3682	* combined state occupies less memory, but can sometimes fail to take
3683	* grace periods into account.
3684	*
3685	* This does not guarantee that the needed grace period will actually
3686	* start.
3687	*/
3688	void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
3689	{
3690	struct rcu_node *rnp = rcu_get_root();
3691
3692	/*
3693	* Any prior manipulation of RCU-protected data must happen
3694	* before the loads from ->gp_seq and ->expedited_sequence.
3695	*/
3696	smp_mb(); / ^^^ /
3697	rgosp->rgos_norm = rcu_seq_snap(sp: &rnp->gp_seq);
3698	rgosp->rgos_exp = rcu_seq_snap(sp: &rcu_state.expedited_sequence);
3699	}
3700	EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
3701
3702	/*
3703	* Helper function for start_poll_synchronize_rcu() and
3704	* start_poll_synchronize_rcu_full().
3705	*/
3706	static void start_poll_synchronize_rcu_common(void)
3707	{
3708	unsigned long flags;
3709	bool needwake;
3710	struct rcu_data *rdp;
3711	struct rcu_node *rnp;
3712
3713	lockdep_assert_irqs_enabled();
3714	local_irq_save(flags);
3715	rdp = this_cpu_ptr(&rcu_data);
3716	rnp = rdp->mynode;
3717	raw_spin_lock_rcu_node(rnp); // irqs already disabled.
3718	// Note it is possible for a grace period to have elapsed between
3719	// the above call to get_state_synchronize_rcu() and the below call
3720	// to rcu_seq_snap. This is OK, the worst that happens is that we
3721	// get a grace period that no one needed. These accesses are ordered
3722	// by smp_mb(), and we are accessing them in the opposite order
3723	// from which they are updated at grace-period start, as required.
3724	needwake = rcu_start_this_gp(rnp_start: rnp, rdp, gp_seq_req: rcu_seq_snap(sp: &rcu_state.gp_seq));
3725	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3726	if (needwake)
3727	rcu_gp_kthread_wake();
3728	}
3729
3730	/**
3731	* start_poll_synchronize_rcu - Snapshot and start RCU grace period
3732	*
3733	* Returns a cookie that is used by a later call to cond_synchronize_rcu()
3734	* or poll_state_synchronize_rcu() to determine whether or not a full
3735	* grace period has elapsed in the meantime. If the needed grace period
3736	* is not already slated to start, notifies RCU core of the need for that
3737	* grace period.
3738	*
3739	* Interrupts must be enabled for the case where it is necessary to awaken
3740	* the grace-period kthread.
3741	*/
unsigned long start_poll_synchronize_rcu(void)
{
	// Take the cookie before asking for the grace period to start.
	unsigned long cookie = get_state_synchronize_rcu();

	start_poll_synchronize_rcu_common();

	return cookie;
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
3750
3751	/**
3752	* start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period
3753	* @rgosp: Place to put the normal and expedited grace-period state snapshot
3754	*
3755	* Places the normal and expedited grace-period states in *@rgosp. This
3756	* state value can be passed to a later call to cond_synchronize_rcu_full()
3757	* or poll_state_synchronize_rcu_full() to determine whether or not a
3758	* grace period (whether normal or expedited) has elapsed in the meantime.
3759	* If the needed grace period is not already slated to start, notifies
3760	* RCU core of the need for that grace period.
3761	*
3762	* Interrupts must be enabled for the case where it is necessary to awaken
3763	* the grace-period kthread.
3764	*/
void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
	// Fill in the caller's cookie first, then request the grace period.
	get_state_synchronize_rcu_full(rgosp);
	start_poll_synchronize_rcu_common();
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
3772
3773	/**
3774	* poll_state_synchronize_rcu - Has the specified RCU grace period completed?
3775	* @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
3776	*
3777	* If a full RCU grace period has elapsed since the earlier call from
3778	* which @oldstate was obtained, return @true, otherwise return @false.
3779	* If @false is returned, it is the caller's responsibility to invoke this
3780	* function later on until it does return @true. Alternatively, the caller
3781	* can explicitly wait for a grace period, for example, by passing @oldstate
3782	* to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited()
3783	* on the one hand or by directly invoking either synchronize_rcu() or
3784	* synchronize_rcu_expedited() on the other.
3785	*
3786	* Yes, this function does not take counter wrap into account.
3787	* But counter wrap is harmless. If the counter wraps, we have waited for
3788	* more than a billion grace periods (and way more on a 64-bit system!).
3789	* Those needing to keep old state values for very long time periods
3790	* (many hours even on 32-bit systems) should check them occasionally and
3791	* either refresh them or set a flag indicating that the grace period has
3792	* completed. Alternatively, they can use get_completed_synchronize_rcu()
3793	* to get a guaranteed-completed grace-period state.
3794	*
3795	* In addition, because oldstate compresses the grace-period state for
3796	* both normal and expedited grace periods into a single unsigned long,
3797	* it can miss a grace period when synchronize_rcu() runs concurrently
3798	* with synchronize_rcu_expedited(). If this is unacceptable, please
3799	* instead use the _full() variant of these polling APIs.
3800	*
3801	* This function provides the same memory-ordering guarantees that
3802	* would be provided by a synchronize_rcu() that was invoked at the call
3803	* to the function that provided @oldstate, and that returned at the end
3804	* of this function.
3805	*/
3806	bool poll_state_synchronize_rcu(unsigned long oldstate)
3807	{
3808	if (oldstate == RCU_GET_STATE_COMPLETED \|\|
3809	rcu_seq_done_exact(sp: &rcu_state.gp_seq_polled, s: oldstate)) {
3810	smp_mb(); / Ensure GP ends before subsequent accesses. /
3811	return true;
3812	}
3813	return false;
3814	}
3815	EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
3816
3817	/**
3818	* poll_state_synchronize_rcu_full - Has the specified RCU grace period completed?
3819	* @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
3820	*
3821	* If a full RCU grace period has elapsed since the earlier call from
3822	* which *rgosp was obtained, return @true, otherwise return @false.
3823	* If @false is returned, it is the caller's responsibility to invoke this
3824	* function later on until it does return @true. Alternatively, the caller
3825	* can explicitly wait for a grace period, for example, by passing @rgosp
3826	* to cond_synchronize_rcu_full() or by directly invoking synchronize_rcu().
3827	*
3828	* Yes, this function does not take counter wrap into account.
3829	* But counter wrap is harmless. If the counter wraps, we have waited
3830	* for more than a billion grace periods (and way more on a 64-bit
3831	* system!). Those needing to keep rcu_gp_oldstate values for very
3832	* long time periods (many hours even on 32-bit systems) should check
3833	* them occasionally and either refresh them or set a flag indicating
3834	* that the grace period has completed. Alternatively, they can use
3835	* get_completed_synchronize_rcu_full() to get a guaranteed-completed
3836	* grace-period state.
3837	*
3838	* This function provides the same memory-ordering guarantees that would
3839	* be provided by a synchronize_rcu() that was invoked at the call to
3840	* the function that provided @rgosp, and that returned at the end of this
3841	* function. And this guarantee requires that the root rcu_node structure's
3842	* ->gp_seq field be checked instead of that of the rcu_state structure.
3843	* The problem is that the just-ending grace-period's callbacks can be
3844	* invoked between the time that the root rcu_node structure's ->gp_seq
3845	* field is updated and the time that the rcu_state structure's ->gp_seq
3846	* field is updated. Therefore, if a single synchronize_rcu() is to
3847	* cause a subsequent poll_state_synchronize_rcu_full() to return @true,
3848	* then the root rcu_node structure is the one that needs to be polled.
3849	*/
3850	bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
3851	{
3852	struct rcu_node *rnp = rcu_get_root();
3853
3854	smp_mb(); // Order against root rcu_node structure grace-period cleanup.
3855	if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED \|\|
3856	rcu_seq_done_exact(sp: &rnp->gp_seq, s: rgosp->rgos_norm) \|\|
3857	rgosp->rgos_exp == RCU_GET_STATE_COMPLETED \|\|
3858	rcu_seq_done_exact(sp: &rcu_state.expedited_sequence, s: rgosp->rgos_exp)) {
3859	smp_mb(); / Ensure GP ends before subsequent accesses. /
3860	return true;
3861	}
3862	return false;
3863	}
3864	EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full);
3865
3866	/**
3867	* cond_synchronize_rcu - Conditionally wait for an RCU grace period
3868	* @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
3869	*
3870	* If a full RCU grace period has elapsed since the earlier call to
3871	* get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
3872	* Otherwise, invoke synchronize_rcu() to wait for a full grace period.
3873	*
3874	* Yes, this function does not take counter wrap into account.
3875	* But counter wrap is harmless. If the counter wraps, we have waited for
3876	* more than 2 billion grace periods (and way more on a 64-bit system!),
3877	* so waiting for a couple of additional grace periods should be just fine.
3878	*
3879	* This function provides the same memory-ordering guarantees that
3880	* would be provided by a synchronize_rcu() that was invoked at the call
3881	* to the function that provided @oldstate and that returned at the end
3882	* of this function.
3883	*/
void cond_synchronize_rcu(unsigned long oldstate)
{
	// The cookie's grace period already ended: nothing to wait for.
	if (poll_state_synchronize_rcu(oldstate))
		return;

	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
3890
3891	/**
3892	* cond_synchronize_rcu_full - Conditionally wait for an RCU grace period
3893	* @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
3894	*
3895	* If a full RCU grace period has elapsed since the call to
3896	* get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
3897	* or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
3898	* obtained, just return. Otherwise, invoke synchronize_rcu() to wait
3899	* for a full grace period.
3900	*
3901	* Yes, this function does not take counter wrap into account.
3902	* But counter wrap is harmless. If the counter wraps, we have waited for
3903	* more than 2 billion grace periods (and way more on a 64-bit system!),
3904	* so waiting for a couple of additional grace periods should be just fine.
3905	*
3906	* This function provides the same memory-ordering guarantees that
3907	* would be provided by a synchronize_rcu() that was invoked at the call
3908	* to the function that provided @rgosp and that returned at the end of
3909	* this function.
3910	*/
void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
	// The cookie's grace period already ended: nothing to wait for.
	if (poll_state_synchronize_rcu_full(rgosp))
		return;

	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full);
3917
3918	/*
3919	* Check to see if there is any immediate RCU-related work to be done by
3920	* the current CPU, returning 1 if so and zero otherwise. The checks are
3921	* in order of increasing expense: checks that can be carried out against
3922	* CPU-local state are performed first. However, we must check for CPU
3923	* stalls first, else we might not get a chance.
3924	*/
3925	static int rcu_pending(int user)
3926	{
3927	bool gp_in_progress;
3928	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
3929	struct rcu_node *rnp = rdp->mynode;
3930
3931	lockdep_assert_irqs_disabled();
3932
3933	/ Check for CPU stalls, if enabled. /
3934	check_cpu_stall(rdp);
3935
3936	/ Does this CPU need a deferred NOCB wakeup? /
3937	if (rcu_nocb_need_deferred_wakeup(rdp, RCU_NOCB_WAKE))
3938	return `1`;
3939
3940	/ Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) /
3941	if ((user \|\| rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu())
3942	return `0`;
3943
3944	/ Is the RCU core waiting for a quiescent state from this CPU? /
3945	gp_in_progress = rcu_gp_in_progress();
3946	if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress)
3947	return `1`;
3948
3949	/ Does this CPU have callbacks ready to invoke? /
3950	if (!rcu_rdp_is_offloaded(rdp) &&
3951	rcu_segcblist_ready_cbs(rsclp: &rdp->cblist))
3952	return `1`;
3953
3954	/ Has RCU gone idle with this CPU needing another grace period? /
3955	if (!gp_in_progress && rcu_segcblist_is_enabled(rsclp: &rdp->cblist) &&
3956	!rcu_rdp_is_offloaded(rdp) &&
3957	!rcu_segcblist_restempty(rsclp: &rdp->cblist, RCU_NEXT_READY_TAIL))
3958	return `1`;
3959
3960	/ Have RCU grace period completed or started? /
3961	if (rcu_seq_current(sp: &rnp->gp_seq) != rdp->gp_seq \|\|
3962	unlikely(READ_ONCE(rdp->gpwrap))) / outside lock /
3963	return `1`;
3964
3965	/ nothing to do /
3966	return `0`;
3967	}
3968
3969	/*
3970	* Helper function for rcu_barrier() tracing. If tracing is disabled,
3971	* the compiler is expected to optimize this away.
3972	*/
3973	static void rcu_barrier_trace(const char s, int* cpu, unsigned long done)
3974	{
3975	trace_rcu_barrier(rcuname: rcu_state.name, s, cpu,
3976	cnt: atomic_read(v: &rcu_state.barrier_cpu_count), done);
3977	}
3978
3979	/*
3980	* RCU callback function for rcu_barrier(). If we are last, wake
3981	* up the task executing rcu_barrier().
3982	*
3983	* Note that the value of rcu_state.barrier_sequence must be captured
3984	* before the atomic_dec_and_test(). Otherwise, if this CPU is not last,
3985	* other CPUs might count the value down to zero before this CPU gets
3986	* around to invoking rcu_barrier_trace(), which might result in bogus
3987	* data from the next instance of rcu_barrier().
3988	*/
3989	static void rcu_barrier_callback(struct rcu_head *rhp)
3990	{
3991	unsigned long __maybe_unused s = rcu_state.barrier_sequence;
3992
3993	if (atomic_dec_and_test(v: &rcu_state.barrier_cpu_count)) {
3994	rcu_barrier_trace(TPS("LastCB"), cpu: -`1`, done: s);
3995	complete(&rcu_state.barrier_completion);
3996	} else {
3997	rcu_barrier_trace(TPS("CB"), cpu: -`1`, done: s);
3998	}
3999	}
4000
4001	/*
4002	* If needed, entrain an rcu_barrier() callback on rdp->cblist.
4003	*/
4004	static void rcu_barrier_entrain(struct rcu_data *rdp)
4005	{
4006	unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
4007	unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
4008	bool wake_nocb = false;
4009	bool was_alldone = false;
4010
4011	lockdep_assert_held(&rcu_state.barrier_lock);
4012	if (rcu_seq_state(s: lseq) \|\| !rcu_seq_state(s: gseq) \|\| rcu_seq_ctr(s: lseq) != rcu_seq_ctr(s: gseq))
4013	return;
4014	rcu_barrier_trace(TPS("IRQ"), cpu: -`1`, done: rcu_state.barrier_sequence);
4015	rdp->barrier_head.func = rcu_barrier_callback;
4016	debug_rcu_head_queue(head: &rdp->barrier_head);
4017	rcu_nocb_lock(rdp);
4018	/*
4019	* Flush bypass and wakeup rcuog if we add callbacks to an empty regular
4020	* queue. This way we don't wait for bypass timer that can reach seconds
4021	* if it's fully lazy.
4022	*/
4023	was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(rsclp: &rdp->cblist);
4024	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false));
4025	wake_nocb = was_alldone && rcu_segcblist_pend_cbs(rsclp: &rdp->cblist);
4026	if (rcu_segcblist_entrain(rsclp: &rdp->cblist, rhp: &rdp->barrier_head)) {
4027	atomic_inc(v: &rcu_state.barrier_cpu_count);
4028	} else {
4029	debug_rcu_head_unqueue(head: &rdp->barrier_head);
4030	rcu_barrier_trace(TPS("IRQNQ"), cpu: -`1`, done: rcu_state.barrier_sequence);
4031	}
4032	rcu_nocb_unlock(rdp);
4033	if (wake_nocb)
4034	wake_nocb_gp(rdp, force: false);
4035	smp_store_release(&rdp->barrier_seq_snap, gseq);
4036	}
4037
4038	/*
4039	* Called with preemption disabled, and from cross-cpu IRQ context.
4040	*/
4041	static void rcu_barrier_handler(void *cpu_in)
4042	{
4043	uintptr_t cpu = (uintptr_t)cpu_in;
4044	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4045
4046	lockdep_assert_irqs_disabled();
4047	WARN_ON_ONCE(cpu != rdp->cpu);
4048	WARN_ON_ONCE(cpu != smp_processor_id());
4049	raw_spin_lock(&rcu_state.barrier_lock);
4050	rcu_barrier_entrain(rdp);
4051	raw_spin_unlock(&rcu_state.barrier_lock);
4052	}
4053
4054	/**
4055	* rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
4056	*
4057	* Note that this primitive does not necessarily wait for an RCU grace period
4058	* to complete. For example, if there are no RCU callbacks queued anywhere
4059	* in the system, then rcu_barrier() is within its rights to return
4060	* immediately, without waiting for anything, much less an RCU grace period.
4061	*/
4062	void rcu_barrier(void)
4063	{
4064	uintptr_t cpu;
4065	unsigned long flags;
4066	unsigned long gseq;
4067	struct rcu_data *rdp;
4068	unsigned long s = rcu_seq_snap(sp: &rcu_state.barrier_sequence);
4069
4070	rcu_barrier_trace(TPS("Begin"), cpu: -`1`, done: s);
4071
4072	/ Take mutex to serialize concurrent rcu_barrier() requests. /
4073	mutex_lock(&rcu_state.barrier_mutex);
4074
4075	/ Did someone else do our work for us? /
4076	if (rcu_seq_done(sp: &rcu_state.barrier_sequence, s)) {
4077	rcu_barrier_trace(TPS("EarlyExit"), cpu: -`1`, done: rcu_state.barrier_sequence);
4078	smp_mb(); / caller's subsequent code after above check. /
4079	mutex_unlock(lock: &rcu_state.barrier_mutex);
4080	return;
4081	}
4082
4083	/ Mark the start of the barrier operation. /
4084	raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
4085	rcu_seq_start(sp: &rcu_state.barrier_sequence);
4086	gseq = rcu_state.barrier_sequence;
4087	rcu_barrier_trace(TPS("Inc1"), cpu: -`1`, done: rcu_state.barrier_sequence);
4088
4089	/*
4090	* Initialize the count to two rather than to zero in order
4091	* to avoid a too-soon return to zero in case of an immediate
4092	* invocation of the just-enqueued callback (or preemption of
4093	* this task). Exclude CPU-hotplug operations to ensure that no
4094	* offline non-offloaded CPU has callbacks queued.
4095	*/
4096	init_completion(x: &rcu_state.barrier_completion);
4097	atomic_set(v: &rcu_state.barrier_cpu_count, i: `2`);
4098	raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
4099
4100	/*
4101	* Force each CPU with callbacks to register a new callback.
4102	* When that callback is invoked, we will know that all of the
4103	* corresponding CPU's preceding callbacks have been invoked.
4104	*/
4105	for_each_possible_cpu(cpu) {
4106	rdp = per_cpu_ptr(&rcu_data, cpu);
4107	retry:
4108	if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq)
4109	continue;
4110	raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
4111	if (!rcu_segcblist_n_cbs(rsclp: &rdp->cblist)) {
4112	WRITE_ONCE(rdp->barrier_seq_snap, gseq);
4113	raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
4114	rcu_barrier_trace(TPS("NQ"), cpu, done: rcu_state.barrier_sequence);
4115	continue;
4116	}
4117	if (!rcu_rdp_cpu_online(rdp)) {
4118	rcu_barrier_entrain(rdp);
4119	WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
4120	raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
4121	rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, done: rcu_state.barrier_sequence);
4122	continue;
4123	}
4124	raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
4125	if (smp_call_function_single(cpuid: cpu, func: rcu_barrier_handler, info: (void *)cpu, wait: `1`)) {
4126	schedule_timeout_uninterruptible(timeout: `1`);
4127	goto retry;
4128	}
4129	WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
4130	rcu_barrier_trace(TPS("OnlineQ"), cpu, done: rcu_state.barrier_sequence);
4131	}
4132
4133	/*
4134	* Now that we have an rcu_barrier_callback() callback on each
4135	* CPU, and thus each counted, remove the initial count.
4136	*/
4137	if (atomic_sub_and_test(i: `2`, v: &rcu_state.barrier_cpu_count))
4138	complete(&rcu_state.barrier_completion);
4139
4140	/ Wait for all rcu_barrier_callback() callbacks to be invoked. /
4141	wait_for_completion(&rcu_state.barrier_completion);
4142
4143	/ Mark the end of the barrier operation. /
4144	rcu_barrier_trace(TPS("Inc2"), cpu: -`1`, done: rcu_state.barrier_sequence);
4145	rcu_seq_end(sp: &rcu_state.barrier_sequence);
4146	gseq = rcu_state.barrier_sequence;
4147	for_each_possible_cpu(cpu) {
4148	rdp = per_cpu_ptr(&rcu_data, cpu);
4149
4150	WRITE_ONCE(rdp->barrier_seq_snap, gseq);
4151	}
4152
4153	/ Other rcu_barrier() invocations can now safely proceed. /
4154	mutex_unlock(lock: &rcu_state.barrier_mutex);
4155	}
4156	EXPORT_SYMBOL_GPL(rcu_barrier);
4157
4158	static unsigned long rcu_barrier_last_throttle;
4159
4160	/**
4161	* rcu_barrier_throttled - Do rcu_barrier(), but limit to one per second
4162	*
4163	* This can be thought of as guard rails around rcu_barrier() that
4164	* permits unrestricted userspace use, at least assuming the hardware's
4165	* try_cmpxchg() is robust. There will be at most one call per second to
4166	* rcu_barrier() system-wide from use of this function, which means that
4167	* callers might needlessly wait a second or three.
4168	*
4169	* This is intended for use by test suites to avoid OOM by flushing RCU
4170	* callbacks from the previous test before starting the next. See the
4171	* rcutree.do_rcu_barrier module parameter for more information.
4172	*
4173	* Why not simply make rcu_barrier() more scalable? That might be
4174	* the eventual endpoint, but let's keep it simple for the time being.
4175	* Note that the module parameter infrastructure serializes calls to a
4176	* given .set() function, but should concurrent .set() invocation ever be
4177	* possible, we are ready!
4178	*/
4179	static void rcu_barrier_throttled(void)
4180	{
4181	unsigned long j = jiffies;
4182	unsigned long old = READ_ONCE(rcu_barrier_last_throttle);
4183	unsigned long s = rcu_seq_snap(sp: &rcu_state.barrier_sequence);
4184
4185	while (time_in_range(j, old, old + HZ / `16`) \|\|
4186	!try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) {
4187	schedule_timeout_idle(HZ / `16`);
4188	if (rcu_seq_done(sp: &rcu_state.barrier_sequence, s)) {
4189	smp_mb(); / caller's subsequent code after above check. /
4190	return;
4191	}
4192	j = jiffies;
4193	old = READ_ONCE(rcu_barrier_last_throttle);
4194	}
4195	rcu_barrier();
4196	}
4197
4198	/*
4199	* Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier
4200	* request arrives. We insist on a true value to allow for possible
4201	* future expansion.
4202	*/
4203	static int param_set_do_rcu_barrier(const char val, const* struct kernel_param *kp)
4204	{
4205	bool b;
4206	int ret;
4207
4208	if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
4209	return -EAGAIN;
4210	ret = kstrtobool(s: val, res: &b);
4211	if (!ret && b) {
4212	atomic_inc(v: (atomic_t *)kp->arg);
4213	rcu_barrier_throttled();
4214	atomic_dec(v: (atomic_t *)kp->arg);
4215	}
4216	return ret;
4217	}
4218
4219	/*
4220	* Output the number of outstanding rcutree.do_rcu_barrier requests.
4221	*/
4222	static int param_get_do_rcu_barrier(char buffer, const* struct kernel_param *kp)
4223	{
4224	return sprintf(buf: buffer, fmt: "%d\n", atomic_read(v: (atomic_t *)kp->arg));
4225	}
4226
4227	static const struct kernel_param_ops do_rcu_barrier_ops = {
4228	.set = param_set_do_rcu_barrier,
4229	.get = param_get_do_rcu_barrier,
4230	};
4231	static atomic_t do_rcu_barrier;
4232	module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, `0644`);
4233
4234	/*
4235	* Compute the mask of online CPUs for the specified rcu_node structure.
4236	* This will not be stable unless the rcu_node structure's ->lock is
4237	* held, but the bit corresponding to the current CPU will be stable
4238	* in most contexts.
4239	*/
4240	static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
4241	{
4242	return READ_ONCE(rnp->qsmaskinitnext);
4243	}
4244
4245	/*
4246	* Is the CPU corresponding to the specified rcu_data structure online
4247	* from RCU's perspective? This perspective is given by that structure's
4248	* ->qsmaskinitnext field rather than by the global cpu_online_mask.
4249	*/
4250	static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
4251	{
4252	return !!(rdp->grpmask & rcu_rnp_online_cpus(rnp: rdp->mynode));
4253	}
4254
4255	bool rcu_cpu_online(int cpu)
4256	{
4257	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4258
4259	return rcu_rdp_cpu_online(rdp);
4260	}
4261
#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)

/*
 * Is the current CPU online as far as RCU is concerned?
 *
 * Disable preemption to avoid false positives that could otherwise
 * happen due to the current CPU number being sampled, this task being
 * preempted, its old CPU being taken offline, resuming on some other CPU,
 * then determining that its old CPU is now offline.
 *
 * Disable checking if in an NMI handler because we cannot safely
 * report errors from NMI handlers anyway.  In addition, it is OK to use
 * RCU on an offline processor during initial boot, hence the check for
 * rcu_scheduler_fully_active.
 */
bool rcu_lockdep_current_cpu_online(void)
{
	struct rcu_data *rdp;
	bool ret = false;

	if (in_nmi() || !rcu_scheduler_fully_active)
		return true;
	preempt_disable_notrace();
	rdp = this_cpu_ptr(&rcu_data);
	/*
	 * Strictly, we care here about the case where the current CPU is
	 * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask
	 * not being up to date.  So arch_spin_is_locked() might have a
	 * false positive if it's held by some other CPU, but that's
	 * OK because that just means a false negative on the warning.
	 */
	if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
		ret = true;
	preempt_enable_notrace();
	return ret;
}
EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);

#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
4301
4302	// Has rcu_init() been invoked? This is used (for example) to determine
4303	// whether spinlocks may be acquired safely.
4304	static bool rcu_init_invoked(void)
4305	{
4306	return !!rcu_state.n_online_cpus;
4307	}
4308
4309	/*
4310	* All CPUs for the specified rcu_node structure have gone offline,
4311	* and all tasks that were preempted within an RCU read-side critical
4312	* section while running on one of those CPUs have since exited their RCU
4313	* read-side critical section. Some other CPU is reporting this fact with
4314	* the specified rcu_node structure's ->lock held and interrupts disabled.
4315	* This function therefore goes up the tree of rcu_node structures,
4316	* clearing the corresponding bits in the ->qsmaskinit fields. Note that
4317	* the leaf rcu_node structure's ->qsmaskinit field has already been
4318	* updated.
4319	*
4320	* This function does check that the specified rcu_node structure has
4321	* all CPUs offline and no blocked tasks, so it is OK to invoke it
4322	* prematurely. That said, invoking it after the fact will cost you
4323	* a needless lock acquisition. So once it has done its work, don't
4324	* invoke it again.
4325	*/
4326	static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
4327	{
4328	long mask;
4329	struct rcu_node *rnp = rnp_leaf;
4330
4331	raw_lockdep_assert_held_rcu_node(rnp_leaf);
4332	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) \|\|
4333	WARN_ON_ONCE(rnp_leaf->qsmaskinit) \|\|
4334	WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
4335	return;
4336	for (;;) {
4337	mask = rnp->grpmask;
4338	rnp = rnp->parent;
4339	if (!rnp)
4340	break;
4341	raw_spin_lock_rcu_node(rnp); / irqs already disabled. /
4342	rnp->qsmaskinit &= ~mask;
4343	/ Between grace periods, so better already be zero! /
4344	WARN_ON_ONCE(rnp->qsmask);
4345	if (rnp->qsmaskinit) {
4346	raw_spin_unlock_rcu_node(rnp);
4347	/ irqs remain disabled. /
4348	return;
4349	}
4350	raw_spin_unlock_rcu_node(rnp); / irqs remain disabled. /
4351	}
4352	}
4353
4354	/*
4355	* Propagate ->qsinitmask bits up the rcu_node tree to account for the
4356	* first CPU in a given leaf rcu_node structure coming online. The caller
4357	* must hold the corresponding leaf rcu_node ->lock with interrupts
4358	* disabled.
4359	*/
4360	static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
4361	{
4362	long mask;
4363	long oldmask;
4364	struct rcu_node *rnp = rnp_leaf;
4365
4366	raw_lockdep_assert_held_rcu_node(rnp_leaf);
4367	WARN_ON_ONCE(rnp->wait_blkd_tasks);
4368	for (;;) {
4369	mask = rnp->grpmask;
4370	rnp = rnp->parent;
4371	if (rnp == NULL)
4372	return;
4373	raw_spin_lock_rcu_node(rnp); / Interrupts already disabled. /
4374	oldmask = rnp->qsmaskinit;
4375	rnp->qsmaskinit \|= mask;
4376	raw_spin_unlock_rcu_node(rnp); / Interrupts remain disabled. /
4377	if (oldmask)
4378	return;
4379	}
4380	}
4381
4382	/*
4383	* Do boot-time initialization of a CPU's per-CPU RCU data.
4384	*/
4385	static void __init
4386	rcu_boot_init_percpu_data(int cpu)
4387	{
4388	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
4389	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4390
4391	/ Set up local state, ensuring consistent view of global state. /
4392	rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
4393	INIT_WORK(&rdp->strict_work, strict_work_handler);
4394	WARN_ON_ONCE(ct->dynticks_nesting != `1`);
4395	WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
4396	rdp->barrier_seq_snap = rcu_state.barrier_sequence;
4397	rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
4398	rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
4399	rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
4400	rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
4401	rdp->last_sched_clock = jiffies;
4402	rdp->cpu = cpu;
4403	rcu_boot_init_nocb_percpu_data(rdp);
4404	}
4405
/* Kthread worker created by rcu_start_exp_gp_kworker(); NULL if creation failed. */
struct kthread_worker *rcu_exp_gp_kworker;
4407
4408	static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
4409	{
4410	struct kthread_worker *kworker;
4411	const char *name = "rcu_exp_par_gp_kthread_worker/%d";
4412	struct sched_param param = { .sched_priority = kthread_prio };
4413	int rnp_index = rnp - rcu_get_root();
4414
4415	if (rnp->exp_kworker)
4416	return;
4417
4418	kworker = kthread_create_worker(flags: `0`, namefmt: name, rnp_index);
4419	if (IS_ERR_OR_NULL(ptr: kworker)) {
4420	pr_err("Failed to create par gp kworker on %d/%d\n",
4421	rnp->grplo, rnp->grphi);
4422	return;
4423	}
4424	WRITE_ONCE(rnp->exp_kworker, kworker);
4425
4426	if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
4427	sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
4428	}
4429
4430	static struct task_struct rcu_exp_par_gp_task(struct* rcu_node *rnp)
4431	{
4432	struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker);
4433
4434	if (!kworker)
4435	return NULL;
4436
4437	return kworker->task;
4438	}
4439
4440	static void __init rcu_start_exp_gp_kworker(void)
4441	{
4442	const char *name = "rcu_exp_gp_kthread_worker";
4443	struct sched_param param = { .sched_priority = kthread_prio };
4444
4445	rcu_exp_gp_kworker = kthread_create_worker(flags: `0`, namefmt: name);
4446	if (IS_ERR_OR_NULL(ptr: rcu_exp_gp_kworker)) {
4447	pr_err("Failed to create %s!\n", name);
4448	rcu_exp_gp_kworker = NULL;
4449	return;
4450	}
4451
4452	if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
4453	sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
4454	}
4455
4456	static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp)
4457	{
4458	if (rcu_scheduler_fully_active) {
4459	mutex_lock(&rnp->kthread_mutex);
4460	rcu_spawn_one_boost_kthread(rnp);
4461	rcu_spawn_exp_par_gp_kworker(rnp);
4462	mutex_unlock(lock: &rnp->kthread_mutex);
4463	}
4464	}
4465
4466	/*
4467	* Invoked early in the CPU-online process, when pretty much all services
4468	* are available. The incoming CPU is not present.
4469	*
4470	* Initializes a CPU's per-CPU RCU data. Note that only one online or
4471	* offline event can be happening at a given time. Note also that we can
4472	* accept some slop in the rsp->gp_seq access due to the fact that this
4473	* CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
4474	* And any offloaded callbacks are being numbered elsewhere.
4475	*/
4476	int rcutree_prepare_cpu(unsigned int cpu)
4477	{
4478	unsigned long flags;
4479	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
4480	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4481	struct rcu_node *rnp = rcu_get_root();
4482
4483	/ Set up local state, ensuring consistent view of global state. /
4484	raw_spin_lock_irqsave_rcu_node(rnp, flags);
4485	rdp->qlen_last_fqs_check = `0`;
4486	rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
4487	rdp->blimit = blimit;
4488	ct->dynticks_nesting = `1`; / CPU not up, no tearing. /
4489	raw_spin_unlock_rcu_node(rnp); / irqs remain disabled. /
4490
4491	/*
4492	* Only non-NOCB CPUs that didn't have early-boot callbacks need to be
4493	* (re-)initialized.
4494	*/
4495	if (!rcu_segcblist_is_enabled(rsclp: &rdp->cblist))
4496	rcu_segcblist_init(rsclp: &rdp->cblist); / Re-enable callbacks. /
4497
4498	/*
4499	* Add CPU to leaf rcu_node pending-online bitmask. Any needed
4500	* propagation up the rcu_node tree will happen at the beginning
4501	* of the next grace period.
4502	*/
4503	rnp = rdp->mynode;
4504	raw_spin_lock_rcu_node(rnp); / irqs already disabled. /
4505	rdp->gp_seq = READ_ONCE(rnp->gp_seq);
4506	rdp->gp_seq_needed = rdp->gp_seq;
4507	rdp->cpu_no_qs.b.norm = true;
4508	rdp->core_needs_qs = false;
4509	rdp->rcu_iw_pending = false;
4510	rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler);
4511	rdp->rcu_iw_gp_seq = rdp->gp_seq - `1`;
4512	trace_rcu_grace_period(rcuname: rcu_state.name, gp_seq: rdp->gp_seq, TPS("cpuonl"));
4513	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4514	rcu_spawn_rnp_kthreads(rnp);
4515	rcu_spawn_cpu_nocb_kthread(cpu);
4516	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + `1`);
4517
4518	return `0`;
4519	}
4520
4521	/*
4522	* Update kthreads affinity during CPU-hotplug changes.
4523	*
4524	* Set the per-rcu_node kthread's affinity to cover all CPUs that are
4525	* served by the rcu_node in question. The CPU hotplug lock is still
4526	* held, so the value of rnp->qsmaskinit will be stable.
4527	*
4528	* We don't include outgoingcpu in the affinity set, use -1 if there is
4529	* no outgoing CPU. If there are no CPUs left in the affinity set,
4530	* this function allows the kthread to execute on any CPU.
4531	*
4532	* Any future concurrent calls are serialized via ->kthread_mutex.
4533	*/
4534	static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
4535	{
4536	cpumask_var_t cm;
4537	unsigned long mask;
4538	struct rcu_data *rdp;
4539	struct rcu_node *rnp;
4540	struct task_struct task_boost, task_exp;
4541
4542	rdp = per_cpu_ptr(&rcu_data, cpu);
4543	rnp = rdp->mynode;
4544
4545	task_boost = rcu_boost_task(rnp);
4546	task_exp = rcu_exp_par_gp_task(rnp);
4547
4548	/*
4549	* If CPU is the boot one, those tasks are created later from early
4550	* initcall since kthreadd must be created first.
4551	*/
4552	if (!task_boost && !task_exp)
4553	return;
4554
4555	if (!zalloc_cpumask_var(mask: &cm, GFP_KERNEL))
4556	return;
4557
4558	mutex_lock(&rnp->kthread_mutex);
4559	mask = rcu_rnp_online_cpus(rnp);
4560	for_each_leaf_node_possible_cpu(rnp, cpu)
4561	if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
4562	cpu != outgoingcpu)
4563	cpumask_set_cpu(cpu, dstp: cm);
4564	cpumask_and(dstp: cm, src1p: cm, src2p: housekeeping_cpumask(type: HK_TYPE_RCU));
4565	if (cpumask_empty(srcp: cm)) {
4566	cpumask_copy(dstp: cm, srcp: housekeeping_cpumask(type: HK_TYPE_RCU));
4567	if (outgoingcpu >= `0`)
4568	cpumask_clear_cpu(cpu: outgoingcpu, dstp: cm);
4569	}
4570
4571	if (task_exp)
4572	set_cpus_allowed_ptr(p: task_exp, new_mask: cm);
4573
4574	if (task_boost)
4575	set_cpus_allowed_ptr(p: task_boost, new_mask: cm);
4576
4577	mutex_unlock(lock: &rnp->kthread_mutex);
4578
4579	free_cpumask_var(mask: cm);
4580	}
4581
4582	/*
4583	* Has the specified (known valid) CPU ever been fully online?
4584	*/
4585	bool rcu_cpu_beenfullyonline(int cpu)
4586	{
4587	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4588
4589	return smp_load_acquire(&rdp->beenonline);
4590	}
4591
4592	/*
4593	* Near the end of the CPU-online process. Pretty much all services
4594	* enabled, and the CPU is now very much alive.
4595	*/
4596	int rcutree_online_cpu(unsigned int cpu)
4597	{
4598	unsigned long flags;
4599	struct rcu_data *rdp;
4600	struct rcu_node *rnp;
4601
4602	rdp = per_cpu_ptr(&rcu_data, cpu);
4603	rnp = rdp->mynode;
4604	raw_spin_lock_irqsave_rcu_node(rnp, flags);
4605	rnp->ffmask \|= rdp->grpmask;
4606	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4607	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
4608	return `0`; / Too early in boot for scheduler work. /
4609	sync_sched_exp_online_cleanup(cpu);
4610	rcutree_affinity_setting(cpu, outgoingcpu: -`1`);
4611
4612	// Stop-machine done, so allow nohz_full to disable tick.
4613	tick_dep_clear(bit: TICK_DEP_BIT_RCU);
4614	return `0`;
4615	}
4616
4617	/*
4618	* Mark the specified CPU as being online so that subsequent grace periods
4619	* (both expedited and normal) will wait on it. Note that this means that
4620	* incoming CPUs are not allowed to use RCU read-side critical sections
4621	* until this function is called. Failing to observe this restriction
4622	* will result in lockdep splats.
4623	*
4624	* Note that this function is special in that it is invoked directly
4625	* from the incoming CPU rather than from the cpuhp_step mechanism.
4626	* This is because this function must be invoked at a precise location.
4627	* This incoming CPU must not have enabled interrupts yet.
4628	*
4629	* This mirrors the effects of rcutree_report_cpu_dead().
4630	*/
4631	void rcutree_report_cpu_starting(unsigned int cpu)
4632	{
4633	unsigned long mask;
4634	struct rcu_data *rdp;
4635	struct rcu_node *rnp;
4636	bool newcpu;
4637
4638	lockdep_assert_irqs_disabled();
4639	rdp = per_cpu_ptr(&rcu_data, cpu);
4640	if (rdp->cpu_started)
4641	return;
4642	rdp->cpu_started = true;
4643
4644	rnp = rdp->mynode;
4645	mask = rdp->grpmask;
4646	arch_spin_lock(&rcu_state.ofl_lock);
4647	rcu_dynticks_eqs_online();
4648	raw_spin_lock(&rcu_state.barrier_lock);
4649	raw_spin_lock_rcu_node(rnp);
4650	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext \| mask);
4651	raw_spin_unlock(&rcu_state.barrier_lock);
4652	newcpu = !(rnp->expmaskinitnext & mask);
4653	rnp->expmaskinitnext \|= mask;
4654	/ Allow lockless access for expedited grace periods. /
4655	smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); / ^^^ /
4656	ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
4657	rcu_gpnum_ovf(rnp, rdp); / Offline-induced counter wrap? /
4658	rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
4659	rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
4660
4661	/ An incoming CPU should never be blocking a grace period. /
4662	if (WARN_ON_ONCE(rnp->qsmask & mask)) { / RCU waiting on incoming CPU? /
4663	/ rcu_report_qs_rnp() really wants some flags to restore /
4664	unsigned long flags;
4665
4666	local_irq_save(flags);
4667	rcu_disable_urgency_upon_qs(rdp);
4668	/ Report QS -after- changing ->qsmaskinitnext! /
4669	rcu_report_qs_rnp(mask, rnp, gps: rnp->gp_seq, flags);
4670	} else {
4671	raw_spin_unlock_rcu_node(rnp);
4672	}
4673	arch_spin_unlock(&rcu_state.ofl_lock);
4674	smp_store_release(&rdp->beenonline, true);
4675	smp_mb(); / Ensure RCU read-side usage follows above initialization. /
4676	}
4677
4678	/*
4679	* The outgoing function has no further need of RCU, so remove it from
4680	* the rcu_node tree's ->qsmaskinitnext bit masks.
4681	*
4682	* Note that this function is special in that it is invoked directly
4683	* from the outgoing CPU rather than from the cpuhp_step mechanism.
4684	* This is because this function must be invoked at a precise location.
4685	*
4686	* This mirrors the effect of rcutree_report_cpu_starting().
4687	*/
4688	void rcutree_report_cpu_dead(void)
4689	{
4690	unsigned long flags;
4691	unsigned long mask;
4692	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
4693	struct rcu_node rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. /
4694
4695	/*
4696	* IRQS must be disabled from now on and until the CPU dies, or an interrupt
4697	* may introduce a new READ-side while it is actually off the QS masks.
4698	*/
4699	lockdep_assert_irqs_disabled();
4700	// Do any dangling deferred wakeups.
4701	do_nocb_deferred_wakeup(rdp);
4702
4703	rcu_preempt_deferred_qs(current);
4704
4705	/ Remove outgoing CPU from mask in the leaf rcu_node structure. /
4706	mask = rdp->grpmask;
4707	arch_spin_lock(&rcu_state.ofl_lock);
4708	raw_spin_lock_irqsave_rcu_node(rnp, flags); / Enforce GP memory-order guarantee. /
4709	rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
4710	rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
4711	if (rnp->qsmask & mask) { / RCU waiting on outgoing CPU? /
4712	/ Report quiescent state -before- changing ->qsmaskinitnext! /
4713	rcu_disable_urgency_upon_qs(rdp);
4714	rcu_report_qs_rnp(mask, rnp, gps: rnp->gp_seq, flags);
4715	raw_spin_lock_irqsave_rcu_node(rnp, flags);
4716	}
4717	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
4718	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4719	arch_spin_unlock(&rcu_state.ofl_lock);
4720	rdp->cpu_started = false;
4721	}
4722
4723	#ifdef CONFIG_HOTPLUG_CPU
4724	/*
4725	* The outgoing CPU has just passed through the dying-idle state, and we
4726	* are being invoked from the CPU that was IPIed to continue the offline
4727	* operation. Migrate the outgoing CPU's callbacks to the current CPU.
4728	*/
4729	void rcutree_migrate_callbacks(int cpu)
4730	{
4731	unsigned long flags;
4732	struct rcu_data *my_rdp;
4733	struct rcu_node *my_rnp;
4734	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4735	bool needwake;
4736
4737	if (rcu_rdp_is_offloaded(rdp) \|\|
4738	rcu_segcblist_empty(rsclp: &rdp->cblist))
4739	return; / No callbacks to migrate. /
4740
4741	raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
4742	WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
4743	rcu_barrier_entrain(rdp);
4744	my_rdp = this_cpu_ptr(&rcu_data);
4745	my_rnp = my_rdp->mynode;
4746	rcu_nocb_lock(rdp: my_rdp); / irqs already disabled. /
4747	WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies, false));
4748	raw_spin_lock_rcu_node(my_rnp); / irqs already disabled. /
4749	/ Leverage recent GPs and set GP for new callbacks. /
4750	needwake = rcu_advance_cbs(rnp: my_rnp, rdp) \|\|
4751	rcu_advance_cbs(rnp: my_rnp, rdp: my_rdp);
4752	rcu_segcblist_merge(dst_rsclp: &my_rdp->cblist, src_rsclp: &rdp->cblist);
4753	raw_spin_unlock(&rcu_state.barrier_lock); / irqs remain disabled. /
4754	needwake = needwake \|\| rcu_advance_cbs(rnp: my_rnp, rdp: my_rdp);
4755	rcu_segcblist_disable(rsclp: &rdp->cblist);
4756	WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
4757	check_cb_ovld_locked(rdp: my_rdp, rnp: my_rnp);
4758	if (rcu_rdp_is_offloaded(rdp: my_rdp)) {
4759	raw_spin_unlock_rcu_node(my_rnp); / irqs remain disabled. /
4760	__call_rcu_nocb_wake(rdp: my_rdp, was_empty: true, flags);
4761	} else {
4762	rcu_nocb_unlock(rdp: my_rdp); / irqs remain disabled. /
4763	raw_spin_unlock_rcu_node(my_rnp); / irqs remain disabled. /
4764	}
4765	local_irq_restore(flags);
4766	if (needwake)
4767	rcu_gp_kthread_wake();
4768	lockdep_assert_irqs_enabled();
4769	WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != `0` \|\|
4770	!rcu_segcblist_empty(&rdp->cblist),
4771	"rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
4772	cpu, rcu_segcblist_n_cbs(&rdp->cblist),
4773	rcu_segcblist_first_cb(&rdp->cblist));
4774	}
4775
4776	/*
4777	* The CPU has been completely removed, and some other CPU is reporting
4778	* this fact from process context. Do the remainder of the cleanup.
4779	* There can only be one CPU hotplug operation at a time, so no need for
4780	* explicit locking.
4781	*/
4782	int rcutree_dead_cpu(unsigned int cpu)
4783	{
4784	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - `1`);
4785	// Stop-machine done, so allow nohz_full to disable tick.
4786	tick_dep_clear(bit: TICK_DEP_BIT_RCU);
4787	return `0`;
4788	}
4789
4790	/*
4791	* Near the end of the offline process. Trace the fact that this CPU
4792	* is going offline.
4793	*/
4794	int rcutree_dying_cpu(unsigned int cpu)
4795	{
4796	bool blkd;
4797	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
4798	struct rcu_node *rnp = rdp->mynode;
4799
4800	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
4801	trace_rcu_grace_period(rcuname: rcu_state.name, READ_ONCE(rnp->gp_seq),
4802	gpevent: blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
4803	return `0`;
4804	}
4805
4806	/*
4807	* Near the beginning of the process. The CPU is still very much alive
4808	* with pretty much all services enabled.
4809	*/
4810	int rcutree_offline_cpu(unsigned int cpu)
4811	{
4812	unsigned long flags;
4813	struct rcu_data *rdp;
4814	struct rcu_node *rnp;
4815
4816	rdp = per_cpu_ptr(&rcu_data, cpu);
4817	rnp = rdp->mynode;
4818	raw_spin_lock_irqsave_rcu_node(rnp, flags);
4819	rnp->ffmask &= ~rdp->grpmask;
4820	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4821
4822	rcutree_affinity_setting(cpu, outgoingcpu: cpu);
4823
4824	// nohz_full CPUs need the tick for stop-machine to work quickly
4825	tick_dep_set(bit: TICK_DEP_BIT_RCU);
4826	return `0`;
4827	}
4828	#endif /* #ifdef CONFIG_HOTPLUG_CPU */
4829
4830	/*
4831	* On non-huge systems, use expedited RCU grace periods to make suspend
4832	* and hibernation run faster.
4833	*/
4834	static int rcu_pm_notify(struct notifier_block *self,
4835	unsigned long action, void *hcpu)
4836	{
4837	switch (action) {
4838	case PM_HIBERNATION_PREPARE:
4839	case PM_SUSPEND_PREPARE:
4840	rcu_async_hurry();
4841	rcu_expedite_gp();
4842	break;
4843	case PM_POST_HIBERNATION:
4844	case PM_POST_SUSPEND:
4845	rcu_unexpedite_gp();
4846	rcu_async_relax();
4847	break;
4848	default:
4849	break;
4850	}
4851	return NOTIFY_OK;
4852	}
4853
4854	/*
4855	* Spawn the kthreads that handle RCU's grace periods.
4856	*/
4857	static int __init rcu_spawn_gp_kthread(void)
4858	{
4859	unsigned long flags;
4860	struct rcu_node *rnp;
4861	struct sched_param sp;
4862	struct task_struct *t;
4863	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
4864
4865	rcu_scheduler_fully_active = `1`;
4866	t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
4867	if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
4868	return `0`;
4869	if (kthread_prio) {
4870	sp.sched_priority = kthread_prio;
4871	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
4872	}
4873	rnp = rcu_get_root();
4874	raw_spin_lock_irqsave_rcu_node(rnp, flags);
4875	WRITE_ONCE(rcu_state.gp_activity, jiffies);
4876	WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
4877	// Reset .gp_activity and .gp_req_activity before setting .gp_kthread.
4878	smp_store_release(&rcu_state.gp_kthread, t); / ^^^ /
4879	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
4880	wake_up_process(tsk: t);
4881	/ This is a pre-SMP initcall, we expect a single CPU /
4882	WARN_ON(num_online_cpus() > `1`);
4883	/*
4884	* Those kthreads couldn't be created on rcu_init() -> rcutree_prepare_cpu()
4885	* due to rcu_scheduler_fully_active.
4886	*/
4887	rcu_spawn_cpu_nocb_kthread(smp_processor_id());
4888	rcu_spawn_rnp_kthreads(rnp: rdp->mynode);
4889	rcu_spawn_core_kthreads();
4890	/ Create kthread worker for expedited GPs /
4891	rcu_start_exp_gp_kworker();
4892	return `0`;
4893	}
4894	early_initcall(rcu_spawn_gp_kthread);
4895
4896	/*
4897	* This function is invoked towards the end of the scheduler's
4898	* initialization process. Before this is called, the idle task might
4899	* contain synchronous grace-period primitives (during which time, this idle
4900	* task is booting the system, and such primitives are no-ops). After this
4901	* function is called, any synchronous grace-period primitives are run as
4902	* expedited, with the requesting task driving the grace period forward.
4903	* A later core_initcall() rcu_set_runtime_mode() will switch to full
4904	* runtime RCU functionality.
4905	*/
4906	void rcu_scheduler_starting(void)
4907	{
4908	unsigned long flags;
4909	struct rcu_node *rnp;
4910
4911	WARN_ON(num_online_cpus() != `1`);
4912	WARN_ON(nr_context_switches() > `0`);
4913	rcu_test_sync_prims();
4914
4915	// Fix up the ->gp_seq counters.
4916	local_irq_save(flags);
4917	rcu_for_each_node_breadth_first(rnp)
4918	rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
4919	local_irq_restore(flags);
4920
4921	// Switch out of early boot mode.
4922	rcu_scheduler_active = RCU_SCHEDULER_INIT;
4923	rcu_test_sync_prims();
4924	}
4925
4926	/*
4927	* Helper function for rcu_init() that initializes the rcu_state structure.
4928	*/
4929	static void __init rcu_init_one(void)
4930	{
4931	static const char * const buf[] = RCU_NODE_NAME_INIT;
4932	static const char * const fqs[] = RCU_FQS_NAME_INIT;
4933	static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
4934	static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
4935
4936	int levelspread[RCU_NUM_LVLS]; / kids/node in each level. /
4937	int cpustride = `1`;
4938	int i;
4939	int j;
4940	struct rcu_node *rnp;
4941
4942	BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); / Fix buf[] init! /
4943
4944	/ Silence gcc 4.8 false positive about array index out of range. /
4945	if (rcu_num_lvls <= `0` \|\| rcu_num_lvls > RCU_NUM_LVLS)
4946	panic(fmt: "rcu_init_one: rcu_num_lvls out of range");
4947
4948	/ Initialize the level-tracking arrays. /
4949
4950	for (i = `1`; i < rcu_num_lvls; i++)
4951	rcu_state.level[i] =
4952	rcu_state.level[i - `1`] + num_rcu_lvl[i - `1`];
4953	rcu_init_levelspread(levelspread, levelcnt: num_rcu_lvl);
4954
4955	/ Initialize the elements themselves, starting from the leaves. /
4956
4957	for (i = rcu_num_lvls - `1`; i >= `0`; i--) {
4958	cpustride *= levelspread[i];
4959	rnp = rcu_state.level[i];
4960	for (j = `0`; j < num_rcu_lvl[i]; j++, rnp++) {
4961	raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
4962	lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
4963	&rcu_node_class[i], buf[i]);
4964	raw_spin_lock_init(&rnp->fqslock);
4965	lockdep_set_class_and_name(&rnp->fqslock,
4966	&rcu_fqs_class[i], fqs[i]);
4967	rnp->gp_seq = rcu_state.gp_seq;
4968	rnp->gp_seq_needed = rcu_state.gp_seq;
4969	rnp->completedqs = rcu_state.gp_seq;
4970	rnp->qsmask = `0`;
4971	rnp->qsmaskinit = `0`;
4972	rnp->grplo = j * cpustride;
4973	rnp->grphi = (j + `1`) * cpustride - `1`;
4974	if (rnp->grphi >= nr_cpu_ids)
4975	rnp->grphi = nr_cpu_ids - `1`;
4976	if (i == `0`) {
4977	rnp->grpnum = `0`;
4978	rnp->grpmask = `0`;
4979	rnp->parent = NULL;
4980	} else {
4981	rnp->grpnum = j % levelspread[i - `1`];
4982	rnp->grpmask = BIT(rnp->grpnum);
4983	rnp->parent = rcu_state.level[i - `1`] +
4984	j / levelspread[i - `1`];
4985	}
4986	rnp->level = i;
4987	INIT_LIST_HEAD(list: &rnp->blkd_tasks);
4988	rcu_init_one_nocb(rnp);
4989	init_waitqueue_head(&rnp->exp_wq[`0`]);
4990	init_waitqueue_head(&rnp->exp_wq[`1`]);
4991	init_waitqueue_head(&rnp->exp_wq[`2`]);
4992	init_waitqueue_head(&rnp->exp_wq[`3`]);
4993	spin_lock_init(&rnp->exp_lock);
4994	mutex_init(&rnp->kthread_mutex);
4995	raw_spin_lock_init(&rnp->exp_poll_lock);
4996	rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
4997	INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
4998	}
4999	}
5000
5001	init_swait_queue_head(&rcu_state.gp_wq);
5002	init_swait_queue_head(&rcu_state.expedited_wq);
5003	rnp = rcu_first_leaf_node();
5004	for_each_possible_cpu(i) {
5005	while (i > rnp->grphi)
5006	rnp++;
5007	per_cpu_ptr(&rcu_data, i)->mynode = rnp;
5008	rcu_boot_init_percpu_data(cpu: i);
5009	}
5010	}
5011
5012	/*
5013	* Force priority from the kernel command-line into range.
5014	*/
5015	static void __init sanitize_kthread_prio(void)
5016	{
5017	int kthread_prio_in = kthread_prio;
5018
5019	if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < `2`
5020	&& IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
5021	kthread_prio = `2`;
5022	else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < `1`)
5023	kthread_prio = `1`;
5024	else if (kthread_prio < `0`)
5025	kthread_prio = `0`;
5026	else if (kthread_prio > `99`)
5027	kthread_prio = `99`;
5028
5029	if (kthread_prio != kthread_prio_in)
5030	pr_alert("%s: Limited prio to %d from %d\n",
5031	__func__, kthread_prio, kthread_prio_in);
5032	}
5033
5034	/*
5035	* Compute the rcu_node tree geometry from kernel parameters. This cannot
5036	* replace the definitions in tree.h because those are needed to size
5037	* the ->node array in the rcu_state structure.
5038	*/
5039	void rcu_init_geometry(void)
5040	{
5041	ulong d;
5042	int i;
5043	static unsigned long old_nr_cpu_ids;
5044	int rcu_capacity[RCU_NUM_LVLS];
5045	static bool initialized;
5046
5047	if (initialized) {
5048	/*
5049	* Warn if setup_nr_cpu_ids() had not yet been invoked,
5050	* unless nr_cpus_ids == NR_CPUS, in which case who cares?
5051	*/
5052	WARN_ON_ONCE(old_nr_cpu_ids != nr_cpu_ids);
5053	return;
5054	}
5055
5056	old_nr_cpu_ids = nr_cpu_ids;
5057	initialized = true;
5058
5059	/*
5060	* Initialize any unspecified boot parameters.
5061	* The default values of jiffies_till_first_fqs and
5062	* jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
5063	* value, which is a function of HZ, then adding one for each
5064	* RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
5065	*/
5066	d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
5067	if (jiffies_till_first_fqs == ULONG_MAX)
5068	jiffies_till_first_fqs = d;
5069	if (jiffies_till_next_fqs == ULONG_MAX)
5070	jiffies_till_next_fqs = d;
5071	adjust_jiffies_till_sched_qs();
5072
5073	/ If the compile-time values are accurate, just leave. /
5074	if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
5075	nr_cpu_ids == NR_CPUS)
5076	return;
5077	pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
5078	rcu_fanout_leaf, nr_cpu_ids);
5079
5080	/*
5081	* The boot-time rcu_fanout_leaf parameter must be at least two
5082	* and cannot exceed the number of bits in the rcu_node masks.
5083	* Complain and fall back to the compile-time values if this
5084	* limit is exceeded.
5085	*/
5086	if (rcu_fanout_leaf < `2` \|\|
5087	rcu_fanout_leaf > sizeof(unsigned long) * `8`) {
5088	rcu_fanout_leaf = RCU_FANOUT_LEAF;
5089	WARN_ON(`1`);
5090	return;
5091	}
5092
5093	/*
5094	* Compute number of nodes that can be handled an rcu_node tree
5095	* with the given number of levels.
5096	*/
5097	rcu_capacity[`0`] = rcu_fanout_leaf;
5098	for (i = `1`; i < RCU_NUM_LVLS; i++)
5099	rcu_capacity[i] = rcu_capacity[i - `1`] * RCU_FANOUT;
5100
5101	/*
5102	* The tree must be able to accommodate the configured number of CPUs.
5103	* If this limit is exceeded, fall back to the compile-time values.
5104	*/
5105	if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - `1`]) {
5106	rcu_fanout_leaf = RCU_FANOUT_LEAF;
5107	WARN_ON(`1`);
5108	return;
5109	}
5110
5111	/ Calculate the number of levels in the tree. /
5112	for (i = `0`; nr_cpu_ids > rcu_capacity[i]; i++) {
5113	}
5114	rcu_num_lvls = i + `1`;
5115
5116	/ Calculate the number of rcu_nodes at each level of the tree. /
5117	for (i = `0`; i < rcu_num_lvls; i++) {
5118	int cap = rcu_capacity[(rcu_num_lvls - `1`) - i];
5119	num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
5120	}
5121
5122	/ Calculate the total number of rcu_node structures. /
5123	rcu_num_nodes = `0`;
5124	for (i = `0`; i < rcu_num_lvls; i++)
5125	rcu_num_nodes += num_rcu_lvl[i];
5126	}
5127
5128	/*
5129	* Dump out the structure of the rcu_node combining tree associated
5130	* with the rcu_state structure.
5131	*/
5132	static void __init rcu_dump_rcu_node_tree(void)
5133	{
5134	int level = `0`;
5135	struct rcu_node *rnp;
5136
5137	pr_info("rcu_node tree layout dump\n");
5138	pr_info(" ");
5139	rcu_for_each_node_breadth_first(rnp) {
5140	if (rnp->level != level) {
5141	pr_cont("\n");
5142	pr_info(" ");
5143	level = rnp->level;
5144	}
5145	pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum);
5146	}
5147	pr_cont("\n");
5148	}
5149
/* Workqueue allocated by rcu_init() for Tree SRCU and for expedited GPs. */
struct workqueue_struct *rcu_gp_wq;
5151
5152	static void __init kfree_rcu_batch_init(void)
5153	{
5154	int cpu;
5155	int i, j;
5156	struct shrinker *kfree_rcu_shrinker;
5157
5158	/ Clamp it to [0:100] seconds interval. /
5159	if (rcu_delay_page_cache_fill_msec < `0` \|\|
5160	rcu_delay_page_cache_fill_msec > `100` * MSEC_PER_SEC) {
5161
5162	rcu_delay_page_cache_fill_msec =
5163	clamp(rcu_delay_page_cache_fill_msec, `0`,
5164	(int) (`100` * MSEC_PER_SEC));
5165
5166	pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
5167	rcu_delay_page_cache_fill_msec);
5168	}
5169
5170	for_each_possible_cpu(cpu) {
5171	struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
5172
5173	for (i = `0`; i < KFREE_N_BATCHES; i++) {
5174	INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
5175	krcp->krw_arr[i].krcp = krcp;
5176
5177	for (j = `0`; j < FREE_N_CHANNELS; j++)
5178	INIT_LIST_HEAD(list: &krcp->krw_arr[i].bulk_head_free[j]);
5179	}
5180
5181	for (i = `0`; i < FREE_N_CHANNELS; i++)
5182	INIT_LIST_HEAD(list: &krcp->bulk_head[i]);
5183
5184	INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
5185	INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
5186	krcp->initialized = true;
5187	}
5188
5189	kfree_rcu_shrinker = shrinker_alloc(flags: `0`, fmt: "rcu-kfree");
5190	if (!kfree_rcu_shrinker) {
5191	pr_err("Failed to allocate kfree_rcu() shrinker!\n");
5192	return;
5193	}
5194
5195	kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
5196	kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
5197
5198	shrinker_register(shrinker: kfree_rcu_shrinker);
5199	}
5200
5201	void __init rcu_init(void)
5202	{
5203	int cpu = smp_processor_id();
5204
5205	rcu_early_boot_tests();
5206
5207	kfree_rcu_batch_init();
5208	rcu_bootup_announce();
5209	sanitize_kthread_prio();
5210	rcu_init_geometry();
5211	rcu_init_one();
5212	if (dump_tree)
5213	rcu_dump_rcu_node_tree();
5214	if (use_softirq)
5215	open_softirq(nr: RCU_SOFTIRQ, action: rcu_core_si);
5216
5217	/*
5218	* We don't need protection against CPU-hotplug here because
5219	* this is called early in boot, before either interrupts
5220	* or the scheduler are operational.
5221	*/
5222	pm_notifier(rcu_pm_notify, `0`);
5223	WARN_ON(num_online_cpus() > `1`); // Only one CPU this early in boot.
5224	rcutree_prepare_cpu(cpu);
5225	rcutree_report_cpu_starting(cpu);
5226	rcutree_online_cpu(cpu);
5227
5228	/ Create workqueue for Tree SRCU and for expedited GPs. /
5229	rcu_gp_wq = alloc_workqueue(fmt: "rcu_gp", flags: WQ_MEM_RECLAIM, max_active: `0`);
5230	WARN_ON(!rcu_gp_wq);
5231
5232	/ Fill in default value for rcutree.qovld boot parameter. /
5233	/ -After- the rcu_node ->lock fields are initialized! /
5234	if (qovld < `0`)
5235	qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
5236	else
5237	qovld_calc = qovld;
5238
5239	// Kick-start in case any polled grace periods started early.
5240	(void)start_poll_synchronize_rcu_expedited();
5241
5242	rcu_test_sync_prims();
5243
5244	tasks_cblist_init_generic();
5245	}
5246
5247	#include "tree_stall.h"
5248	#include "tree_exp.h"
5249	#include "tree_nocb.h"
5250	#include "tree_plugin.h"
5251

source code of linux/kernel/rcu/tree.c