srcutree.c source code [linux/kernel/rcu/srcutree.c]

1	// SPDX-License-Identifier: GPL-2.0+
2	/*
3	* Sleepable Read-Copy Update mechanism for mutual exclusion.
4	*
5	* Copyright (C) IBM Corporation, 2006
6	* Copyright (C) Fujitsu, 2012
7	*
8	* Authors: Paul McKenney <paulmck@linux.ibm.com>
9	* Lai Jiangshan <laijs@cn.fujitsu.com>
10	*
11	* For detailed explanation of Read-Copy Update mechanism see -
12	* Documentation/RCU/ *.txt
13	*
14	*/
15
16	#define pr_fmt(fmt) "rcu: " fmt
17
18	#include <linux/export.h>
19	#include <linux/mutex.h>
20	#include <linux/percpu.h>
21	#include <linux/preempt.h>
22	#include <linux/rcupdate_wait.h>
23	#include <linux/sched.h>
24	#include <linux/smp.h>
25	#include <linux/delay.h>
26	#include <linux/module.h>
27	#include <linux/slab.h>
28	#include <linux/srcu.h>
29
30	#include "rcu.h"
31	#include "rcu_segcblist.h"
32
33	/ Holdoff in nanoseconds for auto-expediting. /
34	#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000)
35	static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF;
36	module_param(exp_holdoff, ulong, `0444`);
37
38	/ Overflow-check frequency. N bits roughly says every 2*N grace periods. /*
39	static ulong counter_wrap_check = (ULONG_MAX >> `2`);
40	module_param(counter_wrap_check, ulong, `0444`);
41
42	/*
43	* Control conversion to SRCU_SIZE_BIG:
44	* 0: Don't convert at all.
45	* 1: Convert at init_srcu_struct() time.
46	* 2: Convert when rcutorture invokes srcu_torture_stats_print().
47	* 3: Decide at boot time based on system shape (default).
48	* 0x1x: Convert when excessive contention encountered.
49	*/
50	#define SRCU_SIZING_NONE 0
51	#define SRCU_SIZING_INIT 1
52	#define SRCU_SIZING_TORTURE 2
53	#define SRCU_SIZING_AUTO 3
54	#define SRCU_SIZING_CONTEND 0x10
55	#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x)
56	#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE))
57	#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT))
58	#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE))
59	#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND)
60	static int convert_to_big = SRCU_SIZING_AUTO;
61	module_param(convert_to_big, int, `0444`);
62
63	/ Number of CPUs to trigger init_srcu_struct()-time transition to big. /
64	static int big_cpu_lim __read_mostly = `128`;
65	module_param(big_cpu_lim, int, `0444`);
66
67	/ Contention events per jiffy to initiate transition to big. /
68	static int small_contention_lim __read_mostly = `100`;
69	module_param(small_contention_lim, int, `0444`);
70
71	/ Early-boot callback-management, so early that no lock is required! /
72	static LIST_HEAD(srcu_boot_list);
73	static bool __read_mostly srcu_init_done;
74
75	static void srcu_invoke_callbacks(struct work_struct *work);
76	static void srcu_reschedule(struct srcu_struct ssp, unsigned* long delay);
77	static void process_srcu(struct work_struct *work);
78	static void srcu_delay_timer(struct timer_list *t);
79
/*
 * Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node().
 *
 * Each acquisition wrapper takes the private ->lock field of @p and then
 * executes smp_mb__after_unlock_lock().  The trylock variant does so only
 * when the lock was actually obtained, and evaluates to that success flag.
 * The release wrappers are plain unlocks.
 */
#define spin_lock_rcu_node(p)	\
do {	\
	spin_lock(&ACCESS_PRIVATE(p, lock));	\
	smp_mb__after_unlock_lock();	\
} while (0)

#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))

#define spin_lock_irq_rcu_node(p)	\
do {	\
	spin_lock_irq(&ACCESS_PRIVATE(p, lock));	\
	smp_mb__after_unlock_lock();	\
} while (0)

#define spin_unlock_irq_rcu_node(p)	\
	spin_unlock_irq(&ACCESS_PRIVATE(p, lock))

#define spin_lock_irqsave_rcu_node(p, flags)	\
do {	\
	spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags);	\
	smp_mb__after_unlock_lock();	\
} while (0)

#define spin_trylock_irqsave_rcu_node(p, flags)	\
({	\
	bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags);	\
	\
	if (___locked)	\
		smp_mb__after_unlock_lock();	\
	___locked;	\
})

/* No trailing line continuation here: it would splice in the next line. */
#define spin_unlock_irqrestore_rcu_node(p, flags)	\
	spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)
115
116	/*
117	* Initialize SRCU per-CPU data. Note that statically allocated
118	* srcu_struct structures might already have srcu_read_lock() and
119	* srcu_read_unlock() running against them. So if the is_static parameter
120	* is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
121	*/
122	static void init_srcu_struct_data(struct srcu_struct *ssp)
123	{
124	int cpu;
125	struct srcu_data *sdp;
126
127	/*
128	* Initialize the per-CPU srcu_data array, which feeds into the
129	* leaves of the srcu_node tree.
130	*/
131	WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
132	ARRAY_SIZE(sdp->srcu_unlock_count));
133	for_each_possible_cpu(cpu) {
134	sdp = per_cpu_ptr(ssp->sda, cpu);
135	spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
136	rcu_segcblist_init(rsclp: &sdp->srcu_cblist);
137	sdp->srcu_cblist_invoking = false;
138	sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
139	sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq;
140	sdp->mynode = NULL;
141	sdp->cpu = cpu;
142	INIT_WORK(&sdp->work, srcu_invoke_callbacks);
143	timer_setup(&sdp->delay_work, srcu_delay_timer, `0`);
144	sdp->ssp = ssp;
145	}
146	}
147
/* Invalid seq state, used during snp node initialization */
#define SRCU_SNP_INIT_SEQ		0x2

/*
 * Check whether the sequence number corresponding to an snp node
 * is invalid, that is, still equal to the SRCU_SNP_INIT_SEQ value
 * stored at node-initialization time.
 *
 * @s: sequence number to test.
 *
 * Returns true if @s is the initialization sentinel, false otherwise.
 */
static inline bool srcu_invl_snp_seq(unsigned long s)
{
	return s == SRCU_SNP_INIT_SEQ;
}
159
160	/*
161	* Allocated and initialize SRCU combining tree. Returns @true if
162	* allocation succeeded and @false otherwise.
163	*/
164	static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
165	{
166	int cpu;
167	int i;
168	int level = `0`;
169	int levelspread[RCU_NUM_LVLS];
170	struct srcu_data *sdp;
171	struct srcu_node *snp;
172	struct srcu_node *snp_first;
173
174	/ Initialize geometry if it has not already been initialized. /
175	rcu_init_geometry();
176	ssp->srcu_sup->node = kcalloc(n: rcu_num_nodes, size: sizeof(*ssp->srcu_sup->node), flags: gfp_flags);
177	if (!ssp->srcu_sup->node)
178	return false;
179
180	/ Work out the overall tree geometry. /
181	ssp->srcu_sup->level[`0`] = &ssp->srcu_sup->node[`0`];
182	for (i = `1`; i < rcu_num_lvls; i++)
183	ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - `1`] + num_rcu_lvl[i - `1`];
184	rcu_init_levelspread(levelspread, levelcnt: num_rcu_lvl);
185
186	/ Each pass through this loop initializes one srcu_node structure. /
187	srcu_for_each_node_breadth_first(ssp, snp) {
188	spin_lock_init(&ACCESS_PRIVATE(snp, lock));
189	WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
190	ARRAY_SIZE(snp->srcu_data_have_cbs));
191	for (i = `0`; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
192	snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ;
193	snp->srcu_data_have_cbs[i] = `0`;
194	}
195	snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ;
196	snp->grplo = -`1`;
197	snp->grphi = -`1`;
198	if (snp == &ssp->srcu_sup->node[`0`]) {
199	/ Root node, special case. /
200	snp->srcu_parent = NULL;
201	continue;
202	}
203
204	/ Non-root node. /
205	if (snp == ssp->srcu_sup->level[level + `1`])
206	level++;
207	snp->srcu_parent = ssp->srcu_sup->level[level - `1`] +
208	(snp - ssp->srcu_sup->level[level]) /
209	levelspread[level - `1`];
210	}
211
212	/*
213	* Initialize the per-CPU srcu_data array, which feeds into the
214	* leaves of the srcu_node tree.
215	*/
216	level = rcu_num_lvls - `1`;
217	snp_first = ssp->srcu_sup->level[level];
218	for_each_possible_cpu(cpu) {
219	sdp = per_cpu_ptr(ssp->sda, cpu);
220	sdp->mynode = &snp_first[cpu / levelspread[level]];
221	for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
222	if (snp->grplo < `0`)
223	snp->grplo = cpu;
224	snp->grphi = cpu;
225	}
226	sdp->grpmask = `1UL` << (cpu - sdp->mynode->grplo);
227	}
228	smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
229	return true;
230	}
231
232	/*
233	* Initialize non-compile-time initialized fields, including the
234	* associated srcu_node and srcu_data structures. The is_static parameter
235	* tells us that ->sda has already been wired up to srcu_data.
236	*/
237	static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
238	{
239	if (!is_static)
240	ssp->srcu_sup = kzalloc(size: sizeof(*ssp->srcu_sup), GFP_KERNEL);
241	if (!ssp->srcu_sup)
242	return -ENOMEM;
243	if (!is_static)
244	spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
245	ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
246	ssp->srcu_sup->node = NULL;
247	mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
248	mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
249	ssp->srcu_idx = `0`;
250	ssp->srcu_sup->srcu_gp_seq = `0`;
251	ssp->srcu_sup->srcu_barrier_seq = `0`;
252	mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
253	atomic_set(v: &ssp->srcu_sup->srcu_barrier_cpu_cnt, i: `0`);
254	INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
255	ssp->srcu_sup->sda_is_static = is_static;
256	if (!is_static)
257	ssp->sda = alloc_percpu(struct srcu_data);
258	if (!ssp->sda)
259	goto err_free_sup;
260	init_srcu_struct_data(ssp);
261	ssp->srcu_sup->srcu_gp_seq_needed_exp = `0`;
262	ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
263	if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
264	if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
265	goto err_free_sda;
266	WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
267	}
268	ssp->srcu_sup->srcu_ssp = ssp;
269	smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, `0`); / Init done. /
270	return `0`;
271
272	err_free_sda:
273	if (!is_static) {
274	free_percpu(pdata: ssp->sda);
275	ssp->sda = NULL;
276	}
277	err_free_sup:
278	if (!is_static) {
279	kfree(objp: ssp->srcu_sup);
280	ssp->srcu_sup = NULL;
281	}
282	return -ENOMEM;
283	}
284
285	#ifdef CONFIG_DEBUG_LOCK_ALLOC
286
287	int __init_srcu_struct(struct srcu_struct ssp, const* char *name,
288	struct lock_class_key *key)
289	{
290	/ Don't re-initialize a lock while it is held. /
291	debug_check_no_locks_freed(from: (void )ssp, len: sizeof(ssp));
292	lockdep_init_map(lock: &ssp->dep_map, name, key, subclass: `0`);
293	return init_srcu_struct_fields(ssp, is_static: false);
294	}
295	EXPORT_SYMBOL_GPL(__init_srcu_struct);
296
297	#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
298
299	/**
300	* init_srcu_struct - initialize a sleep-RCU structure
301	* @ssp: structure to initialize.
302	*
303	* Must invoke this on a given srcu_struct before passing that srcu_struct
304	* to any other function. Each srcu_struct represents a separate domain
305	* of SRCU protection.
306	*/
307	int init_srcu_struct(struct srcu_struct *ssp)
308	{
309	return init_srcu_struct_fields(ssp, false);
310	}
311	EXPORT_SYMBOL_GPL(init_srcu_struct);
312
313	#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
314
315	/*
316	* Initiate a transition to SRCU_SIZE_BIG with lock held.
317	*/
318	static void __srcu_transition_to_big(struct srcu_struct *ssp)
319	{
320	lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
321	smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC);
322	}
323
324	/*
325	* Initiate an idempotent transition to SRCU_SIZE_BIG.
326	*/
327	static void srcu_transition_to_big(struct srcu_struct *ssp)
328	{
329	unsigned long flags;
330
331	/ Double-checked locking on ->srcu_size-state. /
332	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL)
333	return;
334	spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
335	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) {
336	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
337	return;
338	}
339	__srcu_transition_to_big(ssp);
340	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
341	}
342
343	/*
344	* Check to see if the just-encountered contention event justifies
345	* a transition to SRCU_SIZE_BIG.
346	*/
347	static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
348	{
349	unsigned long j;
350
351	if (!SRCU_SIZING_IS_CONTEND() \|\| ssp->srcu_sup->srcu_size_state)
352	return;
353	j = jiffies;
354	if (ssp->srcu_sup->srcu_size_jiffies != j) {
355	ssp->srcu_sup->srcu_size_jiffies = j;
356	ssp->srcu_sup->srcu_n_lock_retries = `0`;
357	}
358	if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim)
359	return;
360	__srcu_transition_to_big(ssp);
361	}
362
363	/*
364	* Acquire the specified srcu_data structure's ->lock, but check for
365	* excessive contention, which results in initiation of a transition
366	* to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
367	* parameter permits this.
368	*/
369	static void spin_lock_irqsave_sdp_contention(struct srcu_data sdp, unsigned* long *flags)
370	{
371	struct srcu_struct *ssp = sdp->ssp;
372
373	if (spin_trylock_irqsave_rcu_node(sdp, *flags))
374	return;
375	spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
376	spin_lock_irqsave_check_contention(ssp);
377	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags);
378	spin_lock_irqsave_rcu_node(sdp, *flags);
379	}
380
381	/*
382	* Acquire the specified srcu_struct structure's ->lock, but check for
383	* excessive contention, which results in initiation of a transition
384	* to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module
385	* parameter permits this.
386	*/
387	static void spin_lock_irqsave_ssp_contention(struct srcu_struct ssp, unsigned* long *flags)
388	{
389	if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags))
390	return;
391	spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
392	spin_lock_irqsave_check_contention(ssp);
393	}
394
395	/*
396	* First-use initialization of statically allocated srcu_struct
397	* structure. Wiring up the combining tree is more than can be
398	* done with compile-time initialization, so this check is added
399	* to each update-side SRCU primitive. Use ssp->lock, which -is-
400	* compile-time initialized, to resolve races involving multiple
401	* CPUs trying to garner first-use privileges.
402	*/
403	static void check_init_srcu_struct(struct srcu_struct *ssp)
404	{
405	unsigned long flags;
406
407	/ The smp_load_acquire() pairs with the smp_store_release(). /
408	if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /^^^/
409	return; / Already initialized. /
410	spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
411	if (!rcu_seq_state(s: ssp->srcu_sup->srcu_gp_seq_needed)) {
412	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
413	return;
414	}
415	init_srcu_struct_fields(ssp, is_static: true);
416	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
417	}
418
419	/*
420	* Returns approximate total of the readers' ->srcu_lock_count[] values
421	* for the rank of per-CPU counters specified by idx.
422	*/
423	static unsigned long srcu_readers_lock_idx(struct srcu_struct ssp, int* idx)
424	{
425	int cpu;
426	unsigned long sum = `0`;
427
428	for_each_possible_cpu(cpu) {
429	struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
430
431	sum += atomic_long_read(v: &cpuc->srcu_lock_count[idx]);
432	}
433	return sum;
434	}
435
436	/*
437	* Returns approximate total of the readers' ->srcu_unlock_count[] values
438	* for the rank of per-CPU counters specified by idx.
439	*/
440	static unsigned long srcu_readers_unlock_idx(struct srcu_struct ssp, int* idx)
441	{
442	int cpu;
443	unsigned long mask = `0`;
444	unsigned long sum = `0`;
445
446	for_each_possible_cpu(cpu) {
447	struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
448
449	sum += atomic_long_read(v: &cpuc->srcu_unlock_count[idx]);
450	if (IS_ENABLED(CONFIG_PROVE_RCU))
451	mask = mask \| READ_ONCE(cpuc->srcu_nmi_safety);
452	}
453	WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> `1`)),
454	"Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp);
455	return sum;
456	}
457
458	/*
459	* Return true if the number of pre-existing readers is determined to
460	* be zero.
461	*/
462	static bool srcu_readers_active_idx_check(struct srcu_struct ssp, int* idx)
463	{
464	unsigned long unlocks;
465
466	unlocks = srcu_readers_unlock_idx(ssp, idx);
467
468	/*
469	* Make sure that a lock is always counted if the corresponding
470	* unlock is counted. Needs to be a smp_mb() as the read side may
471	* contain a read from a variable that is written to before the
472	* synchronize_srcu() in the write side. In this case smp_mb()s
473	* A and B act like the store buffering pattern.
474	*
475	* This smp_mb() also pairs with smp_mb() C to prevent accesses
476	* after the synchronize_srcu() from being executed before the
477	* grace period ends.
478	*/
479	smp_mb(); / A /
480
481	/*
482	* If the locks are the same as the unlocks, then there must have
483	* been no readers on this index at some point in this function.
484	* But there might be more readers, as a task might have read
485	* the current ->srcu_idx but not yet have incremented its CPU's
486	* ->srcu_lock_count[idx] counter. In fact, it is possible
487	* that most of the tasks have been preempted between fetching
488	* ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there
489	* could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks
490	* in a system whose address space was fully populated with memory.
491	* Call this quantity Nt.
492	*
493	* So suppose that the updater is preempted at this point in the
494	* code for a long time. That now-preempted updater has already
495	* flipped ->srcu_idx (possibly during the preceding grace period),
496	* done an smp_mb() (again, possibly during the preceding grace
497	* period), and summed up the ->srcu_unlock_count[idx] counters.
498	* How many times can a given one of the aforementioned Nt tasks
499	* increment the old ->srcu_idx value's ->srcu_lock_count[idx]
500	* counter, in the absence of nesting?
501	*
502	* It can clearly do so once, given that it has already fetched
503	* the old value of ->srcu_idx and is just about to use that value
504	* to index its increment of ->srcu_lock_count[idx]. But as soon as
505	* it leaves that SRCU read-side critical section, it will increment
506	* ->srcu_unlock_count[idx], which must follow the updater's above
507	* read from that same value. Thus, as soon the reading task does
508	* an smp_mb() and a later fetch from ->srcu_idx, that task will be
509	* guaranteed to get the new index. Except that the increment of
510	* ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the
511	* smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock()
512	* is before the smp_mb(). Thus, that task might not see the new
513	* value of ->srcu_idx until the -second- __srcu_read_lock(),
514	* which in turn means that this task might well increment
515	* ->srcu_lock_count[idx] for the old value of ->srcu_idx twice,
516	* not just once.
517	*
518	* However, it is important to note that a given smp_mb() takes
519	* effect not just for the task executing it, but also for any
520	* later task running on that same CPU.
521	*
522	* That is, there can be almost Nt + Nc further increments of
523	* ->srcu_lock_count[idx] for the old index, where Nc is the number
524	* of CPUs. But this is OK because the size of the task_struct
525	* structure limits the value of Nt and current systems limit Nc
526	* to a few thousand.
527	*
528	* OK, but what about nesting? This does impose a limit on
529	* nesting of half of the size of the task_struct structure
530	* (measured in bytes), which should be sufficient. A late 2022
531	* TREE01 rcutorture run reported this size to be no less than
532	* 9408 bytes, allowing up to 4704 levels of nesting, which is
533	* comfortably beyond excessive. Especially on 64-bit systems,
534	* which are unlikely to be configured with an address space fully
535	* populated with memory, at least not anytime soon.
536	*/
537	return srcu_readers_lock_idx(ssp, idx) == unlocks;
538	}
539
540	/**
541	* srcu_readers_active - returns true if there are readers. and false
542	* otherwise
543	* @ssp: which srcu_struct to count active readers (holding srcu_read_lock).
544	*
545	* Note that this is not an atomic primitive, and can therefore suffer
546	* severe errors when invoked on an active srcu_struct. That said, it
547	* can be useful as an error check at cleanup time.
548	*/
549	static bool srcu_readers_active(struct srcu_struct *ssp)
550	{
551	int cpu;
552	unsigned long sum = `0`;
553
554	for_each_possible_cpu(cpu) {
555	struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);
556
557	sum += atomic_long_read(v: &cpuc->srcu_lock_count[`0`]);
558	sum += atomic_long_read(v: &cpuc->srcu_lock_count[`1`]);
559	sum -= atomic_long_read(v: &cpuc->srcu_unlock_count[`0`]);
560	sum -= atomic_long_read(v: &cpuc->srcu_unlock_count[`1`]);
561	}
562	return sum;
563	}
564
565	/*
566	* We use an adaptive strategy for synchronize_srcu() and especially for
567	* synchronize_srcu_expedited(). We spin for a fixed time period
568	* (defined below, boot time configurable) to allow SRCU readers to exit
569	* their read-side critical sections. If there are still some readers
570	* after one jiffy, we repeatedly block for one jiffy time periods.
571	* The blocking time is increased as the grace-period age increases,
572	* with max blocking time capped at 10 jiffies.
573	*/
574	#define SRCU_DEFAULT_RETRY_CHECK_DELAY 5
575
576	static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
577	module_param(srcu_retry_check_delay, ulong, `0444`);
578
579	#define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending.
580	#define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers.
581
582	#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase
583	// no-delay instances.
584	#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase
585	// no-delay instances.
586
/*
 * Clamp helpers.  These are plain expression macros, not inline
 * functions, because SRCU_UL_CLAMP() is used in a static initializer,
 * which needs a constant expression.  Arguments may be evaluated more
 * than once, so pass only side-effect-free expressions.
 */
#define SRCU_UL_CLAMP_LO(val, low)	((val) > (low) ? (val) : (low))
#define SRCU_UL_CLAMP_HI(val, high)	((val) < (high) ? (val) : (high))
#define SRCU_UL_CLAMP(val, low, high)	SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
590	// per-GP-phase no-delay instances adjusted to allow non-sleeping poll upto
591	// one jiffies time duration. Mult by 2 is done to factor in the srcu_get_delay()
592	// called from process_srcu().
593	#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \
594	(2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
595
596	// Maximum per-GP-phase consecutive no-delay instances.
597	#define SRCU_DEFAULT_MAX_NODELAY_PHASE \
598	SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \
599	SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \
600	SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
601
602	static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
603	module_param(srcu_max_nodelay_phase, ulong, `0444`);
604
605	// Maximum consecutive no-delay instances.
606	#define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \
607	SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
608
609	static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
610	module_param(srcu_max_nodelay, ulong, `0444`);
611
612	/*
613	* Return grace-period delay, zero if there are expedited grace
614	* periods pending, SRCU_INTERVAL otherwise.
615	*/
616	static unsigned long srcu_get_delay(struct srcu_struct *ssp)
617	{
618	unsigned long gpstart;
619	unsigned long j;
620	unsigned long jbase = SRCU_INTERVAL;
621	struct srcu_usage *sup = ssp->srcu_sup;
622
623	if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
624	jbase = `0`;
625	if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) {
626	j = jiffies - `1`;
627	gpstart = READ_ONCE(sup->srcu_gp_start);
628	if (time_after(j, gpstart))
629	jbase += j - gpstart;
630	if (!jbase) {
631	WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + `1`);
632	if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
633	jbase = `1`;
634	}
635	}
636	return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
637	}
638
639	/**
640	* cleanup_srcu_struct - deconstruct a sleep-RCU structure
641	* @ssp: structure to clean up.
642	*
643	* Must invoke this after you are finished using a given srcu_struct that
644	* was initialized via init_srcu_struct(), else you leak memory.
645	*/
646	void cleanup_srcu_struct(struct srcu_struct *ssp)
647	{
648	int cpu;
649	struct srcu_usage *sup = ssp->srcu_sup;
650
651	if (WARN_ON(!srcu_get_delay(ssp)))
652	return; / Just leak it! /
653	if (WARN_ON(srcu_readers_active(ssp)))
654	return; / Just leak it! /
655	flush_delayed_work(dwork: &sup->work);
656	for_each_possible_cpu(cpu) {
657	struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
658
659	del_timer_sync(timer: &sdp->delay_work);
660	flush_work(work: &sdp->work);
661	if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
662	return; / Forgot srcu_barrier(), so just leak it! /
663	}
664	if (WARN_ON(rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)) != SRCU_STATE_IDLE) \|\|
665	WARN_ON(rcu_seq_current(&sup->srcu_gp_seq) != sup->srcu_gp_seq_needed) \|\|
666	WARN_ON(srcu_readers_active(ssp))) {
667	pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
668	__func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)),
669	rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed);
670	return; / Caller forgot to stop doing call_srcu()? /
671	}
672	kfree(objp: sup->node);
673	sup->node = NULL;
674	sup->srcu_size_state = SRCU_SIZE_SMALL;
675	if (!sup->sda_is_static) {
676	free_percpu(pdata: ssp->sda);
677	ssp->sda = NULL;
678	kfree(objp: sup);
679	ssp->srcu_sup = NULL;
680	}
681	}
682	EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
683
#ifdef CONFIG_PROVE_RCU
/*
 * Check for consistent NMI safety.
 *
 * @ssp: srcu_struct being used by a reader.
 * @nmi_safe: true if the caller is using the NMI-safe reader API.
 *
 * The first use on a given CPU records a mask (bit 0 for NMI-unsafe,
 * bit 1 for NMI-safe) in that CPU's ->srcu_nmi_safety.  Later uses on
 * that CPU warn once if their mask differs from the recorded one.
 */
void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe)
{
	int nmi_safe_mask = 1 << nmi_safe;
	int old_nmi_safe_mask;
	struct srcu_data *sdp;

	/* NMI-unsafe use in NMI is a bad sign */
	WARN_ON_ONCE(!nmi_safe && in_nmi());
	sdp = raw_cpu_ptr(ssp->sda);
	old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety);
	if (!old_nmi_safe_mask) {
		WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask);
		return;
	}
	WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask);
}
EXPORT_SYMBOL_GPL(srcu_check_nmi_safety);
#endif /* CONFIG_PROVE_RCU */
706
707	/*
708	* Counts the new reader in the appropriate per-CPU element of the
709	* srcu_struct.
710	* Returns an index that must be passed to the matching srcu_read_unlock().
711	*/
712	int __srcu_read_lock(struct srcu_struct *ssp)
713	{
714	int idx;
715
716	idx = READ_ONCE(ssp->srcu_idx) & `0x1`;
717	this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
718	smp_mb(); / B / / Avoid leaking the critical section. /
719	return idx;
720	}
721	EXPORT_SYMBOL_GPL(__srcu_read_lock);
722
723	/*
724	* Removes the count for the old reader from the appropriate per-CPU
725	* element of the srcu_struct. Note that this may well be a different
726	* CPU than that which was incremented by the corresponding srcu_read_lock().
727	*/
728	void __srcu_read_unlock(struct srcu_struct ssp, int* idx)
729	{
730	smp_mb(); / C / / Avoid leaking the critical section. /
731	this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);
732	}
733	EXPORT_SYMBOL_GPL(__srcu_read_unlock);
734
#ifdef CONFIG_NEED_SRCU_NMI_SAFE

/*
 * Counts the new reader in the appropriate per-CPU element of the
 * srcu_struct, but in an NMI-safe manner using RMW atomics.
 * Returns an index that must be passed to the matching srcu_read_unlock().
 */
int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
{
	int idx;
	struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);

	idx = READ_ONCE(ssp->srcu_idx) & 0x1;
	atomic_long_inc(&sdp->srcu_lock_count[idx]);
	smp_mb__after_atomic(); /* B */  /* Avoid leaking the critical section. */
	return idx;
}
EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);

/*
 * Removes the count for the old reader from the appropriate per-CPU
 * element of the srcu_struct.  Note that this may well be a different
 * CPU than that which was incremented by the corresponding srcu_read_lock().
 *
 * @ssp: srcu_struct the reader was using.
 * @idx: index returned by the matching __srcu_read_lock_nmisafe().
 */
void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
{
	struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);

	smp_mb__before_atomic(); /* C */  /* Avoid leaking the critical section. */
	atomic_long_inc(&sdp->srcu_unlock_count[idx]);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);

#endif // CONFIG_NEED_SRCU_NMI_SAFE
769
770	/*
771	* Start an SRCU grace period.
772	*/
773	static void srcu_gp_start(struct srcu_struct *ssp)
774	{
775	int state;
776
777	lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
778	WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed));
779	WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
780	WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, `0`);
781	smp_mb(); / Order prior store to ->srcu_gp_seq_needed vs. GP start. /
782	rcu_seq_start(sp: &ssp->srcu_sup->srcu_gp_seq);
783	state = rcu_seq_state(s: ssp->srcu_sup->srcu_gp_seq);
784	WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
785	}
786
787
788	static void srcu_delay_timer(struct timer_list *t)
789	{
790	struct srcu_data sdp = container_of(t, struct* srcu_data, delay_work);
791
792	queue_work_on(cpu: sdp->cpu, wq: rcu_gp_wq, work: &sdp->work);
793	}
794
795	static void srcu_queue_delayed_work_on(struct srcu_data *sdp,
796	unsigned long delay)
797	{
798	if (!delay) {
799	queue_work_on(cpu: sdp->cpu, wq: rcu_gp_wq, work: &sdp->work);
800	return;
801	}
802
803	timer_reduce(timer: &sdp->delay_work, expires: jiffies + delay);
804	}
805
/*
 * Schedule callback invocation for the specified srcu_data structure,
 * if possible, on the corresponding CPU.
 *
 * @sdp: srcu_data structure whose callbacks are to be invoked.
 * @delay: delay in jiffies, zero to queue the work immediately.
 */
static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
{
	srcu_queue_delayed_work_on(sdp, delay);
}
814
815	/*
816	* Schedule callback invocation for all srcu_data structures associated
817	* with the specified srcu_node structure that have callbacks for the
818	* just-completed grace period, the one corresponding to idx. If possible,
819	* schedule this invocation on the corresponding CPUs.
820	*/
821	static void srcu_schedule_cbs_snp(struct srcu_struct ssp, struct* srcu_node *snp,
822	unsigned long mask, unsigned long delay)
823	{
824	int cpu;
825
826	for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
827	if (!(mask & (`1UL` << (cpu - snp->grplo))))
828	continue;
829	srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
830	}
831	}
832
833	/*
834	* Note the end of an SRCU grace period. Initiates callback invocation
835	* and starts a new grace period if needed.
836	*
837	* The ->srcu_cb_mutex acquisition does not protect any data, but
838	* instead prevents more than one grace period from starting while we
839	* are initiating callback invocation. This allows the ->srcu_have_cbs[]
840	* array to have a finite number of elements.
841	*/
842	static void srcu_gp_end(struct srcu_struct *ssp)
843	{
844	unsigned long cbdelay = `1`;
845	bool cbs;
846	bool last_lvl;
847	int cpu;
848	unsigned long flags;
849	unsigned long gpseq;
850	int idx;
851	unsigned long mask;
852	struct srcu_data *sdp;
853	unsigned long sgsne;
854	struct srcu_node *snp;
855	int ss_state;
856	struct srcu_usage *sup = ssp->srcu_sup;
857
858	/ Prevent more than one additional grace period. /
859	mutex_lock(&sup->srcu_cb_mutex);
860
861	/ End the current grace period. /
862	spin_lock_irq_rcu_node(sup);
863	idx = rcu_seq_state(s: sup->srcu_gp_seq);
864	WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
865	if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
866	cbdelay = `0`;
867
868	WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns());
869	rcu_seq_end(sp: &sup->srcu_gp_seq);
870	gpseq = rcu_seq_current(sp: &sup->srcu_gp_seq);
871	if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq))
872	WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq);
873	spin_unlock_irq_rcu_node(sup);
874	mutex_unlock(lock: &sup->srcu_gp_mutex);
875	/ A new grace period can start at this point. But only one. /
876
877	/ Initiate callback invocation as needed. /
878	ss_state = smp_load_acquire(&sup->srcu_size_state);
879	if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
880	srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()),
881	delay: cbdelay);
882	} else {
883	idx = rcu_seq_ctr(s: gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
884	srcu_for_each_node_breadth_first(ssp, snp) {
885	spin_lock_irq_rcu_node(snp);
886	cbs = false;
887	last_lvl = snp >= sup->level[rcu_num_lvls - `1`];
888	if (last_lvl)
889	cbs = ss_state < SRCU_SIZE_BIG \|\| snp->srcu_have_cbs[idx] == gpseq;
890	snp->srcu_have_cbs[idx] = gpseq;
891	rcu_seq_set_state(sp: &snp->srcu_have_cbs[idx], newstate: `1`);
892	sgsne = snp->srcu_gp_seq_needed_exp;
893	if (srcu_invl_snp_seq(s: sgsne) \|\| ULONG_CMP_LT(sgsne, gpseq))
894	WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
895	if (ss_state < SRCU_SIZE_BIG)
896	mask = ~`0`;
897	else
898	mask = snp->srcu_data_have_cbs[idx];
899	snp->srcu_data_have_cbs[idx] = `0`;
900	spin_unlock_irq_rcu_node(snp);
901	if (cbs)
902	srcu_schedule_cbs_snp(ssp, snp, mask, delay: cbdelay);
903	}
904	}
905
906	/ Occasionally prevent srcu_data counter wrap. /
907	if (!(gpseq & counter_wrap_check))
908	for_each_possible_cpu(cpu) {
909	sdp = per_cpu_ptr(ssp->sda, cpu);
910	spin_lock_irqsave_rcu_node(sdp, flags);
911	if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + `100`))
912	sdp->srcu_gp_seq_needed = gpseq;
913	if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + `100`))
914	sdp->srcu_gp_seq_needed_exp = gpseq;
915	spin_unlock_irqrestore_rcu_node(sdp, flags);
916	}
917
918	/ Callback initiation done, allow grace periods after next. /
919	mutex_unlock(lock: &sup->srcu_cb_mutex);
920
921	/ Start a new grace period if needed. /
922	spin_lock_irq_rcu_node(sup);
923	gpseq = rcu_seq_current(sp: &sup->srcu_gp_seq);
924	if (!rcu_seq_state(s: gpseq) &&
925	ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) {
926	srcu_gp_start(ssp);
927	spin_unlock_irq_rcu_node(sup);
928	srcu_reschedule(ssp, delay: `0`);
929	} else {
930	spin_unlock_irq_rcu_node(sup);
931	}
932
933	/ Transition to big if needed. /
934	if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) {
935	if (ss_state == SRCU_SIZE_ALLOC)
936	init_srcu_struct_nodes(ssp, GFP_KERNEL);
937	else
938	smp_store_release(&sup->srcu_size_state, ss_state + `1`);
939	}
940	}
941
942	/*
943	* Funnel-locking scheme to scalably mediate many concurrent expedited
944	* grace-period requests. This function is invoked for the first known
945	* expedited request for a grace period that has already been requested,
946	* but without expediting. To start a completely new grace period,
947	* whether expedited or not, use srcu_funnel_gp_start() instead.
948	*/
949	static void srcu_funnel_exp_start(struct srcu_struct ssp, struct* srcu_node *snp,
950	unsigned long s)
951	{
952	unsigned long flags;
953	unsigned long sgsne;
954
955	if (snp)
956	for (; snp != NULL; snp = snp->srcu_parent) {
957	sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
958	if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) \|\|
959	(!srcu_invl_snp_seq(s: sgsne) && ULONG_CMP_GE(sgsne, s)))
960	return;
961	spin_lock_irqsave_rcu_node(snp, flags);
962	sgsne = snp->srcu_gp_seq_needed_exp;
963	if (!srcu_invl_snp_seq(s: sgsne) && ULONG_CMP_GE(sgsne, s)) {
964	spin_unlock_irqrestore_rcu_node(snp, flags);
965	return;
966	}
967	WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
968	spin_unlock_irqrestore_rcu_node(snp, flags);
969	}
970	spin_lock_irqsave_ssp_contention(ssp, flags: &flags);
971	if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s))
972	WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s);
973	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
974	}
975
976	/*
977	* Funnel-locking scheme to scalably mediate many concurrent grace-period
978	* requests. The winner has to do the work of actually starting grace
979	* period s. Losers must either ensure that their desired grace-period
980	* number is recorded on at least their leaf srcu_node structure, or they
981	* must take steps to invoke their own callbacks.
982	*
983	* Note that this function also does the work of srcu_funnel_exp_start(),
984	* in some cases by directly invoking it.
985	*
986	* The srcu read lock should be hold around this function. And s is a seq snap
987	* after holding that lock.
988	*/
989	static void srcu_funnel_gp_start(struct srcu_struct ssp, struct* srcu_data *sdp,
990	unsigned long s, bool do_norm)
991	{
992	unsigned long flags;
993	int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
994	unsigned long sgsne;
995	struct srcu_node *snp;
996	struct srcu_node *snp_leaf;
997	unsigned long snp_seq;
998	struct srcu_usage *sup = ssp->srcu_sup;
999
1000	/ Ensure that snp node tree is fully initialized before traversing it /
1001	if (smp_load_acquire(&sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
1002	snp_leaf = NULL;
1003	else
1004	snp_leaf = sdp->mynode;
1005
1006	if (snp_leaf)
1007	/ Each pass through the loop does one level of the srcu_node tree. /
1008	for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
1009	if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf)
1010	return; / GP already done and CBs recorded. /
1011	spin_lock_irqsave_rcu_node(snp, flags);
1012	snp_seq = snp->srcu_have_cbs[idx];
1013	if (!srcu_invl_snp_seq(s: snp_seq) && ULONG_CMP_GE(snp_seq, s)) {
1014	if (snp == snp_leaf && snp_seq == s)
1015	snp->srcu_data_have_cbs[idx] \|= sdp->grpmask;
1016	spin_unlock_irqrestore_rcu_node(snp, flags);
1017	if (snp == snp_leaf && snp_seq != s) {
1018	srcu_schedule_cbs_sdp(sdp, delay: do_norm ? SRCU_INTERVAL : `0`);
1019	return;
1020	}
1021	if (!do_norm)
1022	srcu_funnel_exp_start(ssp, snp, s);
1023	return;
1024	}
1025	snp->srcu_have_cbs[idx] = s;
1026	if (snp == snp_leaf)
1027	snp->srcu_data_have_cbs[idx] \|= sdp->grpmask;
1028	sgsne = snp->srcu_gp_seq_needed_exp;
1029	if (!do_norm && (srcu_invl_snp_seq(s: sgsne) \|\| ULONG_CMP_LT(sgsne, s)))
1030	WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
1031	spin_unlock_irqrestore_rcu_node(snp, flags);
1032	}
1033
1034	/ Top of tree, must ensure the grace period will be started. /
1035	spin_lock_irqsave_ssp_contention(ssp, flags: &flags);
1036	if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) {
1037	/*
1038	* Record need for grace period s. Pair with load
1039	* acquire setting up for initialization.
1040	*/
1041	smp_store_release(&sup->srcu_gp_seq_needed, s); /^^^/
1042	}
1043	if (!do_norm && ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, s))
1044	WRITE_ONCE(sup->srcu_gp_seq_needed_exp, s);
1045
1046	/ If grace period not already in progress, start it. /
1047	if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) &&
1048	rcu_seq_state(s: sup->srcu_gp_seq) == SRCU_STATE_IDLE) {
1049	WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed));
1050	srcu_gp_start(ssp);
1051
1052	// And how can that list_add() in the "else" clause
1053	// possibly be safe for concurrent execution? Well,
1054	// it isn't. And it does not have to be. After all, it
1055	// can only be executed during early boot when there is only
1056	// the one boot CPU running with interrupts still disabled.
1057	if (likely(srcu_init_done))
1058	queue_delayed_work(wq: rcu_gp_wq, dwork: &sup->work,
1059	delay: !!srcu_get_delay(ssp));
1060	else if (list_empty(head: &sup->work.work.entry))
1061	list_add(new: &sup->work.work.entry, head: &srcu_boot_list);
1062	}
1063	spin_unlock_irqrestore_rcu_node(sup, flags);
1064	}
1065
1066	/*
1067	* Wait until all readers counted by array index idx complete, but
1068	* loop an additional time if there is an expedited grace period pending.
1069	* The caller must ensure that ->srcu_idx is not changed while checking.
1070	*/
1071	static bool try_check_zero(struct srcu_struct ssp, int* idx, int trycount)
1072	{
1073	unsigned long curdelay;
1074
1075	curdelay = !srcu_get_delay(ssp);
1076
1077	for (;;) {
1078	if (srcu_readers_active_idx_check(ssp, idx))
1079	return true;
1080	if ((--trycount + curdelay) <= `0`)
1081	return false;
1082	udelay(srcu_retry_check_delay);
1083	}
1084	}
1085
1086	/*
1087	* Increment the ->srcu_idx counter so that future SRCU readers will
1088	* use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
1089	* us to wait for pre-existing readers in a starvation-free manner.
1090	*/
1091	static void srcu_flip(struct srcu_struct *ssp)
1092	{
1093	/*
1094	* Because the flip of ->srcu_idx is executed only if the
1095	* preceding call to srcu_readers_active_idx_check() found that
1096	* the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched
1097	* and because that summing uses atomic_long_read(), there is
1098	* ordering due to a control dependency between that summing and
1099	* the WRITE_ONCE() in this call to srcu_flip(). This ordering
1100	* ensures that if this updater saw a given reader's increment from
1101	* __srcu_read_lock(), that reader was using a value of ->srcu_idx
1102	* from before the previous call to srcu_flip(), which should be
1103	* quite rare. This ordering thus helps forward progress because
1104	* the grace period could otherwise be delayed by additional
1105	* calls to __srcu_read_lock() using that old (soon to be new)
1106	* value of ->srcu_idx.
1107	*
1108	* This sum-equality check and ordering also ensures that if
1109	* a given call to __srcu_read_lock() uses the new value of
1110	* ->srcu_idx, this updater's earlier scans cannot have seen
1111	* that reader's increments, which is all to the good, because
1112	* this grace period need not wait on that reader. After all,
1113	* if those earlier scans had seen that reader, there would have
1114	* been a sum mismatch and this code would not be reached.
1115	*
1116	* This means that the following smp_mb() is redundant, but
1117	* it stays until either (1) Compilers learn about this sort of
1118	* control dependency or (2) Some production workload running on
1119	* a production system is unduly delayed by this slowpath smp_mb().
1120	*/
1121	smp_mb(); / E / / Pairs with B and C. /
1122
1123	WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + `1`); // Flip the counter.
1124
1125	/*
1126	* Ensure that if the updater misses an __srcu_read_unlock()
1127	* increment, that task's __srcu_read_lock() following its next
1128	* __srcu_read_lock() or __srcu_read_unlock() will see the above
1129	* counter update. Note that both this memory barrier and the
1130	* one in srcu_readers_active_idx_check() provide the guarantee
1131	* for __srcu_read_lock().
1132	*/
1133	smp_mb(); / D / / Pairs with C. /
1134	}
1135
1136	/*
1137	* If SRCU is likely idle, return true, otherwise return false.
1138	*
1139	* Note that it is OK for several current from-idle requests for a new
1140	* grace period from idle to specify expediting because they will all end
1141	* up requesting the same grace period anyhow. So no loss.
1142	*
1143	* Note also that if any CPU (including the current one) is still invoking
1144	* callbacks, this function will nevertheless say "idle". This is not
1145	* ideal, but the overhead of checking all CPUs' callback lists is even
1146	* less ideal, especially on large systems. Furthermore, the wakeup
1147	* can happen before the callback is fully removed, so we have no choice
1148	* but to accept this type of error.
1149	*
1150	* This function is also subject to counter-wrap errors, but let's face
1151	* it, if this function was preempted for enough time for the counters
1152	* to wrap, it really doesn't matter whether or not we expedite the grace
1153	* period. The extra overhead of a needlessly expedited grace period is
1154	* negligible when amortized over that time period, and the extra latency
1155	* of a needlessly non-expedited grace period is similarly negligible.
1156	*/
1157	static bool srcu_might_be_idle(struct srcu_struct *ssp)
1158	{
1159	unsigned long curseq;
1160	unsigned long flags;
1161	struct srcu_data *sdp;
1162	unsigned long t;
1163	unsigned long tlast;
1164
1165	check_init_srcu_struct(ssp);
1166	/ If the local srcu_data structure has callbacks, not idle. /
1167	sdp = raw_cpu_ptr(ssp->sda);
1168	spin_lock_irqsave_rcu_node(sdp, flags);
1169	if (rcu_segcblist_pend_cbs(rsclp: &sdp->srcu_cblist)) {
1170	spin_unlock_irqrestore_rcu_node(sdp, flags);
1171	return false; / Callbacks already present, so not idle. /
1172	}
1173	spin_unlock_irqrestore_rcu_node(sdp, flags);
1174
1175	/*
1176	* No local callbacks, so probabilistically probe global state.
1177	* Exact information would require acquiring locks, which would
1178	* kill scalability, hence the probabilistic nature of the probe.
1179	*/
1180
1181	/ First, see if enough time has passed since the last GP. /
1182	t = ktime_get_mono_fast_ns();
1183	tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end);
1184	if (exp_holdoff == `0` \|\|
1185	time_in_range_open(t, tlast, tlast + exp_holdoff))
1186	return false; / Too soon after last GP. /
1187
1188	/ Next, check for probable idleness. /
1189	curseq = rcu_seq_current(sp: &ssp->srcu_sup->srcu_gp_seq);
1190	smp_mb(); / Order ->srcu_gp_seq with ->srcu_gp_seq_needed. /
1191	if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed)))
1192	return false; / Grace period in progress, so not idle. /
1193	smp_mb(); / Order ->srcu_gp_seq with prior access. /
1194	if (curseq != rcu_seq_current(sp: &ssp->srcu_sup->srcu_gp_seq))
1195	return false; / GP # changed, so not idle. /
1196	return true; / With reasonable probability, idle! /
1197	}
1198
/*
 * SRCU callback function to leak a callback.
 *
 * Intentionally empty: __call_srcu() installs this as ->func on an
 * rcu_head that looks like a duplicate call_srcu(), so that nothing is
 * done if that callback is ever invoked.
 */
static void srcu_leak_callback(struct rcu_head *rhp)
{
}
1205
1206	/*
1207	* Start an SRCU grace period, and also queue the callback if non-NULL.
1208	*/
1209	static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
1210	struct rcu_head *rhp, bool do_norm)
1211	{
1212	unsigned long flags;
1213	int idx;
1214	bool needexp = false;
1215	bool needgp = false;
1216	unsigned long s;
1217	struct srcu_data *sdp;
1218	struct srcu_node *sdp_mynode;
1219	int ss_state;
1220
1221	check_init_srcu_struct(ssp);
1222	/*
1223	* While starting a new grace period, make sure we are in an
1224	* SRCU read-side critical section so that the grace-period
1225	* sequence number cannot wrap around in the meantime.
1226	*/
1227	idx = __srcu_read_lock_nmisafe(ssp);
1228	ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state);
1229	if (ss_state < SRCU_SIZE_WAIT_CALL)
1230	sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
1231	else
1232	sdp = raw_cpu_ptr(ssp->sda);
1233	spin_lock_irqsave_sdp_contention(sdp, flags: &flags);
1234	if (rhp)
1235	rcu_segcblist_enqueue(rsclp: &sdp->srcu_cblist, rhp);
1236	/*
1237	* It's crucial to capture the snapshot 's' for acceleration before
1238	* reading the current gp_seq that is used for advancing. This is
1239	* essential because if the acceleration snapshot is taken after a
1240	* failed advancement attempt, there's a risk that a grace period may
1241	* conclude and a new one may start in the interim. If the snapshot is
1242	* captured after this sequence of events, the acceleration snapshot 's'
1243	* could be excessively advanced, leading to acceleration failure.
1244	* In such a scenario, an 'acceleration leak' can occur, where new
1245	* callbacks become indefinitely stuck in the RCU_NEXT_TAIL segment.
1246	* Also note that encountering advancing failures is a normal
1247	* occurrence when the grace period for RCU_WAIT_TAIL is in progress.
1248	*
1249	* To see this, consider the following events which occur if
1250	* rcu_seq_snap() were to be called after advance:
1251	*
1252	* 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the
1253	* RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8).
1254	*
1255	* 2) The grace period for RCU_WAIT_TAIL is seen as started but not
1256	* completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1.
1257	*
1258	* 3) This value is passed to rcu_segcblist_advance() which can't move
1259	* any segment forward and fails.
1260	*
1261	* 4) srcu_gp_start_if_needed() still proceeds with callback acceleration.
1262	* But then the call to rcu_seq_snap() observes the grace period for the
1263	* RCU_WAIT_TAIL segment as completed and the subsequent one for the
1264	* RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1)
1265	* so it returns a snapshot of the next grace period, which is X + 12.
1266	*
1267	* 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the
1268	* freshly enqueued callback in RCU_NEXT_TAIL can't move to
1269	* RCU_NEXT_READY_TAIL which already has callbacks for a previous grace
1270	* period (gp_num = X + 8). So acceleration fails.
1271	*/
1272	s = rcu_seq_snap(sp: &ssp->srcu_sup->srcu_gp_seq);
1273	if (rhp) {
1274	rcu_segcblist_advance(rsclp: &sdp->srcu_cblist,
1275	seq: rcu_seq_current(sp: &ssp->srcu_sup->srcu_gp_seq));
1276	/*
1277	* Acceleration can never fail because the base current gp_seq
1278	* used for acceleration is <= the value of gp_seq used for
1279	* advancing. This means that RCU_NEXT_TAIL segment will
1280	* always be able to be emptied by the acceleration into the
1281	* RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments.
1282	*/
1283	WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));
1284	}
1285	if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
1286	sdp->srcu_gp_seq_needed = s;
1287	needgp = true;
1288	}
1289	if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
1290	sdp->srcu_gp_seq_needed_exp = s;
1291	needexp = true;
1292	}
1293	spin_unlock_irqrestore_rcu_node(sdp, flags);
1294
1295	/ Ensure that snp node tree is fully initialized before traversing it /
1296	if (ss_state < SRCU_SIZE_WAIT_BARRIER)
1297	sdp_mynode = NULL;
1298	else
1299	sdp_mynode = sdp->mynode;
1300
1301	if (needgp)
1302	srcu_funnel_gp_start(ssp, sdp, s, do_norm);
1303	else if (needexp)
1304	srcu_funnel_exp_start(ssp, snp: sdp_mynode, s);
1305	__srcu_read_unlock_nmisafe(ssp, idx);
1306	return s;
1307	}
1308
1309	/*
1310	* Enqueue an SRCU callback on the srcu_data structure associated with
1311	* the current CPU and the specified srcu_struct structure, initiating
1312	* grace-period processing if it is not already running.
1313	*
1314	* Note that all CPUs must agree that the grace period extended beyond
1315	* all pre-existing SRCU read-side critical section. On systems with
1316	* more than one CPU, this means that when "func()" is invoked, each CPU
1317	* is guaranteed to have executed a full memory barrier since the end of
1318	* its last corresponding SRCU read-side critical section whose beginning
1319	* preceded the call to call_srcu(). It also means that each CPU executing
1320	* an SRCU read-side critical section that continues beyond the start of
1321	* "func()" must have executed a memory barrier after the call_srcu()
1322	* but before the beginning of that SRCU read-side critical section.
1323	* Note that these guarantees include CPUs that are offline, idle, or
1324	* executing in user mode, as well as CPUs that are executing in the kernel.
1325	*
1326	* Furthermore, if CPU A invoked call_srcu() and CPU B invoked the
1327	* resulting SRCU callback function "func()", then both CPU A and CPU
1328	* B are guaranteed to execute a full memory barrier during the time
1329	* interval between the call to call_srcu() and the invocation of "func()".
1330	* This guarantee applies even if CPU A and CPU B are the same CPU (but
1331	* again only if the system has more than one CPU).
1332	*
1333	* Of course, these guarantees apply only for invocations of call_srcu(),
1334	* srcu_read_lock(), and srcu_read_unlock() that are all passed the same
1335	* srcu_struct structure.
1336	*/
1337	static void __call_srcu(struct srcu_struct ssp, struct* rcu_head *rhp,
1338	rcu_callback_t func, bool do_norm)
1339	{
1340	if (debug_rcu_head_queue(head: rhp)) {
1341	/ Probable double call_srcu(), so leak the callback. /
1342	WRITE_ONCE(rhp->func, srcu_leak_callback);
1343	WARN_ONCE(`1`, "call_srcu(): Leaked duplicate callback\n");
1344	return;
1345	}
1346	rhp->func = func;
1347	(void)srcu_gp_start_if_needed(ssp, rhp, do_norm);
1348	}
1349
1350	/**
1351	* call_srcu() - Queue a callback for invocation after an SRCU grace period
1352	* @ssp: srcu_struct in queue the callback
1353	* @rhp: structure to be used for queueing the SRCU callback.
1354	* @func: function to be invoked after the SRCU grace period
1355	*
1356	* The callback function will be invoked some time after a full SRCU
1357	* grace period elapses, in other words after all pre-existing SRCU
1358	* read-side critical sections have completed. However, the callback
1359	* function might well execute concurrently with other SRCU read-side
1360	* critical sections that started after call_srcu() was invoked. SRCU
1361	* read-side critical sections are delimited by srcu_read_lock() and
1362	* srcu_read_unlock(), and may be nested.
1363	*
1364	* The callback will be invoked from process context, but must nevertheless
1365	* be fast and must not block.
1366	*/
1367	void call_srcu(struct srcu_struct ssp, struct* rcu_head *rhp,
1368	rcu_callback_t func)
1369	{
1370	__call_srcu(ssp, rhp, func, do_norm: true);
1371	}
1372	EXPORT_SYMBOL_GPL(call_srcu);
1373
1374	/*
1375	* Helper function for synchronize_srcu() and synchronize_srcu_expedited().
1376	*/
1377	static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
1378	{
1379	struct rcu_synchronize rcu;
1380
1381	srcu_lock_sync(map: &ssp->dep_map);
1382
1383	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) \|\|
1384	lock_is_held(&rcu_bh_lock_map) \|\|
1385	lock_is_held(&rcu_lock_map) \|\|
1386	lock_is_held(&rcu_sched_lock_map),
1387	"Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
1388
1389	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
1390	return;
1391	might_sleep();
1392	check_init_srcu_struct(ssp);
1393	init_completion(x: &rcu.completion);
1394	init_rcu_head_on_stack(head: &rcu.head);
1395	__call_srcu(ssp, rhp: &rcu.head, func: wakeme_after_rcu, do_norm);
1396	wait_for_completion(&rcu.completion);
1397	destroy_rcu_head_on_stack(head: &rcu.head);
1398
1399	/*
1400	* Make sure that later code is ordered after the SRCU grace
1401	* period. This pairs with the spin_lock_irq_rcu_node()
1402	* in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
1403	* because the current CPU might have been totally uninvolved with
1404	* (and thus unordered against) that grace period.
1405	*/
1406	smp_mb();
1407	}
1408
/**
 * synchronize_srcu_expedited - Brute-force SRCU grace period
 * @ssp: srcu_struct with which to synchronize.
 *
 * Wait for an SRCU grace period to elapse, but be more aggressive about
 * spinning rather than blocking when waiting.
 *
 * Note that synchronize_srcu_expedited() has the same deadlock and
 * memory-ordering properties as does synchronize_srcu().
 */
void synchronize_srcu_expedited(struct srcu_struct *ssp)
{
	/* Expediting is dropped whenever rcu_gp_is_normal() says so. */
	__synchronize_srcu(ssp, rcu_gp_is_normal());
}
EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
1424
1425	/**
1426	* synchronize_srcu - wait for prior SRCU read-side critical-section completion
1427	* @ssp: srcu_struct with which to synchronize.
1428	*
1429	* Wait for the count to drain to zero of both indexes. To avoid the
1430	* possible starvation of synchronize_srcu(), it waits for the count of
1431	* the index=((->srcu_idx & 1) ^ 1) to drain to zero at first,
1432	* and then flip the srcu_idx and wait for the count of the other index.
1433	*
1434	* Can block; must be called from process context.
1435	*
1436	* Note that it is illegal to call synchronize_srcu() from the corresponding
1437	* SRCU read-side critical section; doing so will result in deadlock.
1438	* However, it is perfectly legal to call synchronize_srcu() on one
1439	* srcu_struct from some other srcu_struct's read-side critical section,
1440	* as long as the resulting graph of srcu_structs is acyclic.
1441	*
1442	* There are memory-ordering constraints implied by synchronize_srcu().
1443	* On systems with more than one CPU, when synchronize_srcu() returns,
1444	* each CPU is guaranteed to have executed a full memory barrier since
1445	* the end of its last corresponding SRCU read-side critical section
1446	* whose beginning preceded the call to synchronize_srcu(). In addition,
1447	* each CPU having an SRCU read-side critical section that extends beyond
1448	* the return from synchronize_srcu() is guaranteed to have executed a
1449	* full memory barrier after the beginning of synchronize_srcu() and before
1450	* the beginning of that SRCU read-side critical section. Note that these
1451	* guarantees include CPUs that are offline, idle, or executing in user mode,
1452	* as well as CPUs that are executing in the kernel.
1453	*
1454	* Furthermore, if CPU A invoked synchronize_srcu(), which returned
1455	* to its caller on CPU B, then both CPU A and CPU B are guaranteed
1456	* to have executed a full memory barrier during the execution of
1457	* synchronize_srcu(). This guarantee applies even if CPU A and CPU B
1458	* are the same CPU, but again only if the system has more than one CPU.
1459	*
1460	* Of course, these memory-ordering guarantees apply only when
1461	* synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
1462	* passed the same srcu_struct structure.
1463	*
1464	* Implementation of these memory-ordering guarantees is similar to
1465	* that of synchronize_rcu().
1466	*
1467	* If SRCU is likely idle, expedite the first request. This semantic
1468	* was provided by Classic SRCU, and is relied upon by its users, so TREE
1469	* SRCU must also provide it. Note that detecting idleness is heuristic
1470	* and subject to both false positives and negatives.
1471	*/
1472	void synchronize_srcu(struct srcu_struct *ssp)
1473	{
1474	if (srcu_might_be_idle(ssp) \|\| rcu_gp_is_expedited())
1475	synchronize_srcu_expedited(ssp);
1476	else
1477	__synchronize_srcu(ssp, do_norm: true);
1478	}
1479	EXPORT_SYMBOL_GPL(synchronize_srcu);
1480
1481	/**
1482	* get_state_synchronize_srcu - Provide an end-of-grace-period cookie
1483	* @ssp: srcu_struct to provide cookie for.
1484	*
1485	* This function returns a cookie that can be passed to
1486	* poll_state_synchronize_srcu(), which will return true if a full grace
1487	* period has elapsed in the meantime. It is the caller's responsibility
1488	* to make sure that grace period happens, for example, by invoking
1489	* call_srcu() after return from get_state_synchronize_srcu().
1490	*/
1491	unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
1492	{
1493	// Any prior manipulation of SRCU-protected data must happen
1494	// before the load from ->srcu_gp_seq.
1495	smp_mb();
1496	return rcu_seq_snap(sp: &ssp->srcu_sup->srcu_gp_seq);
1497	}
1498	EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
1499
1500	/**
1501	* start_poll_synchronize_srcu - Provide cookie and start grace period
1502	* @ssp: srcu_struct to provide cookie for.
1503	*
1504	* This function returns a cookie that can be passed to
1505	* poll_state_synchronize_srcu(), which will return true if a full grace
1506	* period has elapsed in the meantime. Unlike get_state_synchronize_srcu(),
1507	* this function also ensures that any needed SRCU grace period will be
1508	* started. This convenience does come at a cost in terms of CPU overhead.
1509	*/
1510	unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
1511	{
1512	return srcu_gp_start_if_needed(ssp, NULL, do_norm: true);
1513	}
1514	EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
1515
1516	/**
1517	* poll_state_synchronize_srcu - Has cookie's grace period ended?
1518	* @ssp: srcu_struct to provide cookie for.
1519	* @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu().
1520	*
1521	* This function takes the cookie that was returned from either
1522	* get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and
1523	* returns @true if an SRCU grace period elapsed since the time that the
1524	* cookie was created.
1525	*
1526	* Because cookies are finite in size, wrapping/overflow is possible.
1527	* This is more pronounced on 32-bit systems where cookies are 32 bits,
1528	* where in theory wrapping could happen in about 14 hours assuming
1529	* 25-microsecond expedited SRCU grace periods. However, a more likely
1530	* overflow lower bound is on the order of 24 days in the case of
1531	* one-millisecond SRCU grace periods. Of course, wrapping in a 64-bit
1532	* system requires geologic timespans, as in more than seven million years
1533	* even for expedited SRCU grace periods.
1534	*
1535	* Wrapping/overflow is much more of an issue for CONFIG_SMP=n systems
1536	* that also have CONFIG_PREEMPTION=n, which selects Tiny SRCU. This uses
1537	* a 16-bit cookie, which rcutorture routinely wraps in a matter of a
1538	* few minutes. If this proves to be a problem, this counter will be
1539	* expanded to the same size as for Tree SRCU.
1540	*/
1541	bool poll_state_synchronize_srcu(struct srcu_struct ssp, unsigned* long cookie)
1542	{
1543	if (!rcu_seq_done(sp: &ssp->srcu_sup->srcu_gp_seq, s: cookie))
1544	return false;
1545	// Ensure that the end of the SRCU grace period happens before
1546	// any subsequent code that the caller might execute.
1547	smp_mb(); // ^^^
1548	return true;
1549	}
1550	EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
1551
1552	/*
1553	* Callback function for srcu_barrier() use.
1554	*/
1555	static void srcu_barrier_cb(struct rcu_head *rhp)
1556	{
1557	struct srcu_data *sdp;
1558	struct srcu_struct *ssp;
1559
1560	sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
1561	ssp = sdp->ssp;
1562	if (atomic_dec_and_test(v: &ssp->srcu_sup->srcu_barrier_cpu_cnt))
1563	complete(&ssp->srcu_sup->srcu_barrier_completion);
1564	}
1565
1566	/*
1567	* Enqueue an srcu_barrier() callback on the specified srcu_data
1568	* structure's ->cblist. but only if that ->cblist already has at least one
1569	* callback enqueued. Note that if a CPU already has callbacks enqueue,
1570	* it must have already registered the need for a future grace period,
1571	* so all we need do is enqueue a callback that will use the same grace
1572	* period as the last callback already in the queue.
1573	*/
1574	static void srcu_barrier_one_cpu(struct srcu_struct ssp, struct* srcu_data *sdp)
1575	{
1576	spin_lock_irq_rcu_node(sdp);
1577	atomic_inc(v: &ssp->srcu_sup->srcu_barrier_cpu_cnt);
1578	sdp->srcu_barrier_head.func = srcu_barrier_cb;
1579	debug_rcu_head_queue(head: &sdp->srcu_barrier_head);
1580	if (!rcu_segcblist_entrain(rsclp: &sdp->srcu_cblist,
1581	rhp: &sdp->srcu_barrier_head)) {
1582	debug_rcu_head_unqueue(head: &sdp->srcu_barrier_head);
1583	atomic_dec(v: &ssp->srcu_sup->srcu_barrier_cpu_cnt);
1584	}
1585	spin_unlock_irq_rcu_node(sdp);
1586	}
1587
1588	/**
1589	* srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
1590	* @ssp: srcu_struct on which to wait for in-flight callbacks.
1591	*/
1592	void srcu_barrier(struct srcu_struct *ssp)
1593	{
1594	int cpu;
1595	int idx;
1596	unsigned long s = rcu_seq_snap(sp: &ssp->srcu_sup->srcu_barrier_seq);
1597
1598	check_init_srcu_struct(ssp);
1599	mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex);
1600	if (rcu_seq_done(sp: &ssp->srcu_sup->srcu_barrier_seq, s)) {
1601	smp_mb(); / Force ordering following return. /
1602	mutex_unlock(lock: &ssp->srcu_sup->srcu_barrier_mutex);
1603	return; / Someone else did our work for us. /
1604	}
1605	rcu_seq_start(sp: &ssp->srcu_sup->srcu_barrier_seq);
1606	init_completion(x: &ssp->srcu_sup->srcu_barrier_completion);
1607
1608	/ Initial count prevents reaching zero until all CBs are posted. /
1609	atomic_set(v: &ssp->srcu_sup->srcu_barrier_cpu_cnt, i: `1`);
1610
1611	idx = __srcu_read_lock_nmisafe(ssp);
1612	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
1613	srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id()));
1614	else
1615	for_each_possible_cpu(cpu)
1616	srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
1617	__srcu_read_unlock_nmisafe(ssp, idx);
1618
1619	/ Remove the initial count, at which point reaching zero can happen. /
1620	if (atomic_dec_and_test(v: &ssp->srcu_sup->srcu_barrier_cpu_cnt))
1621	complete(&ssp->srcu_sup->srcu_barrier_completion);
1622	wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion);
1623
1624	rcu_seq_end(sp: &ssp->srcu_sup->srcu_barrier_seq);
1625	mutex_unlock(lock: &ssp->srcu_sup->srcu_barrier_mutex);
1626	}
1627	EXPORT_SYMBOL_GPL(srcu_barrier);
1628
1629	/**
1630	* srcu_batches_completed - return batches completed.
1631	* @ssp: srcu_struct on which to report batch completion.
1632	*
1633	* Report the number of batches, correlated with, but not necessarily
1634	* precisely the same as, the number of grace periods that have elapsed.
1635	*/
1636	unsigned long srcu_batches_completed(struct srcu_struct *ssp)
1637	{
1638	return READ_ONCE(ssp->srcu_idx);
1639	}
1640	EXPORT_SYMBOL_GPL(srcu_batches_completed);
1641
1642	/*
1643	* Core SRCU state machine. Push state bits of ->srcu_gp_seq
1644	* to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has
1645	* completed in that state.
1646	*/
1647	static void srcu_advance_state(struct srcu_struct *ssp)
1648	{
1649	int idx;
1650
1651	mutex_lock(&ssp->srcu_sup->srcu_gp_mutex);
1652
1653	/*
1654	* Because readers might be delayed for an extended period after
1655	* fetching ->srcu_idx for their index, at any point in time there
1656	* might well be readers using both idx=0 and idx=1. We therefore
1657	* need to wait for readers to clear from both index values before
1658	* invoking a callback.
1659	*
1660	* The load-acquire ensures that we see the accesses performed
1661	* by the prior grace period.
1662	*/
1663	idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); / ^^^ /
1664	if (idx == SRCU_STATE_IDLE) {
1665	spin_lock_irq_rcu_node(ssp->srcu_sup);
1666	if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
1667	WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq));
1668	spin_unlock_irq_rcu_node(ssp->srcu_sup);
1669	mutex_unlock(lock: &ssp->srcu_sup->srcu_gp_mutex);
1670	return;
1671	}
1672	idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq));
1673	if (idx == SRCU_STATE_IDLE)
1674	srcu_gp_start(ssp);
1675	spin_unlock_irq_rcu_node(ssp->srcu_sup);
1676	if (idx != SRCU_STATE_IDLE) {
1677	mutex_unlock(lock: &ssp->srcu_sup->srcu_gp_mutex);
1678	return; / Someone else started the grace period. /
1679	}
1680	}
1681
1682	if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
1683	idx = `1` ^ (ssp->srcu_idx & `1`);
1684	if (!try_check_zero(ssp, idx, trycount: `1`)) {
1685	mutex_unlock(lock: &ssp->srcu_sup->srcu_gp_mutex);
1686	return; / readers present, retry later. /
1687	}
1688	srcu_flip(ssp);
1689	spin_lock_irq_rcu_node(ssp->srcu_sup);
1690	rcu_seq_set_state(sp: &ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2);
1691	ssp->srcu_sup->srcu_n_exp_nodelay = `0`;
1692	spin_unlock_irq_rcu_node(ssp->srcu_sup);
1693	}
1694
1695	if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
1696
1697	/*
1698	* SRCU read-side critical sections are normally short,
1699	* so check at least twice in quick succession after a flip.
1700	*/
1701	idx = `1` ^ (ssp->srcu_idx & `1`);
1702	if (!try_check_zero(ssp, idx, trycount: `2`)) {
1703	mutex_unlock(lock: &ssp->srcu_sup->srcu_gp_mutex);
1704	return; / readers present, retry later. /
1705	}
1706	ssp->srcu_sup->srcu_n_exp_nodelay = `0`;
1707	srcu_gp_end(ssp); / Releases ->srcu_gp_mutex. /
1708	}
1709	}
1710
1711	/*
1712	* Invoke a limited number of SRCU callbacks that have passed through
1713	* their grace period. If there are more to do, SRCU will reschedule
1714	* the workqueue. Note that needed memory barriers have been executed
1715	* in this task's context by srcu_readers_active_idx_check().
1716	*/
1717	static void srcu_invoke_callbacks(struct work_struct *work)
1718	{
1719	long len;
1720	bool more;
1721	struct rcu_cblist ready_cbs;
1722	struct rcu_head *rhp;
1723	struct srcu_data *sdp;
1724	struct srcu_struct *ssp;
1725
1726	sdp = container_of(work, struct srcu_data, work);
1727
1728	ssp = sdp->ssp;
1729	rcu_cblist_init(rclp: &ready_cbs);
1730	spin_lock_irq_rcu_node(sdp);
1731	WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
1732	rcu_segcblist_advance(rsclp: &sdp->srcu_cblist,
1733	seq: rcu_seq_current(sp: &ssp->srcu_sup->srcu_gp_seq));
1734	/*
1735	* Although this function is theoretically re-entrant, concurrent
1736	* callbacks invocation is disallowed to avoid executing an SRCU barrier
1737	* too early.
1738	*/
1739	if (sdp->srcu_cblist_invoking \|\|
1740	!rcu_segcblist_ready_cbs(rsclp: &sdp->srcu_cblist)) {
1741	spin_unlock_irq_rcu_node(sdp);
1742	return; / Someone else on the job or nothing to do. /
1743	}
1744
1745	/ We are on the job! Extract and invoke ready callbacks. /
1746	sdp->srcu_cblist_invoking = true;
1747	rcu_segcblist_extract_done_cbs(rsclp: &sdp->srcu_cblist, rclp: &ready_cbs);
1748	len = ready_cbs.len;
1749	spin_unlock_irq_rcu_node(sdp);
1750	rhp = rcu_cblist_dequeue(rclp: &ready_cbs);
1751	for (; rhp != NULL; rhp = rcu_cblist_dequeue(rclp: &ready_cbs)) {
1752	debug_rcu_head_unqueue(head: rhp);
1753	debug_rcu_head_callback(rhp);
1754	local_bh_disable();
1755	rhp->func(rhp);
1756	local_bh_enable();
1757	}
1758	WARN_ON_ONCE(ready_cbs.len);
1759
1760	/*
1761	* Update counts, accelerate new callbacks, and if needed,
1762	* schedule another round of callback invocation.
1763	*/
1764	spin_lock_irq_rcu_node(sdp);
1765	rcu_segcblist_add_len(rsclp: &sdp->srcu_cblist, v: -len);
1766	sdp->srcu_cblist_invoking = false;
1767	more = rcu_segcblist_ready_cbs(rsclp: &sdp->srcu_cblist);
1768	spin_unlock_irq_rcu_node(sdp);
1769	/ An SRCU barrier or callbacks from previous nesting work pending /
1770	if (more)
1771	srcu_schedule_cbs_sdp(sdp, delay: `0`);
1772	}
1773
1774	/*
1775	* Finished one round of SRCU grace period. Start another if there are
1776	* more SRCU callbacks queued, otherwise put SRCU into not-running state.
1777	*/
1778	static void srcu_reschedule(struct srcu_struct ssp, unsigned* long delay)
1779	{
1780	bool pushgp = true;
1781
1782	spin_lock_irq_rcu_node(ssp->srcu_sup);
1783	if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
1784	if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) {
1785	/ All requests fulfilled, time to go idle. /
1786	pushgp = false;
1787	}
1788	} else if (!rcu_seq_state(s: ssp->srcu_sup->srcu_gp_seq)) {
1789	/ Outstanding request and no GP. Start one. /
1790	srcu_gp_start(ssp);
1791	}
1792	spin_unlock_irq_rcu_node(ssp->srcu_sup);
1793
1794	if (pushgp)
1795	queue_delayed_work(wq: rcu_gp_wq, dwork: &ssp->srcu_sup->work, delay);
1796	}
1797
1798	/*
1799	* This is the work-queue function that handles SRCU grace periods.
1800	*/
1801	static void process_srcu(struct work_struct *work)
1802	{
1803	unsigned long curdelay;
1804	unsigned long j;
1805	struct srcu_struct *ssp;
1806	struct srcu_usage *sup;
1807
1808	sup = container_of(work, struct srcu_usage, work.work);
1809	ssp = sup->srcu_ssp;
1810
1811	srcu_advance_state(ssp);
1812	curdelay = srcu_get_delay(ssp);
1813	if (curdelay) {
1814	WRITE_ONCE(sup->reschedule_count, `0`);
1815	} else {
1816	j = jiffies;
1817	if (READ_ONCE(sup->reschedule_jiffies) == j) {
1818	WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + `1`);
1819	if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay)
1820	curdelay = `1`;
1821	} else {
1822	WRITE_ONCE(sup->reschedule_count, `1`);
1823	WRITE_ONCE(sup->reschedule_jiffies, j);
1824	}
1825	}
1826	srcu_reschedule(ssp, delay: curdelay);
1827	}
1828
1829	void srcutorture_get_gp_data(enum rcutorture_type test_type,
1830	struct srcu_struct ssp, int* *flags,
1831	unsigned long *gp_seq)
1832	{
1833	if (test_type != SRCU_FLAVOR)
1834	return;
1835	*flags = `0`;
1836	*gp_seq = rcu_seq_current(sp: &ssp->srcu_sup->srcu_gp_seq);
1837	}
1838	EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
1839
/*
 * Printable names for ->srcu_size_state values, indexed by state.  The
 * final entry is the fallback that srcu_torture_stats_print() selects
 * for out-of-range state values.
 */
static const char * const srcu_size_state_name[] = {
	"SRCU_SIZE_SMALL",
	"SRCU_SIZE_ALLOC",
	"SRCU_SIZE_WAIT_BARRIER",
	"SRCU_SIZE_WAIT_CALL",
	"SRCU_SIZE_WAIT_CBS1",
	"SRCU_SIZE_WAIT_CBS2",
	"SRCU_SIZE_WAIT_CBS3",
	"SRCU_SIZE_WAIT_CBS4",
	"SRCU_SIZE_BIG",
	"SRCU_SIZE_???",
};
1852
1853	void srcu_torture_stats_print(struct srcu_struct ssp, char* tt, char* *tf)
1854	{
1855	int cpu;
1856	int idx;
1857	unsigned long s0 = `0`, s1 = `0`;
1858	int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state);
1859	int ss_state_idx = ss_state;
1860
1861	idx = ssp->srcu_idx & `0x1`;
1862	if (ss_state < `0` \|\| ss_state >= ARRAY_SIZE(srcu_size_state_name))
1863	ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - `1`;
1864	pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
1865	tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state,
1866	srcu_size_state_name[ss_state_idx]);
1867	if (!ssp->sda) {
1868	// Called after cleanup_srcu_struct(), perhaps.
1869	pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n");
1870	} else {
1871	pr_cont(" per-CPU(idx=%d):", idx);
1872	for_each_possible_cpu(cpu) {
1873	unsigned long l0, l1;
1874	unsigned long u0, u1;
1875	long c0, c1;
1876	struct srcu_data *sdp;
1877
1878	sdp = per_cpu_ptr(ssp->sda, cpu);
1879	u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx]));
1880	u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx]));
1881
1882	/*
1883	* Make sure that a lock is always counted if the corresponding
1884	* unlock is counted.
1885	*/
1886	smp_rmb();
1887
1888	l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx]));
1889	l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx]));
1890
1891	c0 = l0 - u0;
1892	c1 = l1 - u1;
1893	pr_cont(" %d(%ld,%ld %c)",
1894	cpu, c0, c1,
1895	"C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
1896	s0 += c0;
1897	s1 += c1;
1898	}
1899	pr_cont(" T(%ld,%ld)\n", s0, s1);
1900	}
1901	if (SRCU_SIZING_IS_TORTURE())
1902	srcu_transition_to_big(ssp);
1903	}
1904	EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
1905
1906	static int __init srcu_bootup_announce(void)
1907	{
1908	pr_info("Hierarchical SRCU implementation.\n");
1909	if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
1910	pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
1911	if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
1912	pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
1913	if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
1914	pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
1915	pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
1916	return `0`;
1917	}
1918	early_initcall(srcu_bootup_announce);
1919
1920	void __init srcu_init(void)
1921	{
1922	struct srcu_usage *sup;
1923
1924	/ Decide on srcu_struct-size strategy. /
1925	if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) {
1926	if (nr_cpu_ids >= big_cpu_lim) {
1927	convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention.
1928	pr_info("%s: Setting srcu_struct sizes to big.\n", __func__);
1929	} else {
1930	convert_to_big = SRCU_SIZING_NONE \| SRCU_SIZING_CONTEND;
1931	pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__);
1932	}
1933	}
1934
1935	/*
1936	* Once that is set, call_srcu() can follow the normal path and
1937	* queue delayed work. This must follow RCU workqueues creation
1938	* and timers initialization.
1939	*/
1940	srcu_init_done = true;
1941	while (!list_empty(head: &srcu_boot_list)) {
1942	sup = list_first_entry(&srcu_boot_list, struct srcu_usage,
1943	work.work.entry);
1944	list_del_init(entry: &sup->work.work.entry);
1945	if (SRCU_SIZING_IS(SRCU_SIZING_INIT) &&
1946	sup->srcu_size_state == SRCU_SIZE_SMALL)
1947	sup->srcu_size_state = SRCU_SIZE_ALLOC;
1948	queue_work(wq: rcu_gp_wq, work: &sup->work.work);
1949	}
1950	}
1951
1952	#ifdef CONFIG_MODULES
1953
1954	/ Initialize any global-scope srcu_struct structures used by this module. /
1955	static int srcu_module_coming(struct module *mod)
1956	{
1957	int i;
1958	struct srcu_struct *ssp;
1959	struct srcu_struct **sspp = mod->srcu_struct_ptrs;
1960
1961	for (i = `0`; i < mod->num_srcu_structs; i++) {
1962	ssp = *(sspp++);
1963	ssp->sda = alloc_percpu(struct srcu_data);
1964	if (WARN_ON_ONCE(!ssp->sda))
1965	return -ENOMEM;
1966	}
1967	return `0`;
1968	}
1969
1970	/ Clean up any global-scope srcu_struct structures used by this module. /
1971	static void srcu_module_going(struct module *mod)
1972	{
1973	int i;
1974	struct srcu_struct *ssp;
1975	struct srcu_struct **sspp = mod->srcu_struct_ptrs;
1976
1977	for (i = `0`; i < mod->num_srcu_structs; i++) {
1978	ssp = *(sspp++);
1979	if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) &&
1980	!WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static))
1981	cleanup_srcu_struct(ssp);
1982	if (!WARN_ON(srcu_readers_active(ssp)))
1983	free_percpu(pdata: ssp->sda);
1984	}
1985	}
1986
1987	/ Handle one module, either coming or going. /
1988	static int srcu_module_notify(struct notifier_block *self,
1989	unsigned long val, void *data)
1990	{
1991	struct module *mod = data;
1992	int ret = `0`;
1993
1994	switch (val) {
1995	case MODULE_STATE_COMING:
1996	ret = srcu_module_coming(mod);
1997	break;
1998	case MODULE_STATE_GOING:
1999	srcu_module_going(mod);
2000	break;
2001	default:
2002	break;
2003	}
2004	return ret;
2005	}
2006
2007	static struct notifier_block srcu_module_nb = {
2008	.notifier_call = srcu_module_notify,
2009	.priority = `0`,
2010	};
2011
2012	static __init int init_srcu_module_notifier(void)
2013	{
2014	int ret;
2015
2016	ret = register_module_notifier(nb: &srcu_module_nb);
2017	if (ret)
2018	pr_warn("Failed to register srcu module notifier\n");
2019	return ret;
2020	}
2021	late_initcall(init_srcu_module_notifier);
2022
2023	#endif /* #ifdef CONFIG_MODULES */
2024

source code of linux/kernel/rcu/srcutree.c