// SPDX-License-Identifier: GPL-2.0
/*
 * Fast batching percpu counters.
 */

#include <linux/percpu_counter.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/debugobjects.h>

#ifdef CONFIG_HOTPLUG_CPU
static LIST_HEAD(percpu_counters);
static DEFINE_SPINLOCK(percpu_counters_lock);
#endif

#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER

static const struct debug_obj_descr percpu_counter_debug_descr;

static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
{
        struct percpu_counter *fbc = addr;

        switch (state) {
        case ODEBUG_STATE_ACTIVE:
                percpu_counter_destroy(fbc);
                debug_object_free(fbc, &percpu_counter_debug_descr);
                return true;
        default:
                return false;
        }
}

static const struct debug_obj_descr percpu_counter_debug_descr = {
        .name = "percpu_counter",
        .fixup_free = percpu_counter_fixup_free,
};

static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{
        debug_object_init(fbc, &percpu_counter_debug_descr);
        debug_object_activate(fbc, &percpu_counter_debug_descr);
}

static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{
        debug_object_deactivate(fbc, &percpu_counter_debug_descr);
        debug_object_free(fbc, &percpu_counter_debug_descr);
}

#else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{ }
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{ }
#endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */

void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
        int cpu;
        unsigned long flags;

        raw_spin_lock_irqsave(&fbc->lock, flags);
        for_each_possible_cpu(cpu) {
                s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
                *pcount = 0;
        }
        fbc->count = amount;
        raw_spin_unlock_irqrestore(&fbc->lock, flags);
}
EXPORT_SYMBOL(percpu_counter_set);

/*
 * local_irq_save() is needed to make the function irq safe:
 * - The slow path would be ok as protected by an irq-safe spinlock.
 * - this_cpu_add would be ok as it is irq-safe by definition.
 * But:
 * The decision between slow path and fast path and the actual update must
 * be atomic, too.  Otherwise a call in process context could check the
 * current values and decide that the fast path can be used.  If an
 * interrupt then occurs before the this_cpu_add(), and the interrupt
 * updates this_cpu(*fbc->counters), the this_cpu_add() executed after the
 * interrupt has completed can push the per-cpu value past "batch" or even
 * overflow it.
 */
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
        s64 count;
        unsigned long flags;

        local_irq_save(flags);
        count = __this_cpu_read(*fbc->counters) + amount;
        if (abs(count) >= batch) {
                raw_spin_lock(&fbc->lock);
                fbc->count += count;
                __this_cpu_sub(*fbc->counters, count - amount);
                raw_spin_unlock(&fbc->lock);
        } else {
                this_cpu_add(*fbc->counters, amount);
        }
        local_irq_restore(flags);
}
EXPORT_SYMBOL(percpu_counter_add_batch);
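
/*
 * Usage sketch (illustrative only): a subsystem tracking, say, the number
 * of live objects could funnel every update through the batched helper,
 * assuming a hypothetical counter set up elsewhere with
 * percpu_counter_init():
 *
 *        static struct percpu_counter nr_objects;
 *
 *        void objects_account(s64 delta)
 *        {
 *                percpu_counter_add_batch(&nr_objects, delta, percpu_counter_batch);
 *        }
 *
 * fbc->lock is only taken once a CPU's local delta reaches +/-batch, so a
 * plain percpu_counter_read() may lag the true value by up to
 * batch * num_online_cpus().
 */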

/*
 * For a percpu_counter with a big batch, the deviation of its count can be
 * large.  When that deviation needs to be reduced, for example because the
 * counter's batch is decreased at runtime to get better accuracy, this sync
 * function can be run on each CPU to fold the local deltas back into the
 * global count.
 */
void percpu_counter_sync(struct percpu_counter *fbc)
{
        unsigned long flags;
        s64 count;

        raw_spin_lock_irqsave(&fbc->lock, flags);
        count = __this_cpu_read(*fbc->counters);
        fbc->count += count;
        __this_cpu_sub(*fbc->counters, count);
        raw_spin_unlock_irqrestore(&fbc->lock, flags);
}
EXPORT_SYMBOL(percpu_counter_sync);
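
/*
 * Sync sketch (illustrative): percpu_counter_sync() only folds the local
 * CPU's delta, so reducing the deviation across the whole machine means
 * running it on every CPU, e.g. via an IPI, assuming a hypothetical
 * counter "vm_usage" whose batch was just lowered:
 *
 *        static void vm_usage_sync_one(void *arg)
 *        {
 *                percpu_counter_sync(arg);
 *        }
 *
 *        on_each_cpu(vm_usage_sync_one, &vm_usage, 1);
 *
 * Afterwards all previously accumulated per-cpu deltas are reflected in
 * fbc->count, and further drift is bounded by the new, smaller batch.
 */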

/*
 * Add up all the per-cpu counts, return the result.  This is a more accurate
 * but much slower version of percpu_counter_read_positive().
 *
 * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums
 * from CPUs that are in the process of being taken offline.  Dying cpus have
 * been removed from the online mask, but may not have had the hotplug dead
 * notifier called to fold the percpu count back into the global counter sum.
 * By including dying CPUs in the iteration mask, we avoid this race condition
 * so __percpu_counter_sum() just does the right thing when CPUs are being
 * taken offline.
 */
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
        s64 ret;
        int cpu;
        unsigned long flags;

        raw_spin_lock_irqsave(&fbc->lock, flags);
        ret = fbc->count;
        for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
                s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
                ret += *pcount;
        }
        raw_spin_unlock_irqrestore(&fbc->lock, flags);
        return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);
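
/*
 * Reader-side sketch (illustrative, reusing the hypothetical nr_objects
 * counter from the example above): percpu_counter_read() is a single load
 * of fbc->count and may be off by up to batch * num_online_cpus(), whereas
 * percpu_counter_sum() walks every CPU's slot under fbc->lock:
 *
 *        s64 approx = percpu_counter_read(&nr_objects);
 *        s64 exact  = percpu_counter_sum(&nr_objects);
 *
 * Statistics and heuristics normally live with the cheap approximate read;
 * the precise but O(nr_cpus) sum is reserved for decisions that must not
 * be off, such as limit enforcement.
 */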

int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
                               gfp_t gfp, u32 nr_counters,
                               struct lock_class_key *key)
{
        unsigned long flags __maybe_unused;
        size_t counter_size;
        s32 __percpu *counters;
        u32 i;

        counter_size = ALIGN(sizeof(*counters), __alignof__(*counters));
        counters = __alloc_percpu_gfp(nr_counters * counter_size,
                                      __alignof__(*counters), gfp);
        if (!counters) {
                fbc[0].counters = NULL;
                return -ENOMEM;
        }

        for (i = 0; i < nr_counters; i++) {
                raw_spin_lock_init(&fbc[i].lock);
                lockdep_set_class(&fbc[i].lock, key);
#ifdef CONFIG_HOTPLUG_CPU
                INIT_LIST_HEAD(&fbc[i].list);
#endif
                fbc[i].count = amount;
                fbc[i].counters = (void *)counters + (i * counter_size);

                debug_percpu_counter_activate(&fbc[i]);
        }

#ifdef CONFIG_HOTPLUG_CPU
        spin_lock_irqsave(&percpu_counters_lock, flags);
        for (i = 0; i < nr_counters; i++)
                list_add(&fbc[i].list, &percpu_counters);
        spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif
        return 0;
}
EXPORT_SYMBOL(__percpu_counter_init_many);

void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters)
{
        unsigned long flags __maybe_unused;
        u32 i;

        if (WARN_ON_ONCE(!fbc))
                return;

        if (!fbc[0].counters)
                return;

        for (i = 0; i < nr_counters; i++)
                debug_percpu_counter_deactivate(&fbc[i]);

#ifdef CONFIG_HOTPLUG_CPU
        spin_lock_irqsave(&percpu_counters_lock, flags);
        for (i = 0; i < nr_counters; i++)
                list_del(&fbc[i].list);
        spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif

        free_percpu(fbc[0].counters);

        for (i = 0; i < nr_counters; i++)
                fbc[i].counters = NULL;
}
EXPORT_SYMBOL(percpu_counter_destroy_many);
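
/*
 * Allocation sketch (illustrative): the _many variants carve nr_counters
 * s32 slots out of a single percpu allocation, so a group of related
 * counters can be set up and torn down together through the
 * percpu_counter_init_many()/percpu_counter_destroy_many() wrappers,
 * assuming a hypothetical pair of statistics:
 *
 *        enum { ST_READS, ST_WRITES, ST_NR };
 *        static struct percpu_counter stats[ST_NR];
 *
 *        int stats_init(void)
 *        {
 *                return percpu_counter_init_many(stats, 0, GFP_KERNEL, ST_NR);
 *        }
 *
 *        void stats_exit(void)
 *        {
 *                percpu_counter_destroy_many(stats, ST_NR);
 *        }
 */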

int percpu_counter_batch __read_mostly = 32;
EXPORT_SYMBOL(percpu_counter_batch);
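
/*
 * Scale the default batch with the number of online CPUs: a hypothetical
 * 64-CPU machine ends up with max(32, 64 * 2) = 128, so a plain
 * percpu_counter_read() there may be off from the precise sum by up to
 * roughly batch * num_online_cpus() = 128 * 64.
 */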
static int compute_batch_value(unsigned int cpu)
{
        int nr = num_online_cpus();

        percpu_counter_batch = max(32, nr*2);
        return 0;
}

static int percpu_counter_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
        struct percpu_counter *fbc;

        compute_batch_value(cpu);

        spin_lock_irq(&percpu_counters_lock);
        list_for_each_entry(fbc, &percpu_counters, list) {
                s32 *pcount;

                raw_spin_lock(&fbc->lock);
                pcount = per_cpu_ptr(fbc->counters, cpu);
                fbc->count += *pcount;
                *pcount = 0;
                raw_spin_unlock(&fbc->lock);
        }
        spin_unlock_irq(&percpu_counters_lock);
#endif
        return 0;
}

/*
 * Compare counter against given value.
 * Return 1 if greater, 0 if equal and -1 if less
 */
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
        s64 count;

        count = percpu_counter_read(fbc);
        /* Check to see if rough count will be sufficient for comparison */
        if (abs(count - rhs) > (batch * num_online_cpus())) {
                if (count > rhs)
                        return 1;
                else
                        return -1;
        }
        /* Need to use precise count */
        count = percpu_counter_sum(fbc);
        if (count > rhs)
                return 1;
        else if (count < rhs)
                return -1;
        else
                return 0;
}
EXPORT_SYMBOL(__percpu_counter_compare);
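
/*
 * Comparison sketch (illustrative): callers normally go through the
 * percpu_counter_compare() wrapper, which supplies percpu_counter_batch,
 * e.g. to test a hypothetical object count against a ceiling:
 *
 *        if (percpu_counter_compare(&nr_objects, max_objects) >= 0)
 *                return -ENOSPC;
 *        percpu_counter_add(&nr_objects, 1);
 *
 * The expensive percpu_counter_sum() is only taken when the approximate
 * count lies within batch * num_online_cpus() of the comparison value.
 * Note that the check and the add above are not atomic with respect to
 * each other; __percpu_counter_limited_add() below closes that window.
 */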

/*
 * Compare counter, and add amount if total is: less than or equal to limit if
 * amount is positive, or greater than or equal to limit if amount is negative.
 * Return true if amount is added, or false if total would be beyond the limit.
 *
 * Negative limit is allowed, but unusual.
 * When negative amounts (subs) are given to percpu_counter_limited_add(),
 * the limit would most naturally be 0 - but other limits are also allowed.
 *
 * Overflow beyond S64_MAX is not allowed for: counter, limit and amount
 * are all assumed to be sane (far from S64_MIN and S64_MAX).
 */
bool __percpu_counter_limited_add(struct percpu_counter *fbc,
                                  s64 limit, s64 amount, s32 batch)
{
        s64 count;
        s64 unknown;
        unsigned long flags;
        bool good = false;

        if (amount == 0)
                return true;

        local_irq_save(flags);
        unknown = batch * num_online_cpus();
        count = __this_cpu_read(*fbc->counters);

        /* Skip taking the lock when safe */
        if (abs(count + amount) <= batch &&
            ((amount > 0 && fbc->count + unknown <= limit) ||
             (amount < 0 && fbc->count - unknown >= limit))) {
                this_cpu_add(*fbc->counters, amount);
                local_irq_restore(flags);
                return true;
        }

        raw_spin_lock(&fbc->lock);
        count = fbc->count + amount;

        /* Skip percpu_counter_sum() when safe */
        if (amount > 0) {
                if (count - unknown > limit)
                        goto out;
                if (count + unknown <= limit)
                        good = true;
        } else {
                if (count + unknown < limit)
                        goto out;
                if (count - unknown >= limit)
                        good = true;
        }

        if (!good) {
                s32 *pcount;
                int cpu;

                for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
                        pcount = per_cpu_ptr(fbc->counters, cpu);
                        count += *pcount;
                }
                if (amount > 0) {
                        if (count > limit)
                                goto out;
                } else {
                        if (count < limit)
                                goto out;
                }
                good = true;
        }

        count = __this_cpu_read(*fbc->counters);
        fbc->count += count + amount;
        __this_cpu_sub(*fbc->counters, count);
out:
        raw_spin_unlock(&fbc->lock);
        local_irq_restore(flags);
        return good;
}
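
/*
 * Limit sketch (illustrative): the percpu_counter_limited_add() wrapper
 * (which passes percpu_counter_batch) makes the compare and the add one
 * atomic step, e.g. when charging blocks against a hypothetical quota:
 *
 *        if (!percpu_counter_limited_add(&used_blocks, max_blocks, nr))
 *                return -ENOSPC;
 *
 * The corresponding uncharge passes a negative amount and, most naturally,
 * a lower limit of 0:
 *
 *        percpu_counter_limited_add(&used_blocks, 0, -nr);
 */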

static int __init percpu_counter_startup(void)
{
        int ret;

        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online",
                                compute_batch_value, NULL);
        WARN_ON(ret < 0);
        ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD,
                                        "lib/percpu_cnt:dead", NULL,
                                        percpu_counter_cpu_dead);
        WARN_ON(ret < 0);
        return 0;
}
module_init(percpu_counter_startup);