tlb.c source code [linux/arch/x86/mm/tlb.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	#include <linux/init.h>
3
4	#include <linux/mm.h>
5	#include <linux/spinlock.h>
6	#include <linux/smp.h>
7	#include <linux/interrupt.h>
8	#include <linux/export.h>
9	#include <linux/cpu.h>
10	#include <linux/debugfs.h>
11	#include <linux/sched/smt.h>
12	#include <linux/task_work.h>
13	#include <linux/mmu_notifier.h>
14	#include <linux/mmu_context.h>
15	#include <linux/kvm_types.h>
16
17	#include <asm/tlbflush.h>
18	#include <asm/mmu_context.h>
19	#include <asm/nospec-branch.h>
20	#include <asm/cache.h>
21	#include <asm/cacheflush.h>
22	#include <asm/apic.h>
23	#include <asm/msr.h>
24	#include <asm/perf_event.h>
25	#include <asm/tlb.h>
26
27	#include "mm_internal.h"
28
/*
 * STATIC_NOPV expands to nothing under CONFIG_PARAVIRT and to 'static'
 * otherwise. Without paravirt, the __flush_tlb_*() names are aliased
 * directly to their native_*() counterparts.
 */
#ifdef CONFIG_PARAVIRT
# define STATIC_NOPV
#else
# define STATIC_NOPV			static
# define __flush_tlb_local		native_flush_tlb_local
# define __flush_tlb_global		native_flush_tlb_global
# define __flush_tlb_one_user(addr)	native_flush_tlb_one_user(addr)
# define __flush_tlb_multi(msk, info)	native_flush_tlb_multi(msk, info)
#endif
38
/*
 *	TLB flushing, formerly SMP-only
 *		c/o Linus Torvalds.
 *
 *	These mean you can really definitely utterly forget about
 *	writing to user space from interrupts. (It's not allowed anyway).
 *
 *	Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *	More scalable flush, from Andi Kleen
 *
 *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */
52
/*
 * Bits to mangle the TIF_SPEC_* state into the mm pointer which is
 * stored in cpu_tlb_state.last_user_mm_spec.
 */
#define LAST_USER_MM_IBPB	0x1UL
#define LAST_USER_MM_L1D_FLUSH	0x2UL
#define LAST_USER_MM_SPEC_MASK	(LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH)

/* Bits to set when tlbstate and flush is (re)initialized */
#define LAST_USER_MM_INIT	LAST_USER_MM_IBPB
63
/*
 * The x86 feature is called PCID (Process Context IDentifier). It is similar
 * to what is traditionally called ASID on the RISC processors.
 *
 * We don't use the traditional ASID implementation, where each process/mm gets
 * its own ASID and flush/restart when we run out of ASID space.
 *
 * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
 * that came by on this CPU, allowing cheaper switch_mm between processes on
 * this CPU.
 *
 * We end up with different spaces for different things. To avoid confusion we
 * use different names for each of them:
 *
 * ASID  - [0, TLB_NR_DYN_ASIDS-1]
 *         the canonical identifier for an mm, dynamically allocated on each CPU
 *         [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
 *         the canonical, global identifier for an mm, identical across all CPUs
 *
 * kPCID - [1, MAX_ASID_AVAILABLE]
 *         the value we write into the PCID part of CR3; corresponds to the
 *         ASID+1, because PCID 0 is special.
 *
 * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
 *         for KPTI each mm has two address spaces and thus needs two
 *         PCID values, but we can still do with a single ASID denomination
 *         for each mm. Corresponds to kPCID + 2048.
 *
 */
93
/*
 * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
 * user/kernel switches
 */
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
# define PTI_CONSUMED_PCID_BITS	1
#else
# define PTI_CONSUMED_PCID_BITS	0
#endif

/* PCID bits left for ASIDs once the PTI bit (if any) has been taken out. */
#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)

/*
 * ASIDs are zero-based: 0->MAX_ASID_AVAILABLE are valid.  -1 below to account
 * for them being zero-based.  Another -1 is because PCID 0 is reserved for
 * use by non-PCID-aware users.
 */
#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
112
113	/*
114	* Given @asid, compute kPCID
115	*/
116	static inline u16 kern_pcid(u16 asid)
117	{
118	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
119
120	#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
121	/*
122	* Make sure that the dynamic ASID space does not conflict with the
123	* bit we are using to switch between user and kernel ASIDs.
124	*/
125	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (`1` << X86_CR3_PTI_PCID_USER_BIT));
126
127	/*
128	* The ASID being passed in here should have respected the
129	* MAX_ASID_AVAILABLE and thus never have the switch bit set.
130	*/
131	VM_WARN_ON_ONCE(asid & (`1` << X86_CR3_PTI_PCID_USER_BIT));
132	#endif
133	/*
134	* The dynamically-assigned ASIDs that get passed in are small
135	* (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
136	* so do not bother to clear it.
137	*
138	* If PCID is on, ASID-aware code paths put the ASID+1 into the
139	* PCID bits. This serves two purposes. It prevents a nasty
140	* situation in which PCID-unaware code saves CR3, loads some other
141	* value (with PCID == 0), and then restores CR3, thus corrupting
142	* the TLB for ASID 0 if the saved ASID was nonzero. It also means
143	* that any bugs involving loading a PCID-enabled CR3 with
144	* CR4.PCIDE off will trigger deterministically.
145	*/
146	return asid + `1`;
147	}
148
149	/*
150	* Given @asid, compute uPCID
151	*/
152	static inline u16 user_pcid(u16 asid)
153	{
154	u16 ret = kern_pcid(asid);
155	#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
156	ret \|= `1` << X86_CR3_PTI_PCID_USER_BIT;
157	#endif
158	return ret;
159	}
160
161	static inline unsigned long build_cr3(pgd_t pgd, u16 asid, unsigned* long lam)
162	{
163	unsigned long cr3 = __sme_pa(pgd) \| lam;
164
165	if (static_cpu_has(X86_FEATURE_PCID)) {
166	cr3 \|= kern_pcid(asid);
167	} else {
168	VM_WARN_ON_ONCE(asid != `0`);
169	}
170
171	return cr3;
172	}
173
174	static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
175	unsigned long lam)
176	{
177	/*
178	* Use boot_cpu_has() instead of this_cpu_has() as this function
179	* might be called during early boot. This should work even after
180	* boot because all CPU's the have same capabilities:
181	*/
182	VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
183	return build_cr3(pgd, asid, lam) \| CR3_NOFLUSH;
184	}
185
186	/*
187	* We get here when we do something requiring a TLB invalidation
188	* but could not go invalidate all of the contexts. We do the
189	* necessary invalidation by clearing out the 'ctx_id' which
190	* forces a TLB flush when the context is loaded.
191	*/
192	static void clear_asid_other(void)
193	{
194	u16 asid;
195
196	/*
197	* This is only expected to be set if we have disabled
198	* kernel _PAGE_GLOBAL pages.
199	*/
200	if (!static_cpu_has(X86_FEATURE_PTI)) {
201	WARN_ON_ONCE(`1`);
202	return;
203	}
204
205	for (asid = `0`; asid < TLB_NR_DYN_ASIDS; asid++) {
206	/ Do not need to flush the current asid /
207	if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
208	continue;
209	/*
210	* Make sure the next time we go to switch to
211	* this asid, we do a flush:
212	*/
213	this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, `0`);
214	}
215	this_cpu_write(cpu_tlbstate.invalidate_other, false);
216	}
217
218	atomic64_t last_mm_ctx_id = ATOMIC64_INIT(`1`);
219
/*
 * Result of choose_new_asid(): the ASID to load and whether the TLB
 * must be flushed when loading it.
 */
struct new_asid {
	unsigned int asid	: 16;
	unsigned int need_flush : 1;
};
224
225	static struct new_asid choose_new_asid(struct mm_struct *next, u64 next_tlb_gen)
226	{
227	struct new_asid ns;
228	u16 asid;
229
230	if (!static_cpu_has(X86_FEATURE_PCID)) {
231	ns.asid = `0`;
232	ns.need_flush = `1`;
233	return ns;
234	}
235
236	/*
237	* TLB consistency for global ASIDs is maintained with hardware assisted
238	* remote TLB flushing. Global ASIDs are always up to date.
239	*/
240	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
241	u16 global_asid = mm_global_asid(mm: next);
242
243	if (global_asid) {
244	ns.asid = global_asid;
245	ns.need_flush = `0`;
246	return ns;
247	}
248	}
249
250	if (this_cpu_read(cpu_tlbstate.invalidate_other))
251	clear_asid_other();
252
253	for (asid = `0`; asid < TLB_NR_DYN_ASIDS; asid++) {
254	if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
255	next->context.ctx_id)
256	continue;
257
258	ns.asid = asid;
259	ns.need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < next_tlb_gen);
260	return ns;
261	}
262
263	/*
264	* We don't currently own an ASID slot on this CPU.
265	* Allocate a slot.
266	*/
267	ns.asid = this_cpu_add_return(cpu_tlbstate.next_asid, `1`) - `1`;
268	if (ns.asid >= TLB_NR_DYN_ASIDS) {
269	ns.asid = `0`;
270	this_cpu_write(cpu_tlbstate.next_asid, `1`);
271	}
272	ns.need_flush = true;
273
274	return ns;
275	}
276
277	/*
278	* Global ASIDs are allocated for multi-threaded processes that are
279	* active on multiple CPUs simultaneously, giving each of those
280	* processes the same PCID on every CPU, for use with hardware-assisted
281	* TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR.
282	*
283	* These global ASIDs are held for the lifetime of the process.
284	*/
285	static DEFINE_RAW_SPINLOCK(global_asid_lock);
286	static u16 last_global_asid = MAX_ASID_AVAILABLE;
287	static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE);
288	static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE);
289	static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - `1`;
290
291	/*
292	* When the search for a free ASID in the global ASID space reaches
293	* MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously
294	* freed global ASIDs are safe to re-use.
295	*
296	* This way the global flush only needs to happen at ASID rollover
297	* time, and not at ASID allocation time.
298	*/
299	static void reset_global_asid_space(void)
300	{
301	lockdep_assert_held(&global_asid_lock);
302
303	invlpgb_flush_all_nonglobals();
304
305	/*
306	* The TLB flush above makes it safe to re-use the previously
307	* freed global ASIDs.
308	*/
309	bitmap_andnot(dst: global_asid_used, src1: global_asid_used,
310	src2: global_asid_freed, MAX_ASID_AVAILABLE);
311	bitmap_clear(map: global_asid_freed, start: `0`, MAX_ASID_AVAILABLE);
312
313	/ Restart the search from the start of global ASID space. /
314	last_global_asid = TLB_NR_DYN_ASIDS;
315	}
316
317	static u16 allocate_global_asid(void)
318	{
319	u16 asid;
320
321	lockdep_assert_held(&global_asid_lock);
322
323	/ The previous allocation hit the edge of available address space /
324	if (last_global_asid >= MAX_ASID_AVAILABLE - `1`)
325	reset_global_asid_space();
326
327	asid = find_next_zero_bit(addr: global_asid_used, MAX_ASID_AVAILABLE, offset: last_global_asid);
328
329	if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) {
330	/ This should never happen. /
331	VM_WARN_ONCE(`1`, "Unable to allocate global ASID despite %d available\n",
332	global_asid_available);
333	return `0`;
334	}
335
336	/ Claim this global ASID. /
337	__set_bit(asid, global_asid_used);
338	last_global_asid = asid;
339	global_asid_available--;
340	return asid;
341	}
342
343	/*
344	* Check whether a process is currently active on more than @threshold CPUs.
345	* This is a cheap estimation on whether or not it may make sense to assign
346	* a global ASID to this process, and use broadcast TLB invalidation.
347	*/
348	static bool mm_active_cpus_exceeds(struct mm_struct mm, int* threshold)
349	{
350	int count = `0`;
351	int cpu;
352
353	/ This quick check should eliminate most single threaded programs. /
354	if (cpumask_weight(srcp: mm_cpumask(mm)) <= threshold)
355	return false;
356
357	/ Slower check to make sure. /
358	for_each_cpu(cpu, mm_cpumask(mm)) {
359	/ Skip the CPUs that aren't really running this process. /
360	if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm)
361	continue;
362
363	if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
364	continue;
365
366	if (++count > threshold)
367	return true;
368	}
369	return false;
370	}
371
372	/*
373	* Assign a global ASID to the current process, protecting against
374	* races between multiple threads in the process.
375	*/
376	static void use_global_asid(struct mm_struct *mm)
377	{
378	u16 asid;
379
380	guard(raw_spinlock_irqsave)(l: &global_asid_lock);
381
382	/ This process is already using broadcast TLB invalidation. /
383	if (mm_global_asid(mm))
384	return;
385
386	/*
387	* The last global ASID was consumed while waiting for the lock.
388	*
389	* If this fires, a more aggressive ASID reuse scheme might be
390	* needed.
391	*/
392	if (!global_asid_available) {
393	VM_WARN_ONCE(`1`, "Ran out of global ASIDs\n");
394	return;
395	}
396
397	asid = allocate_global_asid();
398	if (!asid)
399	return;
400
401	mm_assign_global_asid(mm, asid);
402	}
403
404	void mm_free_global_asid(struct mm_struct *mm)
405	{
406	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
407	return;
408
409	if (!mm_global_asid(mm))
410	return;
411
412	guard(raw_spinlock_irqsave)(l: &global_asid_lock);
413
414	/ The global ASID can be re-used only after flush at wrap-around. /
415	#ifdef CONFIG_BROADCAST_TLB_FLUSH
416	__set_bit(mm->context.global_asid, global_asid_freed);
417
418	mm->context.global_asid = `0`;
419	global_asid_available++;
420	#endif
421	}
422
423	/*
424	* Is the mm transitioning from a CPU-local ASID to a global ASID?
425	*/
426	static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid)
427	{
428	u16 global_asid = mm_global_asid(mm);
429
430	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
431	return false;
432
433	/ Process is transitioning to a global ASID /
434	if (global_asid && asid != global_asid)
435	return true;
436
437	return false;
438	}
439
440	/*
441	* x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86
442	* systems have over 8k CPUs. Because of this potential ASID shortage,
443	* global ASIDs are handed out to processes that have frequent TLB
444	* flushes and are active on 4 or more CPUs simultaneously.
445	*/
446	static void consider_global_asid(struct mm_struct *mm)
447	{
448	if (!cpu_feature_enabled(X86_FEATURE_INVLPGB))
449	return;
450
451	/ Check every once in a while. /
452	if ((current->pid & `0x1f`) != (jiffies & `0x1f`))
453	return;
454
455	/*
456	* Assign a global ASID if the process is active on
457	* 4 or more CPUs simultaneously.
458	*/
459	if (mm_active_cpus_exceeds(mm, threshold: `3`))
460	use_global_asid(mm);
461	}
462
463	static void finish_asid_transition(struct flush_tlb_info *info)
464	{
465	struct mm_struct *mm = info->mm;
466	int bc_asid = mm_global_asid(mm);
467	int cpu;
468
469	if (!mm_in_asid_transition(mm))
470	return;
471
472	for_each_cpu(cpu, mm_cpumask(mm)) {
473	/*
474	* The remote CPU is context switching. Wait for that to
475	* finish, to catch the unlikely case of it switching to
476	* the target mm with an out of date ASID.
477	*/
478	while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING)
479	cpu_relax();
480
481	if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm)
482	continue;
483
484	/*
485	* If at least one CPU is not using the global ASID yet,
486	* send a TLB flush IPI. The IPI should cause stragglers
487	* to transition soon.
488	*
489	* This can race with the CPU switching to another task;
490	* that results in a (harmless) extra IPI.
491	*/
492	if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) {
493	flush_tlb_multi(cpumask: mm_cpumask(mm: info->mm), info);
494	return;
495	}
496	}
497
498	/ All the CPUs running this process are using the global ASID. /
499	mm_clear_asid_transition(mm);
500	}
501
502	static void broadcast_tlb_flush(struct flush_tlb_info *info)
503	{
504	bool pmd = info->stride_shift == PMD_SHIFT;
505	unsigned long asid = mm_global_asid(mm: info->mm);
506	unsigned long addr = info->start;
507
508	/*
509	* TLB flushes with INVLPGB are kicked off asynchronously.
510	* The inc_mm_tlb_gen() guarantees page table updates are done
511	* before these TLB flushes happen.
512	*/
513	if (info->end == TLB_FLUSH_ALL) {
514	invlpgb_flush_single_pcid_nosync(pcid: kern_pcid(asid));
515	/ Do any CPUs supporting INVLPGB need PTI? /
516	if (cpu_feature_enabled(X86_FEATURE_PTI))
517	invlpgb_flush_single_pcid_nosync(pcid: user_pcid(asid));
518	} else do {
519	unsigned long nr = `1`;
520
521	if (info->stride_shift <= PMD_SHIFT) {
522	nr = (info->end - addr) >> info->stride_shift;
523	nr = clamp_val(nr, `1`, invlpgb_count_max);
524	}
525
526	invlpgb_flush_user_nr_nosync(pcid: kern_pcid(asid), addr, nr, stride: pmd);
527	if (cpu_feature_enabled(X86_FEATURE_PTI))
528	invlpgb_flush_user_nr_nosync(pcid: user_pcid(asid), addr, nr, stride: pmd);
529
530	addr += nr << info->stride_shift;
531	} while (addr < info->end);
532
533	finish_asid_transition(info);
534
535	/ Wait for the INVLPGBs kicked off above to finish. /
536	__tlbsync();
537	}
538
539	/*
540	* Given an ASID, flush the corresponding user ASID. We can delay this
541	* until the next time we switch to it.
542	*
543	* See SWITCH_TO_USER_CR3.
544	*/
545	static inline void invalidate_user_asid(u16 asid)
546	{
547	/ There is no user ASID if address space separation is off /
548	if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
549	return;
550
551	/*
552	* We only have a single ASID if PCID is off and the CR3
553	* write will have flushed it.
554	*/
555	if (!cpu_feature_enabled(X86_FEATURE_PCID))
556	return;
557
558	if (!static_cpu_has(X86_FEATURE_PTI))
559	return;
560
561	__set_bit(kern_pcid(asid),
562	(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
563	}
564
565	static void load_new_mm_cr3(pgd_t pgdir, u16 new_asid, unsigned* long lam,
566	bool need_flush)
567	{
568	unsigned long new_mm_cr3;
569
570	if (need_flush) {
571	invalidate_user_asid(asid: new_asid);
572	new_mm_cr3 = build_cr3(pgd: pgdir, asid: new_asid, lam);
573	} else {
574	new_mm_cr3 = build_cr3_noflush(pgd: pgdir, asid: new_asid, lam);
575	}
576
577	/*
578	* Caution: many callers of this function expect
579	* that load_cr3() is serializing and orders TLB
580	* fills with respect to the mm_cpumask writes.
581	*/
582	write_cr3(x: new_mm_cr3);
583	}
584
585	void leave_mm(void)
586	{
587	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
588
589	/*
590	* It's plausible that we're in lazy TLB mode while our mm is init_mm.
591	* If so, our callers still expect us to flush the TLB, but there
592	* aren't any user TLB entries in init_mm to worry about.
593	*
594	* This needs to happen before any other sanity checks due to
595	* intel_idle's shenanigans.
596	*/
597	if (loaded_mm == &init_mm)
598	return;
599
600	/ Warn if we're not lazy. /
601	WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy));
602
603	switch_mm(NULL, next: &init_mm, NULL);
604	}
605	EXPORT_SYMBOL_GPL(leave_mm);
606
607	void switch_mm(struct mm_struct prev, struct* mm_struct *next,
608	struct task_struct *tsk)
609	{
610	unsigned long flags;
611
612	local_irq_save(flags);
613	switch_mm_irqs_off(NULL, next, tsk);
614	local_irq_restore(flags);
615	}
616
/*
 * Invoked from return to user/guest by a task that opted-in to L1D
 * flushing but ended up running on an SMT enabled core due to wrong
 * affinity settings or CPU hotplug. This is part of the paranoid L1D flush
 * contract which this task requested.
 *
 * @ch is unused; the function only sends SIGBUS via force_sig().
 */
static void l1d_flush_force_sigbus(struct callback_head *ch)
{
	force_sig(SIGBUS);
}
627
628	static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm,
629	struct task_struct *next)
630	{
631	/ Flush L1D if the outgoing task requests it /
632	if (prev_mm & LAST_USER_MM_L1D_FLUSH)
633	wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
634
635	/ Check whether the incoming task opted in for L1D flush /
636	if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH)))
637	return;
638
639	/*
640	* Validate that it is not running on an SMT sibling as this would
641	* make the exercise pointless because the siblings share L1D. If
642	* it runs on a SMT sibling, notify it with SIGBUS on return to
643	* user/guest
644	*/
645	if (this_cpu_read(cpu_info.smt_active)) {
646	clear_ti_thread_flag(ti: &next->thread_info, TIF_SPEC_L1D_FLUSH);
647	next->l1d_flush_kill.func = l1d_flush_force_sigbus;
648	task_work_add(task: next, twork: &next->l1d_flush_kill, mode: TWA_RESUME);
649	}
650	}
651
652	static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next)
653	{
654	unsigned long next_tif = read_task_thread_flags(next);
655	unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;
656
657	/*
658	* Ensure that the bit shift above works as expected and the two flags
659	* end up in bit 0 and 1.
660	*/
661	BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + `1`);
662
663	return (unsigned long)next->mm \| spec_bits;
664	}
665
666	static void cond_mitigation(struct task_struct *next)
667	{
668	unsigned long prev_mm, next_mm;
669
670	if (!next \|\| !next->mm)
671	return;
672
673	next_mm = mm_mangle_tif_spec_bits(next);
674	prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec);
675
676	/*
677	* Avoid user->user BTB/RSB poisoning by flushing them when switching
678	* between processes. This stops one process from doing Spectre-v2
679	* attacks on another.
680	*
681	* Both, the conditional and the always IBPB mode use the mm
682	* pointer to avoid the IBPB when switching between tasks of the
683	* same process. Using the mm pointer instead of mm->context.ctx_id
684	* opens a hypothetical hole vs. mm_struct reuse, which is more or
685	* less impossible to control by an attacker. Aside of that it
686	* would only affect the first schedule so the theoretically
687	* exposed data is not really interesting.
688	*/
689	if (static_branch_likely(&switch_mm_cond_ibpb)) {
690	/*
691	* This is a bit more complex than the always mode because
692	* it has to handle two cases:
693	*
694	* 1) Switch from a user space task (potential attacker)
695	* which has TIF_SPEC_IB set to a user space task
696	* (potential victim) which has TIF_SPEC_IB not set.
697	*
698	* 2) Switch from a user space task (potential attacker)
699	* which has TIF_SPEC_IB not set to a user space task
700	* (potential victim) which has TIF_SPEC_IB set.
701	*
702	* This could be done by unconditionally issuing IBPB when
703	* a task which has TIF_SPEC_IB set is either scheduled in
704	* or out. Though that results in two flushes when:
705	*
706	* - the same user space task is scheduled out and later
707	* scheduled in again and only a kernel thread ran in
708	* between.
709	*
710	* - a user space task belonging to the same process is
711	* scheduled in after a kernel thread ran in between
712	*
713	* - a user space task belonging to the same process is
714	* scheduled in immediately.
715	*
716	* Optimize this with reasonably small overhead for the
717	* above cases. Mangle the TIF_SPEC_IB bit into the mm
718	* pointer of the incoming task which is stored in
719	* cpu_tlbstate.last_user_mm_spec for comparison.
720	*
721	* Issue IBPB only if the mm's are different and one or
722	* both have the IBPB bit set.
723	*/
724	if (next_mm != prev_mm &&
725	(next_mm \| prev_mm) & LAST_USER_MM_IBPB)
726	indirect_branch_prediction_barrier();
727	}
728
729	if (static_branch_unlikely(&switch_mm_always_ibpb)) {
730	/*
731	* Only flush when switching to a user space task with a
732	* different context than the user space task which ran
733	* last on this CPU.
734	*/
735	if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != (unsigned long)next->mm)
736	indirect_branch_prediction_barrier();
737	}
738
739	if (static_branch_unlikely(&switch_mm_cond_l1d_flush)) {
740	/*
741	* Flush L1D when the outgoing task requested it and/or
742	* check whether the incoming task requested L1D flushing
743	* and ended up on an SMT sibling.
744	*/
745	if (unlikely((prev_mm \| next_mm) & LAST_USER_MM_L1D_FLUSH))
746	l1d_flush_evaluate(prev_mm, next_mm, next);
747	}
748
749	this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm);
750	}
751
#ifdef CONFIG_PERF_EVENTS
/*
 * Set or clear CR4.PCE for @mm.
 *
 * The bit is set when RDPMC is always available, or when it is not
 * globally disabled and @mm has a non-zero perf_rdpmc_allowed count.
 * Otherwise it is cleared.
 */
static inline void cr4_update_pce_mm(struct mm_struct *mm)
{
	if (static_branch_unlikely(&rdpmc_always_available_key) ||
	    (!static_branch_unlikely(&rdpmc_never_available_key) &&
	     atomic_read(&mm->context.perf_rdpmc_allowed))) {
		/*
		 * Clear the existing dirty counters to
		 * prevent the leak for an RDPMC task.
		 */
		perf_clear_dirty_counters();
		cr4_set_bits_irqsoff(X86_CR4_PCE);
	} else
		cr4_clear_bits_irqsoff(X86_CR4_PCE);
}

/* Re-evaluate CR4.PCE for the mm currently loaded on this CPU. */
void cr4_update_pce(void *ignored)
{
	cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
}

#else
/* Without perf events there is no CR4.PCE state to maintain. */
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
#endif
776
777	/*
778	* This optimizes when not actually switching mm's. Some architectures use the
779	* 'unused' argument for this optimization, but x86 must use
780	* 'cpu_tlbstate.loaded_mm' instead because it does not always keep
781	* 'current->active_mm' up to date.
782	*/
783	void switch_mm_irqs_off(struct mm_struct unused, struct* mm_struct *next,
784	struct task_struct *tsk)
785	{
786	struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
787	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
788	bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
789	unsigned cpu = smp_processor_id();
790	unsigned long new_lam;
791	struct new_asid ns;
792	u64 next_tlb_gen;
793
794
795	/ We don't want flush_tlb_func() to run concurrently with us. /
796	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
797	WARN_ON_ONCE(!irqs_disabled());
798
799	/*
800	* Verify that CR3 is what we think it is. This will catch
801	* hypothetical buggy code that directly switches to swapper_pg_dir
802	* without going through leave_mm() / switch_mm_irqs_off() or that
803	* does something like write_cr3(read_cr3_pa()).
804	*
805	* Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
806	* isn't free.
807	*/
808	#ifdef CONFIG_DEBUG_VM
809	if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid,
810	tlbstate_lam_cr3_mask()))) {
811	/*
812	* If we were to BUG here, we'd be very likely to kill
813	* the system so hard that we don't see the call trace.
814	* Try to recover instead by ignoring the error and doing
815	* a global flush to minimize the chance of corruption.
816	*
817	* (This is far from being a fully correct recovery.
818	* Architecturally, the CPU could prefetch something
819	* back into an incorrect ASID slot and leave it there
820	* to cause trouble down the road. It's better than
821	* nothing, though.)
822	*/
823	__flush_tlb_all();
824	}
825	#endif
826	if (was_lazy)
827	this_cpu_write(cpu_tlbstate_shared.is_lazy, false);
828
829	/*
830	* The membarrier system call requires a full memory barrier and
831	* core serialization before returning to user-space, after
832	* storing to rq->curr, when changing mm. This is because
833	* membarrier() sends IPIs to all CPUs that are in the target mm
834	* to make them issue memory barriers. However, if another CPU
835	* switches to/from the target mm concurrently with
836	* membarrier(), it can cause that CPU not to receive an IPI
837	* when it really should issue a memory barrier. Writing to CR3
838	* provides that full memory barrier and core serializing
839	* instruction.
840	*/
841	if (prev == next) {
842	/ Not actually switching mm's /
843	VM_WARN_ON(is_dyn_asid(prev_asid) &&
844	this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
845	next->context.ctx_id);
846
847	/*
848	* If this races with another thread that enables lam, 'new_lam'
849	* might not match tlbstate_lam_cr3_mask().
850	*/
851
852	/*
853	* Even in lazy TLB mode, the CPU should stay set in the
854	* mm_cpumask. The TLB shootdown code can figure out from
855	* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
856	*/
857	if (IS_ENABLED(CONFIG_DEBUG_VM) &&
858	WARN_ON_ONCE(prev != &init_mm && !is_notrack_mm(prev) &&
859	!cpumask_test_cpu(cpu, mm_cpumask(next))))
860	cpumask_set_cpu(cpu, dstp: mm_cpumask(mm: next));
861
862	/ Check if the current mm is transitioning to a global ASID /
863	if (mm_needs_global_asid(mm: next, asid: prev_asid)) {
864	next_tlb_gen = atomic64_read(v: &next->context.tlb_gen);
865	ns = choose_new_asid(next, next_tlb_gen);
866	goto reload_tlb;
867	}
868
869	/*
870	* Broadcast TLB invalidation keeps this ASID up to date
871	* all the time.
872	*/
873	if (is_global_asid(asid: prev_asid))
874	return;
875
876	/*
877	* If the CPU is not in lazy TLB mode, we are just switching
878	* from one thread in a process to another thread in the same
879	* process. No TLB flush required.
880	*/
881	if (!was_lazy)
882	return;
883
884	/*
885	* Read the tlb_gen to check whether a flush is needed.
886	* If the TLB is up to date, just use it.
887	* The barrier synchronizes with the tlb_gen increment in
888	* the TLB shootdown code.
889	*/
890	smp_mb();
891	next_tlb_gen = atomic64_read(v: &next->context.tlb_gen);
892	if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
893	next_tlb_gen)
894	return;
895
896	/*
897	* TLB contents went out of date while we were in lazy
898	* mode. Fall through to the TLB switching code below.
899	*/
900	ns.asid = prev_asid;
901	ns.need_flush = true;
902	} else {
903	/*
904	* Apply process to process speculation vulnerability
905	* mitigations if applicable.
906	*/
907	cond_mitigation(next: tsk);
908
909	/*
910	* Indicate that CR3 is about to change. nmi_uaccess_okay()
911	* and others are sensitive to the window where mm_cpumask(),
912	* CR3 and cpu_tlbstate.loaded_mm are not all in sync.
913	*/
914	this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
915
916	/*
917	* Make sure this CPU is set in mm_cpumask() such that we'll
918	* receive invalidation IPIs.
919	*
920	* Rely on the smp_mb() implied by cpumask_set_cpu()'s atomic
921	* operation, or explicitly provide one. Such that:
922	*
923	* switch_mm_irqs_off() flush_tlb_mm_range()
924	* smp_store_release(loaded_mm, SWITCHING); atomic64_inc_return(tlb_gen)
925	* smp_mb(); // here // smp_mb() implied
926	* atomic64_read(tlb_gen); this_cpu_read(loaded_mm);
927	*
928	* we properly order against flush_tlb_mm_range(), where the
929	* loaded_mm load can happen in mative_flush_tlb_multi() ->
930	* should_flush_tlb().
931	*
932	* This way switch_mm() must see the new tlb_gen or
933	* flush_tlb_mm_range() must see the new loaded_mm, or both.
934	*/
935	if (next != &init_mm && !cpumask_test_cpu(cpu, cpumask: mm_cpumask(mm: next)))
936	cpumask_set_cpu(cpu, dstp: mm_cpumask(mm: next));
937	else
938	smp_mb();
939
940	next_tlb_gen = atomic64_read(v: &next->context.tlb_gen);
941
942	ns = choose_new_asid(next, next_tlb_gen);
943	}
944
945	reload_tlb:
946	new_lam = mm_lam_cr3_mask(mm: next);
947	if (ns.need_flush) {
948	VM_WARN_ON_ONCE(is_global_asid(ns.asid));
949	this_cpu_write(cpu_tlbstate.ctxs[ns.asid].ctx_id, next->context.ctx_id);
950	this_cpu_write(cpu_tlbstate.ctxs[ns.asid].tlb_gen, next_tlb_gen);
951	load_new_mm_cr3(pgdir: next->pgd, new_asid: ns.asid, lam: new_lam, need_flush: true);
952
953	trace_tlb_flush(reason: TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
954	} else {
955	/ The new ASID is already up to date. /
956	load_new_mm_cr3(pgdir: next->pgd, new_asid: ns.asid, lam: new_lam, need_flush: false);
957
958	trace_tlb_flush(reason: TLB_FLUSH_ON_TASK_SWITCH, pages: `0`);
959	}
960
961	/ Make sure we write CR3 before loaded_mm. /
962	barrier();
963
964	this_cpu_write(cpu_tlbstate.loaded_mm, next);
965	this_cpu_write(cpu_tlbstate.loaded_mm_asid, ns.asid);
966	cpu_tlbstate_update_lam(lam: new_lam, mm_untag_mask(mm: next));
967
968	if (next != prev) {
969	cr4_update_pce_mm(mm: next);
970	switch_ldt(prev, next);
971	}
972	}
973
974	/*
975	* Please ignore the name of this function. It should be called
976	* switch_to_kernel_thread().
977	*
978	* enter_lazy_tlb() is a hint from the scheduler that we are entering a
979	* kernel thread or other context without an mm. Acceptable implementations
980	* include doing nothing whatsoever, switching to init_mm, or various clever
981	* lazy tricks to try to minimize TLB flushes.
982	*
983	* The scheduler reserves the right to call enter_lazy_tlb() several times
984	* in a row. It will notify us that we're going back to a real mm by
985	* calling switch_mm_irqs_off().
986	*/
987	void enter_lazy_tlb(struct mm_struct mm, struct* task_struct *tsk)
988	{
989	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
990	return;
991
992	this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
993	}
994
995	/*
996	* Using a temporary mm allows to set temporary mappings that are not accessible
997	* by other CPUs. Such mappings are needed to perform sensitive memory writes
998	* that override the kernel memory protections (e.g., W^X), without exposing the
999	* temporary page-table mappings that are required for these write operations to
1000	* other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
1001	* mapping is torn down. Temporary mms can also be used for EFI runtime service
1002	* calls or similar functionality.
1003	*
1004	* It is illegal to schedule while using a temporary mm -- the context switch
1005	* code is unaware of the temporary mm and does not know how to context switch.
1006	* Use a real (non-temporary) mm in a kernel thread if you need to sleep.
1007	*
1008	* Note: For sensitive memory writes, the temporary mm needs to be used
1009	* exclusively by a single core, and IRQs should be disabled while the
1010	* temporary mm is loaded, thereby preventing interrupt handler bugs from
1011	* overriding the kernel memory protection.
1012	*/
1013	struct mm_struct use_temporary_mm(struct* mm_struct *temp_mm)
1014	{
1015	struct mm_struct *prev_mm;
1016
1017	lockdep_assert_preemption_disabled();
1018	guard(irqsave)();
1019
1020	/*
1021	* Make sure not to be in TLB lazy mode, as otherwise we'll end up
1022	* with a stale address space WITHOUT being in lazy mode after
1023	* restoring the previous mm.
1024	*/
1025	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
1026	leave_mm();
1027
1028	prev_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1029	switch_mm_irqs_off(NULL, next: temp_mm, current);
1030
1031	/*
1032	* If breakpoints are enabled, disable them while the temporary mm is
1033	* used. Userspace might set up watchpoints on addresses that are used
1034	* in the temporary mm, which would lead to wrong signals being sent or
1035	* crashes.
1036	*
1037	* Note that breakpoints are not disabled selectively, which also causes
1038	* kernel breakpoints (e.g., perf's) to be disabled. This might be
1039	* undesirable, but still seems reasonable as the code that runs in the
1040	* temporary mm should be short.
1041	*/
1042	if (hw_breakpoint_active())
1043	hw_breakpoint_disable();
1044
1045	return prev_mm;
1046	}
1047
1048	void unuse_temporary_mm(struct mm_struct *prev_mm)
1049	{
1050	lockdep_assert_preemption_disabled();
1051	guard(irqsave)();
1052
1053	/ Clear the cpumask, to indicate no TLB flushing is needed anywhere /
1054	cpumask_clear_cpu(smp_processor_id(), dstp: mm_cpumask(this_cpu_read(cpu_tlbstate.loaded_mm)));
1055
1056	switch_mm_irqs_off(NULL, next: prev_mm, current);
1057
1058	/*
1059	* Restore the breakpoints if they were disabled before the temporary mm
1060	* was loaded.
1061	*/
1062	if (hw_breakpoint_active())
1063	hw_breakpoint_restore();
1064	}
1065
1066	/*
1067	* Call this when reinitializing a CPU. It fixes the following potential
1068	* problems:
1069	*
1070	* - The ASID changed from what cpu_tlbstate thinks it is (most likely
1071	* because the CPU was taken down and came back up with CR3's PCID
1072	* bits clear. CPU hotplug can do this.
1073	*
1074	* - The TLB contains junk in slots corresponding to inactive ASIDs.
1075	*
1076	* - The CPU went so far out to lunch that it may have missed a TLB
1077	* flush.
1078	*/
1079	void initialize_tlbstate_and_flush(void)
1080	{
1081	int i;
1082	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1083	u64 tlb_gen = atomic64_read(v: &init_mm.context.tlb_gen);
1084	unsigned long lam = mm_lam_cr3_mask(mm);
1085	unsigned long cr3 = __read_cr3();
1086
1087	/ Assert that CR3 already references the right mm. /
1088	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
1089
1090	/ LAM expected to be disabled /
1091	WARN_ON(cr3 & (X86_CR3_LAM_U48 \| X86_CR3_LAM_U57));
1092	WARN_ON(lam);
1093
1094	/*
1095	* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
1096	* doesn't work like other CR4 bits because it can only be set from
1097	* long mode.)
1098	*/
1099	WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
1100	!(cr4_read_shadow() & X86_CR4_PCIDE));
1101
1102	/ Disable LAM, force ASID 0 and force a TLB flush. /
1103	write_cr3(x: build_cr3(pgd: mm->pgd, asid: `0`, lam: `0`));
1104
1105	/ Reinitialize tlbstate. /
1106	this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
1107	this_cpu_write(cpu_tlbstate.loaded_mm_asid, `0`);
1108	this_cpu_write(cpu_tlbstate.next_asid, `1`);
1109	this_cpu_write(cpu_tlbstate.ctxs[`0`].ctx_id, mm->context.ctx_id);
1110	this_cpu_write(cpu_tlbstate.ctxs[`0`].tlb_gen, tlb_gen);
1111	cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
1112
1113	for (i = `1`; i < TLB_NR_DYN_ASIDS; i++)
1114	this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, `0`);
1115	}
1116
1117	/*
1118	* flush_tlb_func()'s memory ordering requirement is that any
1119	* TLB fills that happen after we flush the TLB are ordered after we
1120	* read active_mm's tlb_gen. We don't need any explicit barriers
1121	* because all x86 flush operations are serializing and the
1122	* atomic64_read operation won't be reordered by the compiler.
1123	*/
1124	static void flush_tlb_func(void *info)
1125	{
1126	/*
1127	* We have three different tlb_gen values in here. They are:
1128	*
1129	* - mm_tlb_gen: the latest generation.
1130	* - local_tlb_gen: the generation that this CPU has already caught
1131	* up to.
1132	* - f->new_tlb_gen: the generation that the requester of the flush
1133	* wants us to catch up to.
1134	*/
1135	const struct flush_tlb_info *f = info;
1136	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1137	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
1138	u64 local_tlb_gen;
1139	bool local = smp_processor_id() == f->initiating_cpu;
1140	unsigned long nr_invalidate = `0`;
1141	u64 mm_tlb_gen;
1142
1143	/ This code cannot presently handle being reentered. /
1144	VM_WARN_ON(!irqs_disabled());
1145
1146	if (!local) {
1147	inc_irq_stat(irq_tlb_count);
1148	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
1149	}
1150
1151	/ The CPU was left in the mm_cpumask of the target mm. Clear it. /
1152	if (f->mm && f->mm != loaded_mm) {
1153	cpumask_clear_cpu(raw_smp_processor_id(), dstp: mm_cpumask(mm: f->mm));
1154	trace_tlb_flush(reason: TLB_REMOTE_WRONG_CPU, pages: `0`);
1155	return;
1156	}
1157
1158	if (unlikely(loaded_mm == &init_mm))
1159	return;
1160
1161	/ Reload the ASID if transitioning into or out of a global ASID /
1162	if (mm_needs_global_asid(mm: loaded_mm, asid: loaded_mm_asid)) {
1163	switch_mm_irqs_off(NULL, next: loaded_mm, NULL);
1164	loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
1165	}
1166
1167	/ Broadcast ASIDs are always kept up to date with INVLPGB. /
1168	if (is_global_asid(asid: loaded_mm_asid))
1169	return;
1170
1171	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
1172	loaded_mm->context.ctx_id);
1173
1174	if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) {
1175	/*
1176	* We're in lazy mode. We need to at least flush our
1177	* paging-structure cache to avoid speculatively reading
1178	* garbage into our TLB. Since switching to init_mm is barely
1179	* slower than a minimal flush, just switch to init_mm.
1180	*
1181	* This should be rare, with native_flush_tlb_multi() skipping
1182	* IPIs to lazy TLB mode CPUs.
1183	*/
1184	switch_mm_irqs_off(NULL, next: &init_mm, NULL);
1185	return;
1186	}
1187
1188	local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
1189
1190	if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
1191	f->new_tlb_gen <= local_tlb_gen)) {
1192	/*
1193	* The TLB is already up to date in respect to f->new_tlb_gen.
1194	* While the core might be still behind mm_tlb_gen, checking
1195	* mm_tlb_gen unnecessarily would have negative caching effects
1196	* so avoid it.
1197	*/
1198	return;
1199	}
1200
1201	/*
1202	* Defer mm_tlb_gen reading as long as possible to avoid cache
1203	* contention.
1204	*/
1205	mm_tlb_gen = atomic64_read(v: &loaded_mm->context.tlb_gen);
1206
1207	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
1208	/*
1209	* There's nothing to do: we're already up to date. This can
1210	* happen if two concurrent flushes happen -- the first flush to
1211	* be handled can catch us all the way up, leaving no work for
1212	* the second flush.
1213	*/
1214	goto done;
1215	}
1216
1217	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
1218	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
1219
1220	/*
1221	* If we get to this point, we know that our TLB is out of date.
1222	* This does not strictly imply that we need to flush (it's
1223	* possible that f->new_tlb_gen <= local_tlb_gen), but we're
1224	* going to need to flush in the very near future, so we might
1225	* as well get it over with.
1226	*
1227	* The only question is whether to do a full or partial flush.
1228	*
1229	* We do a partial flush if requested and two extra conditions
1230	* are met:
1231	*
1232	* 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that
1233	* we've always done all needed flushes to catch up to
1234	* local_tlb_gen. If, for example, local_tlb_gen == 2 and
1235	* f->new_tlb_gen == 3, then we know that the flush needed to bring
1236	* us up to date for tlb_gen 3 is the partial flush we're
1237	* processing.
1238	*
1239	* As an example of why this check is needed, suppose that there
1240	* are two concurrent flushes. The first is a full flush that
1241	* changes context.tlb_gen from 1 to 2. The second is a partial
1242	* flush that changes context.tlb_gen from 2 to 3. If they get
1243	* processed on this CPU in reverse order, we'll see
1244	* local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
1245	* If we were to use __flush_tlb_one_user() and set local_tlb_gen to
1246	* 3, we'd be break the invariant: we'd update local_tlb_gen above
1247	* 1 without the full flush that's needed for tlb_gen 2.
1248	*
1249	* 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
1250	* Partial TLB flushes are not all that much cheaper than full TLB
1251	* flushes, so it seems unlikely that it would be a performance win
1252	* to do a partial flush if that won't bring our TLB fully up to
1253	* date. By doing a full flush instead, we can increase
1254	* local_tlb_gen all the way to mm_tlb_gen and we can probably
1255	* avoid another flush in the very near future.
1256	*/
1257	if (f->end != TLB_FLUSH_ALL &&
1258	f->new_tlb_gen == local_tlb_gen + `1` &&
1259	f->new_tlb_gen == mm_tlb_gen) {
1260	/ Partial flush /
1261	unsigned long addr = f->start;
1262
1263	/ Partial flush cannot have invalid generations /
1264	VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID);
1265
1266	/ Partial flush must have valid mm /
1267	VM_WARN_ON(f->mm == NULL);
1268
1269	nr_invalidate = (f->end - f->start) >> f->stride_shift;
1270
1271	while (addr < f->end) {
1272	flush_tlb_one_user(addr);
1273	addr += `1UL` << f->stride_shift;
1274	}
1275	if (local)
1276	count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
1277	} else {
1278	/ Full flush. /
1279	nr_invalidate = TLB_FLUSH_ALL;
1280
1281	flush_tlb_local();
1282	if (local)
1283	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
1284	}
1285
1286	/ Both paths above update our state to mm_tlb_gen. /
1287	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
1288
1289	/ Tracing is done in a unified manner to reduce the code size /
1290	done:
1291	trace_tlb_flush(reason: !local ? TLB_REMOTE_SHOOTDOWN :
1292	(f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN :
1293	TLB_LOCAL_MM_SHOOTDOWN,
1294	pages: nr_invalidate);
1295	}
1296
1297	static bool should_flush_tlb(int cpu, void *data)
1298	{
1299	struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu);
1300	struct flush_tlb_info *info = data;
1301
1302	/*
1303	* Order the 'loaded_mm' and 'is_lazy' against their
1304	* write ordering in switch_mm_irqs_off(). Ensure
1305	* 'is_lazy' is at least as new as 'loaded_mm'.
1306	*/
1307	smp_rmb();
1308
1309	/ Lazy TLB will get flushed at the next context switch. /
1310	if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
1311	return false;
1312
1313	/ No mm means kernel memory flush. /
1314	if (!info->mm)
1315	return true;
1316
1317	/*
1318	* While switching, the remote CPU could have state from
1319	* either the prev or next mm. Assume the worst and flush.
1320	*/
1321	if (loaded_mm == LOADED_MM_SWITCHING)
1322	return true;
1323
1324	/ The target mm is loaded, and the CPU is not lazy. /
1325	if (loaded_mm == info->mm)
1326	return true;
1327
1328	/ In cpumask, but not the loaded mm? Periodically remove by flushing. /
1329	if (info->trim_cpumask)
1330	return true;
1331
1332	return false;
1333	}
1334
1335	static bool should_trim_cpumask(struct mm_struct *mm)
1336	{
1337	if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) {
1338	WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ);
1339	return true;
1340	}
1341	return false;
1342	}
1343
1344	DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
1345	EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared);
1346
1347	STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
1348	const struct flush_tlb_info *info)
1349	{
1350	/*
1351	* Do accounting and tracing. Note that there are (and have always been)
1352	* cases in which a remote TLB flush will be traced, but eventually
1353	* would not happen.
1354	*/
1355	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
1356	if (info->end == TLB_FLUSH_ALL)
1357	trace_tlb_flush(reason: TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
1358	else
1359	trace_tlb_flush(reason: TLB_REMOTE_SEND_IPI,
1360	pages: (info->end - info->start) >> PAGE_SHIFT);
1361
1362	/*
1363	* If no page tables were freed, we can skip sending IPIs to
1364	* CPUs in lazy TLB mode. They will flush the CPU themselves
1365	* at the next context switch.
1366	*
1367	* However, if page tables are getting freed, we need to send the
1368	* IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
1369	* up on the new contents of what used to be page tables, while
1370	* doing a speculative memory access.
1371	*/
1372	if (info->freed_tables \|\| mm_in_asid_transition(mm: info->mm))
1373	on_each_cpu_mask(mask: cpumask, func: flush_tlb_func, info: (void *)info, wait: true);
1374	else
1375	on_each_cpu_cond_mask(cond_func: should_flush_tlb, func: flush_tlb_func,
1376	info: (void *)info, wait: `1`, mask: cpumask);
1377	}
1378
/*
 * Flush the TLBs of the CPUs in @cpumask as described by @info.  Forwards
 * to __flush_tlb_multi(), which the top of this file maps to
 * native_flush_tlb_multi() when CONFIG_PARAVIRT is not set.
 */
void flush_tlb_multi(const struct cpumask *cpumask,
		     const struct flush_tlb_info *info)
{
	__flush_tlb_multi(cpumask, info);
}
1384
1385	/*
1386	* See Documentation/arch/x86/tlb.rst for details. We choose 33
1387	* because it is large enough to cover the vast majority (at
1388	* least 95%) of allocations, and is small enough that we are
1389	* confident it will not cause too much overhead. Each single
1390	* flush is about 100 ns, so this caps the maximum overhead at
1391	* _about_ 3,000 ns.
1392	*
1393	* This is in units of pages.
1394	*/
1395	unsigned long tlb_single_page_flush_ceiling __read_mostly = `33`;
1396
1397	static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
1398
1399	#ifdef CONFIG_DEBUG_VM
1400	static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
1401	#endif
1402
1403	static struct flush_tlb_info get_flush_tlb_info(struct* mm_struct *mm,
1404	unsigned long start, unsigned long end,
1405	unsigned int stride_shift, bool freed_tables,
1406	u64 new_tlb_gen)
1407	{
1408	struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info);
1409
1410	#ifdef CONFIG_DEBUG_VM
1411	/*
1412	* Ensure that the following code is non-reentrant and flush_tlb_info
1413	* is not overwritten. This means no TLB flushing is initiated by
1414	* interrupt handlers and machine-check exception handlers.
1415	*/
1416	BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != `1`);
1417	#endif
1418
1419	/*
1420	* If the number of flushes is so large that a full flush
1421	* would be faster, do a full flush.
1422	*/
1423	if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) {
1424	start = `0`;
1425	end = TLB_FLUSH_ALL;
1426	}
1427
1428	info->start = start;
1429	info->end = end;
1430	info->mm = mm;
1431	info->stride_shift = stride_shift;
1432	info->freed_tables = freed_tables;
1433	info->new_tlb_gen = new_tlb_gen;
1434	info->initiating_cpu = smp_processor_id();
1435	info->trim_cpumask = `0`;
1436
1437	return info;
1438	}
1439
/*
 * Release the descriptor obtained from get_flush_tlb_info().  Does nothing
 * unless CONFIG_DEBUG_VM is set.
 */
static void put_flush_tlb_info(void)
{
#ifdef CONFIG_DEBUG_VM
	/* Finish the reentrancy check started in get_flush_tlb_info(). */
	barrier();
	this_cpu_dec(flush_tlb_info_idx);
#endif
}
1448
1449	void flush_tlb_mm_range(struct mm_struct mm, unsigned* long start,
1450	unsigned long end, unsigned int stride_shift,
1451	bool freed_tables)
1452	{
1453	struct flush_tlb_info *info;
1454	int cpu = get_cpu();
1455	u64 new_tlb_gen;
1456
1457	/ This is also a barrier that synchronizes with switch_mm(). /
1458	new_tlb_gen = inc_mm_tlb_gen(mm);
1459
1460	info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
1461	new_tlb_gen);
1462
1463	/*
1464	* flush_tlb_multi() is not optimized for the common case in which only
1465	* a local TLB flush is needed. Optimize this use-case by calling
1466	* flush_tlb_func_local() directly in this case.
1467	*/
1468	if (mm_global_asid(mm)) {
1469	broadcast_tlb_flush(info);
1470	} else if (cpumask_any_but(mask: mm_cpumask(mm), cpu) < nr_cpu_ids) {
1471	info->trim_cpumask = should_trim_cpumask(mm);
1472	flush_tlb_multi(cpumask: mm_cpumask(mm), info);
1473	consider_global_asid(mm);
1474	} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
1475	lockdep_assert_irqs_enabled();
1476	local_irq_disable();
1477	flush_tlb_func(info);
1478	local_irq_enable();
1479	}
1480
1481	put_flush_tlb_info();
1482	put_cpu();
1483	mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
1484	}
1485
1486	static void do_flush_tlb_all(void *info)
1487	{
1488	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
1489	__flush_tlb_all();
1490	}
1491
1492	void flush_tlb_all(void)
1493	{
1494	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
1495
1496	/ First try (faster) hardware-assisted TLB invalidation. /
1497	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
1498	invlpgb_flush_all();
1499	else
1500	/ Fall back to the IPI-based invalidation. /
1501	on_each_cpu(func: do_flush_tlb_all, NULL, wait: `1`);
1502	}
1503
1504	/ Flush an arbitrarily large range of memory with INVLPGB. /
1505	static void invlpgb_kernel_range_flush(struct flush_tlb_info *info)
1506	{
1507	unsigned long addr, nr;
1508
1509	for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) {
1510	nr = (info->end - addr) >> PAGE_SHIFT;
1511
1512	/*
1513	* INVLPGB has a limit on the size of ranges it can
1514	* flush. Break up large flushes.
1515	*/
1516	nr = clamp_val(nr, `1`, invlpgb_count_max);
1517
1518	invlpgb_flush_addr_nosync(addr, nr);
1519	}
1520	__tlbsync();
1521	}
1522
1523	static void do_kernel_range_flush(void *info)
1524	{
1525	struct flush_tlb_info *f = info;
1526	unsigned long addr;
1527
1528	/ flush range by one by one 'invlpg' /
1529	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
1530	flush_tlb_one_kernel(addr);
1531	}
1532
1533	static void kernel_tlb_flush_all(struct flush_tlb_info *info)
1534	{
1535	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
1536	invlpgb_flush_all();
1537	else
1538	on_each_cpu(func: do_flush_tlb_all, NULL, wait: `1`);
1539	}
1540
1541	static void kernel_tlb_flush_range(struct flush_tlb_info *info)
1542	{
1543	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
1544	invlpgb_kernel_range_flush(info);
1545	else
1546	on_each_cpu(func: do_kernel_range_flush, info, wait: `1`);
1547	}
1548
1549	void flush_tlb_kernel_range(unsigned long start, unsigned long end)
1550	{
1551	struct flush_tlb_info *info;
1552
1553	guard(preempt)();
1554
1555	info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, freed_tables: false,
1556	TLB_GENERATION_INVALID);
1557
1558	if (info->end == TLB_FLUSH_ALL)
1559	kernel_tlb_flush_all(info);
1560	else
1561	kernel_tlb_flush_range(info);
1562
1563	put_flush_tlb_info();
1564	}
1565
1566	/*
1567	* This can be used from process context to figure out what the value of
1568	* CR3 is without needing to do a (slow) __read_cr3().
1569	*
1570	* It's intended to be used for code like KVM that sneakily changes CR3
1571	* and needs to restore it. It needs to be used very carefully.
1572	*/
1573	unsigned long __get_current_cr3_fast(void)
1574	{
1575	unsigned long cr3 =
1576	build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
1577	this_cpu_read(cpu_tlbstate.loaded_mm_asid),
1578	lam: tlbstate_lam_cr3_mask());
1579
1580	/ For now, be very restrictive about when this can be called. /
1581	VM_WARN_ON(in_nmi() \|\| preemptible());
1582
1583	VM_BUG_ON(cr3 != __read_cr3());
1584	return cr3;
1585	}
1586	EXPORT_SYMBOL_FOR_KVM(__get_current_cr3_fast);
1587
1588	/*
1589	* Flush one page in the kernel mapping
1590	*/
1591	void flush_tlb_one_kernel(unsigned long addr)
1592	{
1593	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
1594
1595	/*
1596	* If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
1597	* paravirt equivalent. Even with PCID, this is sufficient: we only
1598	* use PCID if we also use global PTEs for the kernel mapping, and
1599	* INVLPG flushes global translations across all address spaces.
1600	*
1601	* If PTI is on, then the kernel is mapped with non-global PTEs, and
1602	* __flush_tlb_one_user() will flush the given address for the current
1603	* kernel address space and for its usermode counterpart, but it does
1604	* not flush it for other address spaces.
1605	*/
1606	flush_tlb_one_user(addr);
1607
1608	if (!static_cpu_has(X86_FEATURE_PTI))
1609	return;
1610
1611	/*
1612	* See above. We need to propagate the flush to all other address
1613	* spaces. In principle, we only need to propagate it to kernelmode
1614	* address spaces, but the extra bookkeeping we would need is not
1615	* worth it.
1616	*/
1617	this_cpu_write(cpu_tlbstate.invalidate_other, true);
1618	}
1619
1620	/*
1621	* Flush one page in the user mapping
1622	*/
1623	STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
1624	{
1625	u32 loaded_mm_asid;
1626	bool cpu_pcide;
1627
1628	/ Flush 'addr' from the kernel PCID: /
1629	invlpg(addr);
1630
1631	/ If PTI is off there is no user PCID and nothing to flush. /
1632	if (!static_cpu_has(X86_FEATURE_PTI))
1633	return;
1634
1635	loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
1636	cpu_pcide = this_cpu_read(cpu_tlbstate.cr4) & X86_CR4_PCIDE;
1637
1638	/*
1639	* invpcid_flush_one(pcid>0) will #GP if CR4.PCIDE==0. Check
1640	* 'cpu_pcide' to ensure that this CPU will not trigger those
1641	* #GP's even if called before CR4.PCIDE has been initialized.
1642	*/
1643	if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_pcide)
1644	invpcid_flush_one(pcid: user_pcid(asid: loaded_mm_asid), addr);
1645	else
1646	invalidate_user_asid(asid: loaded_mm_asid);
1647	}
1648
/*
 * Flush one page of the user mapping.  Forwards to __flush_tlb_one_user(),
 * which the top of this file maps to native_flush_tlb_one_user() when
 * CONFIG_PARAVIRT is not set.
 */
void flush_tlb_one_user(unsigned long addr)
{
	__flush_tlb_one_user(addr);
}
1653
1654	/*
1655	* Flush everything
1656	*/
1657	STATIC_NOPV void native_flush_tlb_global(void)
1658	{
1659	unsigned long flags;
1660
1661	if (static_cpu_has(X86_FEATURE_INVPCID)) {
1662	/*
1663	* Using INVPCID is considerably faster than a pair of writes
1664	* to CR4 sandwiched inside an IRQ flag save/restore.
1665	*
1666	* Note, this works with CR4.PCIDE=0 or 1.
1667	*/
1668	invpcid_flush_all();
1669	return;
1670	}
1671
1672	/*
1673	* Read-modify-write to CR4 - protect it from preemption and
1674	* from interrupts. (Use the raw variant because this code can
1675	* be called from deep inside debugging code.)
1676	*/
1677	raw_local_irq_save(flags);
1678
1679	__native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4));
1680
1681	raw_local_irq_restore(flags);
1682	}
1683
1684	/*
1685	* Flush the entire current user mapping
1686	*/
1687	STATIC_NOPV void native_flush_tlb_local(void)
1688	{
1689	/*
1690	* Preemption or interrupts must be disabled to protect the access
1691	* to the per CPU variable and to prevent being preempted between
1692	* read_cr3() and write_cr3().
1693	*/
1694	WARN_ON_ONCE(preemptible());
1695
1696	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
1697
1698	/ If current->mm == NULL then the read_cr3() "borrows" an mm /
1699	native_write_cr3(val: __native_read_cr3());
1700	}
1701
/*
 * Flush the current user mapping on this CPU.  Forwards to
 * __flush_tlb_local(), which the top of this file maps to
 * native_flush_tlb_local() when CONFIG_PARAVIRT is not set.
 */
void flush_tlb_local(void)
{
	__flush_tlb_local();
}
1706
1707	/*
1708	* Flush everything
1709	*/
1710	void __flush_tlb_all(void)
1711	{
1712	/*
1713	* This is to catch users with enabled preemption and the PGE feature
1714	* and don't trigger the warning in __native_flush_tlb().
1715	*/
1716	VM_WARN_ON_ONCE(preemptible());
1717
1718	if (cpu_feature_enabled(X86_FEATURE_PGE)) {
1719	__flush_tlb_global();
1720	} else {
1721	/*
1722	* !PGE -> !PCID (setup_pcid()), thus every flush is total.
1723	*/
1724	flush_tlb_local();
1725	}
1726	}
1727	EXPORT_SYMBOL_FOR_KVM(__flush_tlb_all);
1728
1729	void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
1730	{
1731	struct flush_tlb_info *info;
1732
1733	int cpu = get_cpu();
1734
1735	info = get_flush_tlb_info(NULL, start: `0`, TLB_FLUSH_ALL, stride_shift: `0`, freed_tables: false,
1736	TLB_GENERATION_INVALID);
1737	/*
1738	* flush_tlb_multi() is not optimized for the common case in which only
1739	* a local TLB flush is needed. Optimize this use-case by calling
1740	* flush_tlb_func_local() directly in this case.
1741	*/
1742	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) {
1743	invlpgb_flush_all_nonglobals();
1744	batch->unmapped_pages = false;
1745	} else if (cpumask_any_but(mask: &batch->cpumask, cpu) < nr_cpu_ids) {
1746	flush_tlb_multi(cpumask: &batch->cpumask, info);
1747	} else if (cpumask_test_cpu(cpu, cpumask: &batch->cpumask)) {
1748	lockdep_assert_irqs_enabled();
1749	local_irq_disable();
1750	flush_tlb_func(info);
1751	local_irq_enable();
1752	}
1753
1754	cpumask_clear(dstp: &batch->cpumask);
1755
1756	put_flush_tlb_info();
1757	put_cpu();
1758	}
1759
1760	/*
1761	* Blindly accessing user memory from NMI context can be dangerous
1762	* if we're in the middle of switching the current user task or
1763	* switching the loaded mm. It can also be dangerous if we
1764	* interrupted some kernel code that was temporarily using a
1765	* different mm.
1766	*/
1767	bool nmi_uaccess_okay(void)
1768	{
1769	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1770	struct mm_struct *current_mm = current->mm;
1771
1772	VM_WARN_ON_ONCE(!loaded_mm);
1773
1774	/*
1775	* The condition we want to check is
1776	* current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
1777	* if we're running in a VM with shadow paging, and nmi_uaccess_okay()
1778	* is supposed to be reasonably fast.
1779	*
1780	* Instead, we check the almost equivalent but somewhat conservative
1781	* condition below, and we rely on the fact that switch_mm_irqs_off()
1782	* sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
1783	*/
1784	if (loaded_mm != current_mm)
1785	return false;
1786
1787	VM_WARN_ON_ONCE(__pa(current_mm->pgd) != read_cr3_pa());
1788
1789	return true;
1790	}
1791
1792	static ssize_t tlbflush_read_file(struct file file, char* __user *user_buf,
1793	size_t count, loff_t *ppos)
1794	{
1795	char buf[`32`];
1796	unsigned int len;
1797
1798	len = sprintf(buf, fmt: "%ld\n", tlb_single_page_flush_ceiling);
1799	return simple_read_from_buffer(to: user_buf, count, ppos, from: buf, available: len);
1800	}
1801
1802	static ssize_t tlbflush_write_file(struct file *file,
1803	const char __user user_buf, size_t count, loff_t ppos)
1804	{
1805	char buf[`32`];
1806	ssize_t len;
1807	int ceiling;
1808
1809	len = min(count, sizeof(buf) - `1`);
1810	if (copy_from_user(to: buf, from: user_buf, n: len))
1811	return -EFAULT;
1812
1813	buf[len] = `'\0'`;
1814	if (kstrtoint(s: buf, base: `0`, res: &ceiling))
1815	return -EINVAL;
1816
1817	if (ceiling < `0`)
1818	return -EINVAL;
1819
1820	tlb_single_page_flush_ceiling = ceiling;
1821	return count;
1822	}
1823
1824	static const struct file_operations fops_tlbflush = {
1825	.read = tlbflush_read_file,
1826	.write = tlbflush_write_file,
1827	.llseek = default_llseek,
1828	};
1829
1830	static int __init create_tlb_single_page_flush_ceiling(void)
1831	{
1832	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR \| S_IWUSR,
1833	arch_debugfs_dir, NULL, &fops_tlbflush);
1834	return `0`;
1835	}
1836	late_initcall(create_tlb_single_page_flush_ceiling);
1837

source code of linux/arch/x86/mm/tlb.c