1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Copyright IBM Corp. 1999, 2023 |
4 | */ |
5 | |
6 | #include <linux/cpuhotplug.h> |
7 | #include <linux/sched/task.h> |
8 | #include <linux/errno.h> |
9 | #include <linux/init.h> |
10 | #include <linux/irq.h> |
11 | #include <asm/asm-extable.h> |
12 | #include <asm/pfault.h> |
13 | #include <asm/diag.h> |
14 | |
/* Expected high byte of the external interrupt subcode for pfault signals. */
#define __SUBCODE_MASK 0x0600
/* Pattern placed in the reserved field of the init refbk (see below). */
#define __PF_RES_FIELD 0x8000000000000000UL

/*
 * 'pfault' pseudo page faults routines.
 */
/* Non-zero disables pfault support; set by "nopfault" or on init failure. */
static int pfault_disable;
22 | |
23 | static int __init nopfault(char *str) |
24 | { |
25 | pfault_disable = 1; |
26 | return 1; |
27 | } |
28 | early_param("nopfault" , nopfault); |
29 | |
/*
 * Parameter block ("refbk") passed to the DIAG 0x258 pfault handshake.
 * NOTE(review): field comments are inferred from the two initializers in
 * this file; the authoritative layout is defined by the hypervisor's
 * DIAGNOSE X'258' interface — confirm against that documentation.
 */
struct pfault_refbk {
	u16 refdiagc;	/* diagnose code, always 0x258 here */
	u16 reffcode;	/* function code: 0 = init, 1 = fini/cancel */
	u16 refdwlen;	/* block length in doublewords (5 == 40 bytes) */
	u16 refversn;	/* interface version, 2 in this file */
	u64 refgaddr;	/* guest address of the per-task token (__LC_LPP) */
	u64 refselmk;	/* token select mask */
	u64 refcmpmk;	/* token compare mask */
	u64 reserved;	/* holds __PF_RES_FIELD for the init call */
};
40 | |
/*
 * refbk used by __pfault_init() to establish pfault handshaking.
 * The token address is the lowcore LPP field; pfault_interrupt() later
 * extracts the pid from the delivered token with LPP_PID_MASK.
 */
static struct pfault_refbk pfault_init_refbk = {
	.refdiagc = 0x258,
	.reffcode = 0,	/* function code 0: enable pfault */
	.refdwlen = 5,
	.refversn = 2,
	.refgaddr = __LC_LPP,
	.refselmk = 1UL << 48,
	.refcmpmk = 1UL << 48,
	.reserved = __PF_RES_FIELD
};
51 | |
/*
 * Enable pfault handshaking with the hypervisor via DIAG 0x258,
 * function code 0 (pfault_init_refbk above).
 *
 * Returns 0 on success, a diag return code on failure, or -EOPNOTSUPP
 * when pfault is disabled or the diagnose itself faults: the exception
 * table entry resumes at label 0, leaving rc at its preset value.
 */
int __pfault_init(void)
{
	int rc = -EOPNOTSUPP;

	if (pfault_disable)
		return rc;
	diag_stat_inc(DIAG_STAT_X258);
	asm volatile(
		"	diag	%[refbk],%[rc],0x258\n"
		"0:	nopr	%%r7\n"
		EX_TABLE(0b, 0b)
		: [rc] "+d" (rc)
		: [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk)
		: "cc");
	return rc;
}
68 | |
/* refbk used by __pfault_fini(): function code 1 cancels handshaking. */
static struct pfault_refbk pfault_fini_refbk = {
	.refdiagc = 0x258,
	.reffcode = 1,	/* function code 1: disable pfault */
	.refdwlen = 5,
	.refversn = 2,
};
75 | |
/*
 * Disable pfault handshaking via DIAG 0x258, function code 1.
 * Any return code is ignored (no output operand); the exception table
 * entry simply resumes execution if the diagnose faults.
 */
void __pfault_fini(void)
{
	if (pfault_disable)
		return;
	diag_stat_inc(DIAG_STAT_X258);
	asm volatile(
		"	diag	%[refbk],0,0x258\n"
		"0:	nopr	%%r7\n"
		EX_TABLE(0b, 0b)
		:
		: [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk)
		: "cc");
}
89 | |
/* Protects pfault_list and the pfault_wait state of queued tasks. */
static DEFINE_SPINLOCK(pfault_lock);
/* Tasks sleeping while they wait for a pfault completion interrupt. */
static LIST_HEAD(pfault_list);

/* Subcode bit distinguishing completion from initial interrupts. */
#define PF_COMPLETE 0x0080
94 | |
95 | /* |
96 | * The mechanism of our pfault code: if Linux is running as guest, runs a user |
97 | * space process and the user space process accesses a page that the host has |
98 | * paged out we get a pfault interrupt. |
99 | * |
100 | * This allows us, within the guest, to schedule a different process. Without |
101 | * this mechanism the host would have to suspend the whole virtual cpu until |
102 | * the page has been paged in. |
103 | * |
104 | * So when we get such an interrupt then we set the state of the current task |
105 | * to uninterruptible and also set the need_resched flag. Both happens within |
106 | * interrupt context(!). If we later on want to return to user space we |
107 | * recognize the need_resched flag and then call schedule(). It's not very |
108 | * obvious how this works... |
109 | * |
110 | * Of course we have a lot of additional fun with the completion interrupt (-> |
111 | * host signals that a page of a process has been paged in and the process can |
112 | * continue to run). This interrupt can arrive on any cpu and, since we have |
113 | * virtual cpus, actually appear before the interrupt that signals that a page |
114 | * is missing. |
115 | */ |
116 | static void pfault_interrupt(struct ext_code ext_code, |
117 | unsigned int param32, unsigned long param64) |
118 | { |
119 | struct task_struct *tsk; |
120 | __u16 subcode; |
121 | pid_t pid; |
122 | |
123 | /* |
124 | * Get the external interruption subcode & pfault initial/completion |
125 | * signal bit. VM stores this in the 'cpu address' field associated |
126 | * with the external interrupt. |
127 | */ |
128 | subcode = ext_code.subcode; |
129 | if ((subcode & 0xff00) != __SUBCODE_MASK) |
130 | return; |
131 | inc_irq_stat(IRQEXT_PFL); |
132 | /* Get the token (= pid of the affected task). */ |
133 | pid = param64 & LPP_PID_MASK; |
134 | rcu_read_lock(); |
135 | tsk = find_task_by_pid_ns(nr: pid, ns: &init_pid_ns); |
136 | if (tsk) |
137 | get_task_struct(t: tsk); |
138 | rcu_read_unlock(); |
139 | if (!tsk) |
140 | return; |
141 | spin_lock(lock: &pfault_lock); |
142 | if (subcode & PF_COMPLETE) { |
143 | /* signal bit is set -> a page has been swapped in by VM */ |
144 | if (tsk->thread.pfault_wait == 1) { |
145 | /* |
146 | * Initial interrupt was faster than the completion |
147 | * interrupt. pfault_wait is valid. Set pfault_wait |
148 | * back to zero and wake up the process. This can |
149 | * safely be done because the task is still sleeping |
150 | * and can't produce new pfaults. |
151 | */ |
152 | tsk->thread.pfault_wait = 0; |
153 | list_del(entry: &tsk->thread.list); |
154 | wake_up_process(tsk); |
155 | put_task_struct(t: tsk); |
156 | } else { |
157 | /* |
158 | * Completion interrupt was faster than initial |
159 | * interrupt. Set pfault_wait to -1 so the initial |
160 | * interrupt doesn't put the task to sleep. |
161 | * If the task is not running, ignore the completion |
162 | * interrupt since it must be a leftover of a PFAULT |
163 | * CANCEL operation which didn't remove all pending |
164 | * completion interrupts. |
165 | */ |
166 | if (task_is_running(tsk)) |
167 | tsk->thread.pfault_wait = -1; |
168 | } |
169 | } else { |
170 | /* signal bit not set -> a real page is missing. */ |
171 | if (WARN_ON_ONCE(tsk != current)) |
172 | goto out; |
173 | if (tsk->thread.pfault_wait == 1) { |
174 | /* Already on the list with a reference: put to sleep */ |
175 | goto block; |
176 | } else if (tsk->thread.pfault_wait == -1) { |
177 | /* |
178 | * Completion interrupt was faster than the initial |
179 | * interrupt (pfault_wait == -1). Set pfault_wait |
180 | * back to zero and exit. |
181 | */ |
182 | tsk->thread.pfault_wait = 0; |
183 | } else { |
184 | /* |
185 | * Initial interrupt arrived before completion |
186 | * interrupt. Let the task sleep. |
187 | * An extra task reference is needed since a different |
188 | * cpu may set the task state to TASK_RUNNING again |
189 | * before the scheduler is reached. |
190 | */ |
191 | get_task_struct(t: tsk); |
192 | tsk->thread.pfault_wait = 1; |
193 | list_add(new: &tsk->thread.list, head: &pfault_list); |
194 | block: |
195 | /* |
196 | * Since this must be a userspace fault, there |
197 | * is no kernel task state to trample. Rely on the |
198 | * return to userspace schedule() to block. |
199 | */ |
200 | __set_current_state(TASK_UNINTERRUPTIBLE); |
201 | set_tsk_need_resched(tsk); |
202 | set_preempt_need_resched(); |
203 | } |
204 | } |
205 | out: |
206 | spin_unlock(lock: &pfault_lock); |
207 | put_task_struct(t: tsk); |
208 | } |
209 | |
210 | static int pfault_cpu_dead(unsigned int cpu) |
211 | { |
212 | struct thread_struct *thread, *next; |
213 | struct task_struct *tsk; |
214 | |
215 | spin_lock_irq(lock: &pfault_lock); |
216 | list_for_each_entry_safe(thread, next, &pfault_list, list) { |
217 | thread->pfault_wait = 0; |
218 | list_del(entry: &thread->list); |
219 | tsk = container_of(thread, struct task_struct, thread); |
220 | wake_up_process(tsk); |
221 | put_task_struct(t: tsk); |
222 | } |
223 | spin_unlock_irq(lock: &pfault_lock); |
224 | return 0; |
225 | } |
226 | |
227 | static int __init pfault_irq_init(void) |
228 | { |
229 | int rc; |
230 | |
231 | rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); |
232 | if (rc) |
233 | goto out_extint; |
234 | rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; |
235 | if (rc) |
236 | goto out_pfault; |
237 | irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); |
238 | cpuhp_setup_state_nocalls(state: CPUHP_S390_PFAULT_DEAD, name: "s390/pfault:dead" , |
239 | NULL, teardown: pfault_cpu_dead); |
240 | return 0; |
241 | |
242 | out_pfault: |
243 | unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); |
244 | out_extint: |
245 | pfault_disable = 1; |
246 | return rc; |
247 | } |
248 | early_initcall(pfault_irq_init); |
249 | |