// SPDX-License-Identifier: GPL-2.0+
/*
 * Restartable sequences system call
 *
 * Copyright (C) 2015, Google, Inc.,
 * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
 * Copyright (C) 2015-2018, EfficiOS Inc.,
 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 */

#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/rseq.h>
#include <linux/types.h>
#include <linux/ratelimit.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>

/* The original rseq structure size (including padding) is 32 bytes. */
#define ORIG_RSEQ_SIZE		32

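/*
 * Deprecated critical-section flags: rseq_need_restart() rejects any
 * critical section or rseq area that sets one of them (or any unknown
 * flag) with -EINVAL.
 */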
#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
				  RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
				  RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)

#ifdef CONFIG_DEBUG_RSEQ
static struct rseq *rseq_kernel_fields(struct task_struct *t)
{
	return (struct rseq *) t->rseq_fields;
}

static int rseq_validate_ro_fields(struct task_struct *t)
{
	static DEFINE_RATELIMIT_STATE(_rs,
				      DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);
	u32 cpu_id_start, cpu_id, node_id, mm_cid;
	struct rseq __user *rseq = t->rseq;

	/*
	 * Validate fields which are required to be read-only by
	 * user-space.
	 */
	if (!user_read_access_begin(rseq, t->rseq_len))
		goto efault;
	unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
	unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
	unsafe_get_user(node_id, &rseq->node_id, efault_end);
	unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
	user_read_access_end();

	if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
	    cpu_id != rseq_kernel_fields(t)->cpu_id ||
	    node_id != rseq_kernel_fields(t)->node_id ||
	    mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {

		pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
			"\tcpu_id_start: %u ?= %u\n"
			"\tcpu_id: %u ?= %u\n"
			"\tnode_id: %u ?= %u\n"
			"\tmm_cid: %u ?= %u\n",
			t->pid, t->comm,
			cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
			cpu_id, rseq_kernel_fields(t)->cpu_id,
			node_id, rseq_kernel_fields(t)->node_id,
			mm_cid, rseq_kernel_fields(t)->mm_cid);
	}

	/* For now, only print a console warning on mismatch. */
	return 0;

efault_end:
	user_read_access_end();
efault:
	return -EFAULT;
}

/*
 * Update an rseq field and its in-kernel copy in lock-step to keep a coherent
 * state.
 */
#define rseq_unsafe_put_user(t, value, field, error_label)		\
	do {								\
		unsafe_put_user(value, &t->rseq->field, error_label);	\
		rseq_kernel_fields(t)->field = value;			\
	} while (0)

#else
static int rseq_validate_ro_fields(struct task_struct *t)
{
	return 0;
}

#define rseq_unsafe_put_user(t, value, field, error_label)		\
	unsafe_put_user(value, &t->rseq->field, error_label)
#endif

/*
 *
 * Restartable sequences are a lightweight interface that allows
 * user-level code to be executed atomically relative to scheduler
 * preemption and signal delivery. Typically used for implementing
 * per-cpu operations.
 *
 * It allows user-space to perform update operations on per-cpu data
 * without requiring heavy-weight atomic operations.
 *
 * Detailed algorithm of rseq user-space assembly sequences:
 *
 *                     init(rseq_cs)
 *                     cpu = TLS->rseq::cpu_id_start
 *   [1]               TLS->rseq::rseq_cs = rseq_cs
 *   [start_ip]        ----------------------------
 *   [2]               if (cpu != TLS->rseq::cpu_id)
 *                             goto abort_ip;
 *   [3]               <last_instruction_in_cs>
 *   [post_commit_ip]  ----------------------------
 *
 *   The address of jump target abort_ip must be outside the critical
 *   region, i.e.:
 *
 *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
 *
 *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
 *   userspace that can handle being interrupted between any of those
 *   instructions, and then resumed to the abort_ip.
 *
 *   1.  Userspace stores the address of the struct rseq_cs assembly
 *       block descriptor into the rseq_cs field of the registered
 *       struct rseq TLS area. This update is performed through a single
 *       store within the inline assembly instruction sequence.
 *       [start_ip]
 *
 *   2.  Userspace tests to check whether the current cpu_id field matches
 *       the cpu number loaded before start_ip, branching to abort_ip
 *       in case of a mismatch.
 *
 *       If the sequence is preempted or interrupted by a signal
 *       at or after start_ip and before post_commit_ip, then the kernel
 *       clears TLS->__rseq_abi::rseq_cs, and sets the user-space return
 *       ip to abort_ip before returning to user-space, so the preempted
 *       execution resumes at abort_ip.
 *
 *   3.  Userspace critical section final instruction before
 *       post_commit_ip is the commit. The critical section is
 *       self-terminating.
 *       [post_commit_ip]
 *
 *   4.  <success>
 *
 *   On failure at [2], or if interrupted by preempt or signal delivery
 *   between [1] and [3]:
 *
 *       [abort_ip]
 *   F1. <failure>
 */
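
/*
 * Illustrative user-space sketch of the algorithm above (not kernel code;
 * __rseq_abi, RSEQ_READ_ONCE() and RSEQ_WRITE_ONCE() are hypothetical
 * user-space names): incrementing a per-CPU counter.
 *
 *	struct rseq_cs cs = {
 *		.version		= 0,
 *		.flags			= 0,
 *		.start_ip		= (u64)&&start,
 *		.post_commit_offset	= (u64)&&post_commit - (u64)&&start,
 *		.abort_ip		= (u64)&&abort,
 *	};
 *
 *	cpu = RSEQ_READ_ONCE(__rseq_abi.cpu_id_start);
 *	RSEQ_WRITE_ONCE(__rseq_abi.rseq_cs, (u64)&cs);		[1]
 * start:
 *	if (cpu != RSEQ_READ_ONCE(__rseq_abi.cpu_id))		[2]
 *		goto abort;
 *	counters[cpu]++;					[3] commit
 * post_commit:
 *	return 0;
 * abort:
 *	return -1;	(caller retries or falls back to an atomic operation)
 *
 * A real implementation emits [1]-[3] as a single inline assembly block so
 * the compiler cannot reorder or split the sequence, and places the
 * registered signature immediately before the abort handler.
 */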

static int rseq_update_cpu_node_id(struct task_struct *t)
{
	struct rseq __user *rseq = t->rseq;
	u32 cpu_id = raw_smp_processor_id();
	u32 node_id = cpu_to_node(cpu_id);
	u32 mm_cid = task_mm_cid(t);

	/*
	 * Validate read-only rseq fields.
	 */
	if (rseq_validate_ro_fields(t))
		goto efault;
	WARN_ON_ONCE((int) mm_cid < 0);
	if (!user_write_access_begin(rseq, t->rseq_len))
		goto efault;

	rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end);
	rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
	rseq_unsafe_put_user(t, node_id, node_id, efault_end);
	rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);

	/*
	 * Additional feature fields added after ORIG_RSEQ_SIZE
	 * need to be conditionally updated only if
	 * t->rseq_len != ORIG_RSEQ_SIZE.
	 */
	user_write_access_end();
	trace_rseq_update(t);
	return 0;

efault_end:
	user_write_access_end();
efault:
	return -EFAULT;
}

static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
	struct rseq __user *rseq = t->rseq;
	u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
	    mm_cid = 0;

	/*
	 * Validate read-only rseq fields.
	 */
	if (rseq_validate_ro_fields(t))
		goto efault;

	if (!user_write_access_begin(rseq, t->rseq_len))
		goto efault;

	/*
	 * Reset all fields to their initial state.
	 *
	 * All fields have an initial state of 0 except cpu_id which is set to
	 * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after
	 * unregistration can figure out that rseq needs to be registered
	 * again.
	 */
	rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end);
	rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
	rseq_unsafe_put_user(t, node_id, node_id, efault_end);
	rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);

	/*
	 * Additional feature fields added after ORIG_RSEQ_SIZE
	 * need to be conditionally reset only if
	 * t->rseq_len != ORIG_RSEQ_SIZE.
	 */
	user_write_access_end();
	return 0;

efault_end:
	user_write_access_end();
efault:
	return -EFAULT;
}

/*
 * Get the user-space pointer value stored in the 'rseq_cs' field.
 */
static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs)
{
	if (!rseq_cs)
		return -EFAULT;

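	/*
	 * The rseq_cs field is a 64-bit user-space pointer even on 32-bit
	 * kernels: 64-bit kernels read it with a single get_user(), while
	 * 32-bit kernels use copy_from_user(), which does not require
	 * 64-bit get_user() support from the architecture.
	 */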
#ifdef CONFIG_64BIT
	if (get_user(*rseq_cs, &rseq->rseq_cs))
		return -EFAULT;
#else
	if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs)))
		return -EFAULT;
#endif

	return 0;
}

/*
 * If the rseq_cs field of 'struct rseq' contains a valid pointer to
 * user-space, copy 'struct rseq_cs' from user-space and validate its fields.
 */
static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
{
	struct rseq_cs __user *urseq_cs;
	u64 ptr;
	u32 __user *usig;
	u32 sig;
	int ret;

	ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr);
	if (ret)
		return ret;

	/* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */
	if (!ptr) {
		memset(rseq_cs, 0, sizeof(*rseq_cs));
		return 0;
	}
	/* Check that the pointer value fits in the user-space process space. */
	if (ptr >= TASK_SIZE)
		return -EINVAL;
	urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
	if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
		return -EFAULT;

	if (rseq_cs->start_ip >= TASK_SIZE ||
	    rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
	    rseq_cs->abort_ip >= TASK_SIZE ||
	    rseq_cs->version > 0)
		return -EINVAL;
	/* Check for overflow. */
	if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
		return -EINVAL;
	/* Ensure that abort_ip is not in the critical section. */
	if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
		return -EINVAL;

	usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
	ret = get_user(sig, usig);
	if (ret)
		return ret;

	if (current->rseq_sig != sig) {
		printk_ratelimited(KERN_WARNING
			"Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
			sig, current->rseq_sig, current->pid, usig);
		return -EINVAL;
	}
	return 0;
}

static bool rseq_warn_flags(const char *str, u32 flags)
{
	u32 test_flags;

	if (!flags)
		return false;
	test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS;
	if (test_flags)
		pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str);
	test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS;
	if (test_flags)
		pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str);
	return true;
}

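/*
 * Returns a negative error when the flags are invalid or the rseq area
 * cannot be read, 0 when no restart is required, and a positive value when
 * the critical section must be restarted because an rseq event (preemption,
 * signal delivery or migration) was recorded for the task.
 */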
static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
{
	u32 flags, event_mask;
	int ret;

	if (rseq_warn_flags("rseq_cs", cs_flags))
		return -EINVAL;

	/* Get thread flags. */
	ret = get_user(flags, &t->rseq->flags);
	if (ret)
		return ret;

	if (rseq_warn_flags("rseq", flags))
		return -EINVAL;

	/*
	 * Load and clear event mask atomically with respect to
	 * scheduler preemption.
	 */
	preempt_disable();
	event_mask = t->rseq_event_mask;
	t->rseq_event_mask = 0;
	preempt_enable();

	return !!event_mask;
}

static int clear_rseq_cs(struct rseq __user *rseq)
{
	/*
	 * The rseq_cs field is set to NULL on preemption or signal
	 * delivery on top of rseq assembly block, as well as on top
	 * of code outside of the rseq assembly block. This performs
	 * a lazy clear of the rseq_cs field.
	 *
	 * Set rseq_cs to NULL.
	 */
#ifdef CONFIG_64BIT
	return put_user(0UL, &rseq->rseq_cs);
#else
	if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs)))
		return -EFAULT;
	return 0;
#endif
}

/*
 * Unsigned comparison will be true when ip >= start_ip, and when
 * ip < start_ip + post_commit_offset.
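 *
 * If ip < start_ip, the unsigned subtraction wraps around to a large value,
 * so this single comparison against post_commit_offset covers both bounds.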
 */
static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
{
	return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
}

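/*
 * Fix up the user-space instruction pointer when the task was interrupted
 * within a registered rseq critical section: load and validate the rseq_cs
 * descriptor, lazily clear it when the IP lies outside the critical section,
 * and otherwise redirect execution to abort_ip.
 */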
static int rseq_ip_fixup(struct pt_regs *regs)
{
	unsigned long ip = instruction_pointer(regs);
	struct task_struct *t = current;
	struct rseq_cs rseq_cs;
	int ret;

	ret = rseq_get_rseq_cs(t, &rseq_cs);
	if (ret)
		return ret;

	/*
	 * Handle potentially not being within a critical section.
	 * If not nested over a rseq critical section, restart is useless.
	 * Clear the rseq_cs pointer and return.
	 */
	if (!in_rseq_cs(ip, &rseq_cs))
		return clear_rseq_cs(t->rseq);
	ret = rseq_need_restart(t, rseq_cs.flags);
	if (ret <= 0)
		return ret;
	ret = clear_rseq_cs(t->rseq);
	if (ret)
		return ret;
	trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
			    rseq_cs.abort_ip);
	instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
	return 0;
}

/*
 * This resume handler must always be executed between any of:
 * - preemption,
 * - signal delivery,
 * and return to user-space.
 *
 * This is how we can ensure that the entire rseq critical section
 * will issue the commit instruction only if executed atomically with
 * respect to other threads scheduled on the same CPU, and with respect
 * to signal handlers.
 */
void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
{
	struct task_struct *t = current;
	int ret, sig;

	if (unlikely(t->flags & PF_EXITING))
		return;

	/*
	 * regs is NULL if and only if the caller is in a syscall path. Skip
	 * fixup and leave rseq_cs as is so that rseq_syscall() will detect and
	 * kill a misbehaving userspace on debug kernels.
	 */
	if (regs) {
		ret = rseq_ip_fixup(regs);
		if (unlikely(ret < 0))
			goto error;
	}
	if (unlikely(rseq_update_cpu_node_id(t)))
		goto error;
	return;

error:
	sig = ksig ? ksig->sig : 0;
	force_sigsegv(sig);
}

#ifdef CONFIG_DEBUG_RSEQ

/*
 * Terminate the process if a syscall is issued within a restartable
 * sequence.
 */
void rseq_syscall(struct pt_regs *regs)
{
	unsigned long ip = instruction_pointer(regs);
	struct task_struct *t = current;
	struct rseq_cs rseq_cs;

	if (!t->rseq)
		return;
	if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
		force_sig(SIGSEGV);
}

#endif

/*
 * sys_rseq - setup restartable sequences for caller thread.
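 *
 * @rseq:	user-space pointer to the thread's struct rseq area
 * @rseq_len:	size of the rseq area: either ORIG_RSEQ_SIZE or the extended
 *		feature size advertised through the AT_RSEQ_FEATURE_SIZE
 *		auxiliary vector entry
 * @flags:	0 to register, RSEQ_FLAG_UNREGISTER to unregister
 * @sig:	signature matched against the 32-bit word located just before
 *		each critical section's abort handler
 *
 * Illustrative user-space registration sketch (hypothetical names; RSEQ_SIG
 * is an application-chosen signature):
 *
 *	static __thread struct rseq __rseq_abi __attribute__((aligned(32)));
 *
 *	syscall(__NR_rseq, &__rseq_abi, sizeof(__rseq_abi), 0, RSEQ_SIG);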
 */
SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
		int, flags, u32, sig)
{
	int ret;
	u64 rseq_cs;

	if (flags & RSEQ_FLAG_UNREGISTER) {
		if (flags & ~RSEQ_FLAG_UNREGISTER)
			return -EINVAL;
		/* Unregister rseq for current thread. */
		if (current->rseq != rseq || !current->rseq)
			return -EINVAL;
		if (rseq_len != current->rseq_len)
			return -EINVAL;
		if (current->rseq_sig != sig)
			return -EPERM;
		ret = rseq_reset_rseq_cpu_node_id(current);
		if (ret)
			return ret;
		current->rseq = NULL;
		current->rseq_sig = 0;
		current->rseq_len = 0;
		return 0;
	}

	if (unlikely(flags))
		return -EINVAL;

	if (current->rseq) {
		/*
		 * If rseq is already registered, check whether
		 * the provided address differs from the prior
		 * one.
		 */
		if (current->rseq != rseq || rseq_len != current->rseq_len)
			return -EINVAL;
		if (current->rseq_sig != sig)
			return -EPERM;
		/* Already registered. */
		return -EBUSY;
	}

	/*
	 * If there was no rseq previously registered, ensure the provided rseq
	 * is properly aligned, as communicated to user-space through the ELF
	 * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq
	 * size, the required alignment is the original struct rseq alignment.
	 *
	 * In order to be valid, rseq_len is either the original rseq size, or
	 * large enough to contain all supported fields, as communicated to
	 * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE.
	 */
	if (rseq_len < ORIG_RSEQ_SIZE ||
	    (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) ||
	    (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
					    rseq_len < offsetof(struct rseq, end))))
		return -EINVAL;
	if (!access_ok(rseq, rseq_len))
		return -EFAULT;

	/*
	 * If the rseq_cs pointer is non-NULL on registration, clear it to
	 * avoid a potential segfault on return to user-space. The proper thing
	 * to do would have been to fail the registration but this would break
	 * older libcs that reuse the rseq area for new threads without
	 * clearing the fields.
	 */
	if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs))
		return -EFAULT;
	if (rseq_cs && clear_rseq_cs(rseq))
		return -EFAULT;

#ifdef CONFIG_DEBUG_RSEQ
	/*
	 * Initialize the in-kernel rseq fields copy for validation of
	 * read-only fields.
	 */
	if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
	    get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
	    get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
	    get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
		return -EFAULT;
#endif
	/*
	 * Activate the registration by setting the rseq area address, length
	 * and signature in the task struct.
	 */
	current->rseq = rseq;
	current->rseq_len = rseq_len;
	current->rseq_sig = sig;

	/*
	 * If rseq was previously inactive, and has just been
	 * registered, ensure the cpu_id_start and cpu_id fields
	 * are updated before returning to user-space.
	 */
	rseq_set_notify_resume(current);

	return 0;
}