1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Author: Andrei Vagin <avagin@openvz.org> |
4 | * Author: Dmitry Safonov <dima@arista.com> |
5 | */ |
6 | |
7 | #include <linux/time_namespace.h> |
8 | #include <linux/user_namespace.h> |
9 | #include <linux/sched/signal.h> |
10 | #include <linux/sched/task.h> |
11 | #include <linux/clocksource.h> |
12 | #include <linux/seq_file.h> |
13 | #include <linux/proc_ns.h> |
14 | #include <linux/export.h> |
15 | #include <linux/time.h> |
16 | #include <linux/slab.h> |
17 | #include <linux/cred.h> |
18 | #include <linux/err.h> |
19 | #include <linux/mm.h> |
20 | |
21 | #include <vdso/datapage.h> |
22 | |
23 | ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, |
24 | struct timens_offsets *ns_offsets) |
25 | { |
26 | ktime_t offset; |
27 | |
28 | switch (clockid) { |
29 | case CLOCK_MONOTONIC: |
30 | offset = timespec64_to_ktime(ts: ns_offsets->monotonic); |
31 | break; |
32 | case CLOCK_BOOTTIME: |
33 | case CLOCK_BOOTTIME_ALARM: |
34 | offset = timespec64_to_ktime(ts: ns_offsets->boottime); |
35 | break; |
36 | default: |
37 | return tim; |
38 | } |
39 | |
40 | /* |
41 | * Check that @tim value is in [offset, KTIME_MAX + offset] |
42 | * and subtract offset. |
43 | */ |
44 | if (tim < offset) { |
45 | /* |
46 | * User can specify @tim *absolute* value - if it's lesser than |
47 | * the time namespace's offset - it's already expired. |
48 | */ |
49 | tim = 0; |
50 | } else { |
51 | tim = ktime_sub(tim, offset); |
52 | if (unlikely(tim > KTIME_MAX)) |
53 | tim = KTIME_MAX; |
54 | } |
55 | |
56 | return tim; |
57 | } |
58 | |
59 | static struct ucounts *inc_time_namespaces(struct user_namespace *ns) |
60 | { |
61 | return inc_ucount(ns, current_euid(), type: UCOUNT_TIME_NAMESPACES); |
62 | } |
63 | |
64 | static void dec_time_namespaces(struct ucounts *ucounts) |
65 | { |
66 | dec_ucount(ucounts, type: UCOUNT_TIME_NAMESPACES); |
67 | } |
68 | |
69 | /** |
70 | * clone_time_ns - Clone a time namespace |
71 | * @user_ns: User namespace which owns a new namespace. |
72 | * @old_ns: Namespace to clone |
73 | * |
74 | * Clone @old_ns and set the clone refcount to 1 |
75 | * |
76 | * Return: The new namespace or ERR_PTR. |
77 | */ |
78 | static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, |
79 | struct time_namespace *old_ns) |
80 | { |
81 | struct time_namespace *ns; |
82 | struct ucounts *ucounts; |
83 | int err; |
84 | |
85 | err = -ENOSPC; |
86 | ucounts = inc_time_namespaces(ns: user_ns); |
87 | if (!ucounts) |
88 | goto fail; |
89 | |
90 | err = -ENOMEM; |
91 | ns = kmalloc(size: sizeof(*ns), GFP_KERNEL_ACCOUNT); |
92 | if (!ns) |
93 | goto fail_dec; |
94 | |
95 | refcount_set(r: &ns->ns.count, n: 1); |
96 | |
97 | ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); |
98 | if (!ns->vvar_page) |
99 | goto fail_free; |
100 | |
101 | err = ns_alloc_inum(ns: &ns->ns); |
102 | if (err) |
103 | goto fail_free_page; |
104 | |
105 | ns->ucounts = ucounts; |
106 | ns->ns.ops = &timens_operations; |
107 | ns->user_ns = get_user_ns(ns: user_ns); |
108 | ns->offsets = old_ns->offsets; |
109 | ns->frozen_offsets = false; |
110 | return ns; |
111 | |
112 | fail_free_page: |
113 | __free_page(ns->vvar_page); |
114 | fail_free: |
115 | kfree(objp: ns); |
116 | fail_dec: |
117 | dec_time_namespaces(ucounts); |
118 | fail: |
119 | return ERR_PTR(error: err); |
120 | } |
121 | |
122 | /** |
123 | * copy_time_ns - Create timens_for_children from @old_ns |
124 | * @flags: Cloning flags |
125 | * @user_ns: User namespace which owns a new namespace. |
126 | * @old_ns: Namespace to clone |
127 | * |
128 | * If CLONE_NEWTIME specified in @flags, creates a new timens_for_children; |
129 | * adds a refcounter to @old_ns otherwise. |
130 | * |
131 | * Return: timens_for_children namespace or ERR_PTR. |
132 | */ |
133 | struct time_namespace *copy_time_ns(unsigned long flags, |
134 | struct user_namespace *user_ns, struct time_namespace *old_ns) |
135 | { |
136 | if (!(flags & CLONE_NEWTIME)) |
137 | return get_time_ns(ns: old_ns); |
138 | |
139 | return clone_time_ns(user_ns, old_ns); |
140 | } |
141 | |
142 | static struct timens_offset offset_from_ts(struct timespec64 off) |
143 | { |
144 | struct timens_offset ret; |
145 | |
146 | ret.sec = off.tv_sec; |
147 | ret.nsec = off.tv_nsec; |
148 | |
149 | return ret; |
150 | } |
151 | |
152 | /* |
153 | * A time namespace VVAR page has the same layout as the VVAR page which |
154 | * contains the system wide VDSO data. |
155 | * |
156 | * For a normal task the VVAR pages are installed in the normal ordering: |
157 | * VVAR |
158 | * PVCLOCK |
159 | * HVCLOCK |
160 | * TIMENS <- Not really required |
161 | * |
162 | * Now for a timens task the pages are installed in the following order: |
163 | * TIMENS |
164 | * PVCLOCK |
165 | * HVCLOCK |
166 | * VVAR |
167 | * |
168 | * The check for vdso_data->clock_mode is in the unlikely path of |
169 | * the seq begin magic. So for the non-timens case most of the time |
170 | * 'seq' is even, so the branch is not taken. |
171 | * |
172 | * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check |
173 | * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the |
174 | * update to finish and for 'seq' to become even anyway. |
175 | * |
176 | * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which |
177 | * enforces the time namespace handling path. |
178 | */ |
179 | static void timens_setup_vdso_data(struct vdso_data *vdata, |
180 | struct time_namespace *ns) |
181 | { |
182 | struct timens_offset *offset = vdata->offset; |
183 | struct timens_offset monotonic = offset_from_ts(off: ns->offsets.monotonic); |
184 | struct timens_offset boottime = offset_from_ts(off: ns->offsets.boottime); |
185 | |
186 | vdata->seq = 1; |
187 | vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; |
188 | offset[CLOCK_MONOTONIC] = monotonic; |
189 | offset[CLOCK_MONOTONIC_RAW] = monotonic; |
190 | offset[CLOCK_MONOTONIC_COARSE] = monotonic; |
191 | offset[CLOCK_BOOTTIME] = boottime; |
192 | offset[CLOCK_BOOTTIME_ALARM] = boottime; |
193 | } |
194 | |
195 | struct page *find_timens_vvar_page(struct vm_area_struct *vma) |
196 | { |
197 | if (likely(vma->vm_mm == current->mm)) |
198 | return current->nsproxy->time_ns->vvar_page; |
199 | |
200 | /* |
201 | * VM_PFNMAP | VM_IO protect .fault() handler from being called |
202 | * through interfaces like /proc/$pid/mem or |
203 | * process_vm_{readv,writev}() as long as there's no .access() |
204 | * in special_mapping_vmops(). |
205 | * For more details check_vma_flags() and __access_remote_vm() |
206 | */ |
207 | |
208 | WARN(1, "vvar_page accessed remotely" ); |
209 | |
210 | return NULL; |
211 | } |
212 | |
213 | /* |
214 | * Protects possibly multiple offsets writers racing each other |
215 | * and tasks entering the namespace. |
216 | */ |
217 | static DEFINE_MUTEX(offset_lock); |
218 | |
219 | static void timens_set_vvar_page(struct task_struct *task, |
220 | struct time_namespace *ns) |
221 | { |
222 | struct vdso_data *vdata; |
223 | unsigned int i; |
224 | |
225 | if (ns == &init_time_ns) |
226 | return; |
227 | |
228 | /* Fast-path, taken by every task in namespace except the first. */ |
229 | if (likely(ns->frozen_offsets)) |
230 | return; |
231 | |
232 | mutex_lock(&offset_lock); |
233 | /* Nothing to-do: vvar_page has been already initialized. */ |
234 | if (ns->frozen_offsets) |
235 | goto out; |
236 | |
237 | ns->frozen_offsets = true; |
238 | vdata = arch_get_vdso_data(page_address(ns->vvar_page)); |
239 | |
240 | for (i = 0; i < CS_BASES; i++) |
241 | timens_setup_vdso_data(vdata: &vdata[i], ns); |
242 | |
243 | out: |
244 | mutex_unlock(lock: &offset_lock); |
245 | } |
246 | |
247 | void free_time_ns(struct time_namespace *ns) |
248 | { |
249 | dec_time_namespaces(ucounts: ns->ucounts); |
250 | put_user_ns(ns: ns->user_ns); |
251 | ns_free_inum(&ns->ns); |
252 | __free_page(ns->vvar_page); |
253 | kfree(objp: ns); |
254 | } |
255 | |
256 | static struct time_namespace *to_time_ns(struct ns_common *ns) |
257 | { |
258 | return container_of(ns, struct time_namespace, ns); |
259 | } |
260 | |
261 | static struct ns_common *timens_get(struct task_struct *task) |
262 | { |
263 | struct time_namespace *ns = NULL; |
264 | struct nsproxy *nsproxy; |
265 | |
266 | task_lock(p: task); |
267 | nsproxy = task->nsproxy; |
268 | if (nsproxy) { |
269 | ns = nsproxy->time_ns; |
270 | get_time_ns(ns); |
271 | } |
272 | task_unlock(p: task); |
273 | |
274 | return ns ? &ns->ns : NULL; |
275 | } |
276 | |
277 | static struct ns_common *timens_for_children_get(struct task_struct *task) |
278 | { |
279 | struct time_namespace *ns = NULL; |
280 | struct nsproxy *nsproxy; |
281 | |
282 | task_lock(p: task); |
283 | nsproxy = task->nsproxy; |
284 | if (nsproxy) { |
285 | ns = nsproxy->time_ns_for_children; |
286 | get_time_ns(ns); |
287 | } |
288 | task_unlock(p: task); |
289 | |
290 | return ns ? &ns->ns : NULL; |
291 | } |
292 | |
/* Drop one reference on the time namespace that embeds @ns. */
static void timens_put(struct ns_common *ns)
{
	struct time_namespace *time_ns = to_time_ns(ns);

	put_time_ns(time_ns);
}
297 | |
/*
 * Finish moving @tsk into @ns: make sure the namespace VVAR page is
 * initialized, then let the vDSO code join @tsk to the namespace.
 */
void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
{
	/* The VVAR page has to be set up before the vDSO join. */
	timens_set_vvar_page(tsk, ns);

	vdso_join_timens(tsk, ns);
}
303 | |
304 | static int timens_install(struct nsset *nsset, struct ns_common *new) |
305 | { |
306 | struct nsproxy *nsproxy = nsset->nsproxy; |
307 | struct time_namespace *ns = to_time_ns(ns: new); |
308 | |
309 | if (!current_is_single_threaded()) |
310 | return -EUSERS; |
311 | |
312 | if (!ns_capable(ns: ns->user_ns, CAP_SYS_ADMIN) || |
313 | !ns_capable(ns: nsset->cred->user_ns, CAP_SYS_ADMIN)) |
314 | return -EPERM; |
315 | |
316 | get_time_ns(ns); |
317 | put_time_ns(ns: nsproxy->time_ns); |
318 | nsproxy->time_ns = ns; |
319 | |
320 | get_time_ns(ns); |
321 | put_time_ns(ns: nsproxy->time_ns_for_children); |
322 | nsproxy->time_ns_for_children = ns; |
323 | return 0; |
324 | } |
325 | |
326 | void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) |
327 | { |
328 | struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; |
329 | struct time_namespace *ns = to_time_ns(ns: nsc); |
330 | |
331 | /* create_new_namespaces() already incremented the ref counter */ |
332 | if (nsproxy->time_ns == nsproxy->time_ns_for_children) |
333 | return; |
334 | |
335 | get_time_ns(ns); |
336 | put_time_ns(ns: nsproxy->time_ns); |
337 | nsproxy->time_ns = ns; |
338 | |
339 | timens_commit(tsk, ns); |
340 | } |
341 | |
342 | static struct user_namespace *timens_owner(struct ns_common *ns) |
343 | { |
344 | return to_time_ns(ns)->user_ns; |
345 | } |
346 | |
347 | static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) |
348 | { |
349 | char *clock; |
350 | |
351 | switch (clockid) { |
352 | case CLOCK_BOOTTIME: |
353 | clock = "boottime" ; |
354 | break; |
355 | case CLOCK_MONOTONIC: |
356 | clock = "monotonic" ; |
357 | break; |
358 | default: |
359 | clock = "unknown" ; |
360 | break; |
361 | } |
362 | seq_printf(m, fmt: "%-10s %10lld %9ld\n" , clock, ts->tv_sec, ts->tv_nsec); |
363 | } |
364 | |
365 | void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) |
366 | { |
367 | struct ns_common *ns; |
368 | struct time_namespace *time_ns; |
369 | |
370 | ns = timens_for_children_get(task: p); |
371 | if (!ns) |
372 | return; |
373 | time_ns = to_time_ns(ns); |
374 | |
375 | show_offset(m, CLOCK_MONOTONIC, ts: &time_ns->offsets.monotonic); |
376 | show_offset(m, CLOCK_BOOTTIME, ts: &time_ns->offsets.boottime); |
377 | put_time_ns(ns: time_ns); |
378 | } |
379 | |
380 | int proc_timens_set_offset(struct file *file, struct task_struct *p, |
381 | struct proc_timens_offset *offsets, int noffsets) |
382 | { |
383 | struct ns_common *ns; |
384 | struct time_namespace *time_ns; |
385 | struct timespec64 tp; |
386 | int i, err; |
387 | |
388 | ns = timens_for_children_get(task: p); |
389 | if (!ns) |
390 | return -ESRCH; |
391 | time_ns = to_time_ns(ns); |
392 | |
393 | if (!file_ns_capable(file, ns: time_ns->user_ns, CAP_SYS_TIME)) { |
394 | put_time_ns(ns: time_ns); |
395 | return -EPERM; |
396 | } |
397 | |
398 | for (i = 0; i < noffsets; i++) { |
399 | struct proc_timens_offset *off = &offsets[i]; |
400 | |
401 | switch (off->clockid) { |
402 | case CLOCK_MONOTONIC: |
403 | ktime_get_ts64(ts: &tp); |
404 | break; |
405 | case CLOCK_BOOTTIME: |
406 | ktime_get_boottime_ts64(ts: &tp); |
407 | break; |
408 | default: |
409 | err = -EINVAL; |
410 | goto out; |
411 | } |
412 | |
413 | err = -ERANGE; |
414 | |
415 | if (off->val.tv_sec > KTIME_SEC_MAX || |
416 | off->val.tv_sec < -KTIME_SEC_MAX) |
417 | goto out; |
418 | |
419 | tp = timespec64_add(lhs: tp, rhs: off->val); |
420 | /* |
421 | * KTIME_SEC_MAX is divided by 2 to be sure that KTIME_MAX is |
422 | * still unreachable. |
423 | */ |
424 | if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) |
425 | goto out; |
426 | } |
427 | |
428 | mutex_lock(&offset_lock); |
429 | if (time_ns->frozen_offsets) { |
430 | err = -EACCES; |
431 | goto out_unlock; |
432 | } |
433 | |
434 | err = 0; |
435 | /* Don't report errors after this line */ |
436 | for (i = 0; i < noffsets; i++) { |
437 | struct proc_timens_offset *off = &offsets[i]; |
438 | struct timespec64 *offset = NULL; |
439 | |
440 | switch (off->clockid) { |
441 | case CLOCK_MONOTONIC: |
442 | offset = &time_ns->offsets.monotonic; |
443 | break; |
444 | case CLOCK_BOOTTIME: |
445 | offset = &time_ns->offsets.boottime; |
446 | break; |
447 | } |
448 | |
449 | *offset = off->val; |
450 | } |
451 | |
452 | out_unlock: |
453 | mutex_unlock(lock: &offset_lock); |
454 | out: |
455 | put_time_ns(ns: time_ns); |
456 | |
457 | return err; |
458 | } |
459 | |
460 | const struct proc_ns_operations timens_operations = { |
461 | .name = "time" , |
462 | .type = CLONE_NEWTIME, |
463 | .get = timens_get, |
464 | .put = timens_put, |
465 | .install = timens_install, |
466 | .owner = timens_owner, |
467 | }; |
468 | |
469 | const struct proc_ns_operations timens_for_children_operations = { |
470 | .name = "time_for_children" , |
471 | .real_ns_name = "time" , |
472 | .type = CLONE_NEWTIME, |
473 | .get = timens_for_children_get, |
474 | .put = timens_put, |
475 | .install = timens_install, |
476 | .owner = timens_owner, |
477 | }; |
478 | |
479 | struct time_namespace init_time_ns = { |
480 | .ns.count = REFCOUNT_INIT(3), |
481 | .user_ns = &init_user_ns, |
482 | .ns.inum = PROC_TIME_INIT_INO, |
483 | .ns.ops = &timens_operations, |
484 | .frozen_offsets = true, |
485 | }; |
486 | |