// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic pidhash and scalable, time-bounded PID allocator
 *
 * (C) 2002-2003 Nadia Yvette Chambers, IBM
 * (C) 2004 Nadia Yvette Chambers, Oracle
 * (C) 2002-2004 Ingo Molnar, Red Hat
 *
 * pid-structures are backing objects for tasks sharing a given ID to chain
 * against. There is very little to them aside from hashing them and
 * parking tasks using given ID's on a list.
 *
 * The hash is always changed with the tasklist_lock write-acquired,
 * and the hash is only accessed with the tasklist_lock at least
 * read-acquired, so there's no additional SMP locking needed here.
 *
 * We have a list of bitmap pages, whose bitmaps represent the PID space.
 * Allocating and freeing PIDs is completely lockless. The worst-case
 * allocation scenario, when all but one of a million possible PIDs are
 * already allocated, scans 32 list entries and at most PAGE_SIZE bytes.
 * The typical fastpath is a single successful setbit. Freeing is O(1).
 *
 * Pid namespaces:
 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
 *    Many thanks to Oleg Nesterov for comments and help
 *
 */

#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/memblock.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/anon_inodes.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <linux/pidfs.h>
#include <linux/seqlock.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>

struct pid init_struct_pid = {
	.count = REFCOUNT_INIT(1),
	.tasks = {
		{ .first = NULL },
		{ .first = NULL },
		{ .first = NULL },
	},
	.level = 0,
	.numbers = { {
		.nr = 0,
		.ns = &init_pid_ns,
	}, }
};

static int pid_max_min = RESERVED_PIDS + 1;
static int pid_max_max = PID_MAX_LIMIT;

/*
 * PID-map pages start out as NULL, they get allocated upon
 * first use and are never deallocated. This way a low pid_max
 * value does not cause lots of bitmaps to be allocated, but
 * the scheme scales up to 4 million PIDs at runtime.
 */
struct pid_namespace init_pid_ns = {
	.ns.count = REFCOUNT_INIT(2),
	.idr = IDR_INIT(init_pid_ns.idr),
	.pid_allocated = PIDNS_ADDING,
	.level = 0,
	.child_reaper = &init_task,
	.user_ns = &init_user_ns,
	.ns.inum = PROC_PID_INIT_INO,
#ifdef CONFIG_PID_NS
	.ns.ops = &pidns_operations,
#endif
	.pid_max = PID_MAX_DEFAULT,
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
	.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);

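/*
 * Drop a reference to a struct pid. When the last reference is gone the
 * pid is returned to its namespace's kmem cache and the reference on the
 * most nested namespace taken in alloc_pid() is dropped.
 */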
void put_pid(struct pid *pid)
{
	struct pid_namespace *ns;

	if (!pid)
		return;

	ns = pid->numbers[pid->level].ns;
	if (refcount_dec_and_test(&pid->count)) {
		WARN_ON_ONCE(pid->stashed);
		kmem_cache_free(ns->pid_cachep, pid);
		put_pid_ns(ns);
	}
}
EXPORT_SYMBOL_GPL(put_pid);

static void delayed_put_pid(struct rcu_head *rhp)
{
	struct pid *pid = container_of(rhp, struct pid, rcu);
	put_pid(pid);
}

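/*
 * Remove the pid from the IDR of every namespace it is visible in, then
 * drop the allocation reference after an RCU grace period so concurrent
 * find_pid_ns() lookups remain safe.
 */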
void free_pid(struct pid *pid)
{
	int i;

	lockdep_assert_not_held(&tasklist_lock);

	spin_lock(&pidmap_lock);
	for (i = 0; i <= pid->level; i++) {
		struct upid *upid = pid->numbers + i;
		struct pid_namespace *ns = upid->ns;
		switch (--ns->pid_allocated) {
		case 2:
		case 1:
			/*
			 * When all that is left in the pid namespace
			 * is the reaper, wake up the reaper. The reaper
			 * may be sleeping in zap_pid_ns_processes().
			 */
			wake_up_process(ns->child_reaper);
			break;
		case PIDNS_ADDING:
			/* Handle a fork failure of the first process */
			WARN_ON(ns->child_reaper);
			ns->pid_allocated = 0;
			break;
		}

		idr_remove(&ns->idr, upid->nr);
	}
	pidfs_remove_pid(pid);
	spin_unlock(&pidmap_lock);

	call_rcu(&pid->rcu, delayed_put_pid);
}

void free_pids(struct pid **pids)
{
	int tmp;

	/*
	 * This could batch the pidmap_lock: take it once here instead of
	 * once per free_pid() call.
	 */
	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
		if (pids[tmp])
			free_pid(pids[tmp]);
}

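/*
 * Allocate a struct pid and reserve a PID number in @ns and in each of its
 * ancestor namespaces. @set_tid may request specific numbers for the
 * @set_tid_size most nested namespaces (checkpoint/restore). Returns the
 * new pid or an ERR_PTR() on failure.
 */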
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
		      size_t set_tid_size)
{
	struct pid *pid;
	enum pid_type type;
	int i, nr;
	struct pid_namespace *tmp;
	struct upid *upid;
	int retval = -ENOMEM;

	/*
	 * set_tid_size contains the size of the set_tid array. Starting at
	 * the most nested currently active PID namespace it tells alloc_pid()
	 * which PID to set for a process in that most nested PID namespace
	 * up to set_tid_size PID namespaces. It does not have to set the PID
	 * for a process in all nested PID namespaces but set_tid_size must
	 * never be greater than the current ns->level + 1.
	 */
	if (set_tid_size > ns->level + 1)
		return ERR_PTR(-EINVAL);

	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
	if (!pid)
		return ERR_PTR(retval);

	tmp = ns;
	pid->level = ns->level;

	for (i = ns->level; i >= 0; i--) {
		int tid = 0;
		int pid_max = READ_ONCE(tmp->pid_max);

		if (set_tid_size) {
			tid = set_tid[ns->level - i];

			retval = -EINVAL;
			if (tid < 1 || tid >= pid_max)
				goto out_free;
			/*
			 * Also fail if a PID != 1 is requested and
			 * no PID 1 exists.
			 */
			if (tid != 1 && !tmp->child_reaper)
				goto out_free;
			retval = -EPERM;
			if (!checkpoint_restore_ns_capable(tmp->user_ns))
				goto out_free;
			set_tid_size--;
		}

		idr_preload(GFP_KERNEL);
		spin_lock(&pidmap_lock);

		if (tid) {
			nr = idr_alloc(&tmp->idr, NULL, tid,
				       tid + 1, GFP_ATOMIC);
			/*
			 * If ENOSPC is returned it means that the PID is
			 * already in use. Return EEXIST in that case.
			 */
			if (nr == -ENOSPC)
				nr = -EEXIST;
		} else {
			int pid_min = 1;
			/*
			 * init really needs pid 1, but after reaching the
			 * maximum wrap back to RESERVED_PIDS
			 */
			if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
				pid_min = RESERVED_PIDS;

			/*
			 * Store a null pointer so find_pid_ns does not find
			 * a partially initialized PID (see below).
			 */
			nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
					      pid_max, GFP_ATOMIC);
		}
		spin_unlock(&pidmap_lock);
		idr_preload_end();

		if (nr < 0) {
			retval = (nr == -ENOSPC) ? -EAGAIN : nr;
			goto out_free;
		}

		pid->numbers[i].nr = nr;
		pid->numbers[i].ns = tmp;
		tmp = tmp->parent;
	}

	/*
	 * ENOMEM is not the most obvious choice especially for the case
	 * where the child subreaper has already exited and the pid
	 * namespace denies the creation of any new processes. But ENOMEM
	 * is what we have exposed to userspace for a long time and it is
	 * documented behavior for pid namespaces. So we can't easily
	 * change it even if there were an error code better suited.
	 */
	retval = -ENOMEM;

	get_pid_ns(ns);
	refcount_set(&pid->count, 1);
	spin_lock_init(&pid->lock);
	for (type = 0; type < PIDTYPE_MAX; ++type)
		INIT_HLIST_HEAD(&pid->tasks[type]);

	init_waitqueue_head(&pid->wait_pidfd);
	INIT_HLIST_HEAD(&pid->inodes);

	upid = pid->numbers + ns->level;
	idr_preload(GFP_KERNEL);
	spin_lock(&pidmap_lock);
	if (!(ns->pid_allocated & PIDNS_ADDING))
		goto out_unlock;
	pidfs_add_pid(pid);
	for ( ; upid >= pid->numbers; --upid) {
		/* Make the PID visible to find_pid_ns. */
		idr_replace(&upid->ns->idr, pid, upid->nr);
		upid->ns->pid_allocated++;
	}
	spin_unlock(&pidmap_lock);
	idr_preload_end();

	return pid;

out_unlock:
	spin_unlock(&pidmap_lock);
	idr_preload_end();
	put_pid_ns(ns);

out_free:
	spin_lock(&pidmap_lock);
	while (++i <= ns->level) {
		upid = pid->numbers + i;
		idr_remove(&upid->ns->idr, upid->nr);
	}

	/* On failure to allocate the first pid, reset the state */
	if (ns->pid_allocated == PIDNS_ADDING)
		idr_set_cursor(&ns->idr, 0);

	spin_unlock(&pidmap_lock);

	kmem_cache_free(ns->pid_cachep, pid);
	return ERR_PTR(retval);
}

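/* Prevent any further PID allocation in @ns by clearing PIDNS_ADDING. */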
void disable_pid_allocation(struct pid_namespace *ns)
{
	spin_lock(&pidmap_lock);
	ns->pid_allocated &= ~PIDNS_ADDING;
	spin_unlock(&pidmap_lock);
}

struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
	return idr_find(&ns->idr, nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);

struct pid *find_vpid(int nr)
{
	return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);

static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
{
	return (type == PIDTYPE_PID) ?
		&task->thread_pid :
		&task->signal->pids[type];
}

/*
 * attach_pid() must be called with the tasklist_lock write-held.
 */
void attach_pid(struct task_struct *task, enum pid_type type)
{
	struct pid *pid;

	lockdep_assert_held_write(&tasklist_lock);

	pid = *task_pid_ptr(task, type);
	hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
}

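/*
 * Unhook @task from the pid it currently uses for @type and point it at
 * @new. If the old pid is no longer used by any task for any pid type,
 * it is handed back through @pids so the caller can free it.
 */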
static void __change_pid(struct pid **pids, struct task_struct *task,
			 enum pid_type type, struct pid *new)
{
	struct pid **pid_ptr, *pid;
	int tmp;

	lockdep_assert_held_write(&tasklist_lock);

	pid_ptr = task_pid_ptr(task, type);
	pid = *pid_ptr;

	hlist_del_rcu(&task->pid_links[type]);
	*pid_ptr = new;

	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
		if (pid_has_task(pid, tmp))
			return;

	WARN_ON(pids[type]);
	pids[type] = pid;
}

void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
{
	__change_pid(pids, task, type, NULL);
}

void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
		struct pid *pid)
{
	__change_pid(pids, task, type, pid);
	attach_pid(task, type);
}

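/*
 * Swap the PIDTYPE_PID pids, task list entries and cached pid values of
 * two tasks. Requires the tasklist_lock to be held for writing.
 */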
void exchange_tids(struct task_struct *left, struct task_struct *right)
{
	struct pid *pid1 = left->thread_pid;
	struct pid *pid2 = right->thread_pid;
	struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
	struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];

	lockdep_assert_held_write(&tasklist_lock);

	/* Swap the single entry tid lists */
	hlists_swap_heads_rcu(head1, head2);

	/* Swap the per task_struct pid */
	rcu_assign_pointer(left->thread_pid, pid2);
	rcu_assign_pointer(right->thread_pid, pid1);

	/* Swap the cached value */
	WRITE_ONCE(left->pid, pid_nr(pid2));
	WRITE_ONCE(right->pid, pid_nr(pid1));
}

/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
void transfer_pid(struct task_struct *old, struct task_struct *new,
		  enum pid_type type)
{
	WARN_ON_ONCE(type == PIDTYPE_PID);
	lockdep_assert_held_write(&tasklist_lock);
	hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}

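/*
 * Return the first task using @pid for the given @type, or NULL. Callers
 * must hold rcu_read_lock() or the tasklist_lock.
 */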
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
	struct task_struct *result = NULL;
	if (pid) {
		struct hlist_node *first;
		first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
					      lockdep_tasklist_lock_is_held());
		if (first)
			result = hlist_entry(first, struct task_struct, pid_links[(type)]);
	}
	return result;
}
EXPORT_SYMBOL(pid_task);

/*
 * Must be called under rcu_read_lock().
 */
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
			 "find_task_by_pid_ns() needs rcu_read_lock() protection");
	return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}

struct task_struct *find_task_by_vpid(pid_t vnr)
{
	return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}

struct task_struct *find_get_task_by_vpid(pid_t nr)
{
	struct task_struct *task;

	rcu_read_lock();
	task = find_task_by_vpid(nr);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	return task;
}

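/* Grab a reference to the pid @task uses for the given @type. */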
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
	struct pid *pid;
	rcu_read_lock();
	pid = get_pid(rcu_dereference(*task_pid_ptr(task, type)));
	rcu_read_unlock();
	return pid;
}
EXPORT_SYMBOL_GPL(get_task_pid);

struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
	struct task_struct *result;
	rcu_read_lock();
	result = pid_task(pid, type);
	if (result)
		get_task_struct(result);
	rcu_read_unlock();
	return result;
}
EXPORT_SYMBOL_GPL(get_pid_task);

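/*
 * Look up a pid number in the caller's namespace and take a reference on
 * the struct pid found, if any.
 */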
struct pid *find_get_pid(pid_t nr)
{
	struct pid *pid;

	rcu_read_lock();
	pid = get_pid(find_vpid(nr));
	rcu_read_unlock();

	return pid;
}
EXPORT_SYMBOL_GPL(find_get_pid);

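/*
 * Return the number of @pid as seen from @ns, or 0 if @pid is not visible
 * in that namespace.
 */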
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
	struct upid *upid;
	pid_t nr = 0;

	if (pid && ns->level <= pid->level) {
		upid = &pid->numbers[ns->level];
		if (upid->ns == ns)
			nr = upid->nr;
	}
	return nr;
}
EXPORT_SYMBOL_GPL(pid_nr_ns);

pid_t pid_vnr(struct pid *pid)
{
	return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);

pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
		       struct pid_namespace *ns)
{
	pid_t nr = 0;

	rcu_read_lock();
	if (!ns)
		ns = task_active_pid_ns(current);
	nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
	rcu_read_unlock();

	return nr;
}
EXPORT_SYMBOL(__task_pid_nr_ns);

struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
	return ns_of_pid(task_pid(tsk));
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);

/*
 * Used by proc to find the first pid that is greater than or equal to nr.
 *
 * If there is a pid at nr this function is exactly the same as find_pid_ns.
 */
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
	return idr_get_next(&ns->idr, &nr);
}
EXPORT_SYMBOL_GPL(find_ge_pid);

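/*
 * Resolve a pidfd to the struct pid it refers to. On success a reference
 * is taken and the pidfd's f_flags are reported through @flags.
 */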
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
	CLASS(fd, f)(fd);
	struct pid *pid;

	if (fd_empty(f))
		return ERR_PTR(-EBADF);

	pid = pidfd_pid(fd_file(f));
	if (!IS_ERR(pid)) {
		get_pid(pid);
		*flags = fd_file(f)->f_flags;
	}
	return pid;
}

/**
 * pidfd_get_task() - Get the task associated with a pidfd
 *
 * @pidfd: pidfd for which to get the task
 * @flags: flags associated with this pidfd
 *
 * Return the task associated with @pidfd. The function takes a reference on
 * the returned task. The caller is responsible for releasing that reference.
 *
 * Return: On success, the task_struct associated with the pidfd.
 *         On error, a negative errno number will be returned.
 */
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
{
	unsigned int f_flags = 0;
	struct pid *pid;
	struct task_struct *task;
	enum pid_type type;

	switch (pidfd) {
	case PIDFD_SELF_THREAD:
		type = PIDTYPE_PID;
		pid = get_task_pid(current, type);
		break;
	case PIDFD_SELF_THREAD_GROUP:
		type = PIDTYPE_TGID;
		pid = get_task_pid(current, type);
		break;
	default:
		pid = pidfd_get_pid(pidfd, &f_flags);
		if (IS_ERR(pid))
			return ERR_CAST(pid);
		type = PIDTYPE_TGID;
		break;
	}

	task = get_pid_task(pid, type);
	put_pid(pid);
	if (!task)
		return ERR_PTR(-ESRCH);

	*flags = f_flags;
	return task;
}

/**
 * pidfd_create() - Create a new pid file descriptor.
 *
 * @pid:   struct pid that the pidfd will reference
 * @flags: flags to pass
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
 *
 * Note that this function can only be called after the fd table has
 * been unshared to avoid leaking the pidfd to the new process.
 *
 * This symbol should not be explicitly exported to loadable modules.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
static int pidfd_create(struct pid *pid, unsigned int flags)
{
	int pidfd;
	struct file *pidfd_file;

	pidfd = pidfd_prepare(pid, flags, &pidfd_file);
	if (pidfd < 0)
		return pidfd;

	fd_install(pidfd, pidfd_file);
	return pidfd;
}

/**
 * sys_pidfd_open() - Open new pid file descriptor.
 *
 * @pid:   pid for which to retrieve a pidfd
 * @flags: flags to pass
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set for
 * the task identified by @pid. Without the PIDFD_THREAD flag the target
 * task must be a thread-group leader.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
	int fd;
	struct pid *p;

	if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
		return -EINVAL;

	if (pid <= 0)
		return -EINVAL;

	p = find_get_pid(pid);
	if (!p)
		return -ESRCH;

	fd = pidfd_create(p, flags);

	put_pid(p);
	return fd;
}

#ifdef CONFIG_SYSCTL
static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
{
	return &task_active_pid_ns(current)->set;
}

static int set_is_seen(struct ctl_table_set *set)
{
	return &task_active_pid_ns(current)->set == set;
}

static int pid_table_root_permissions(struct ctl_table_header *head,
				      const struct ctl_table *table)
{
	struct pid_namespace *pidns =
		container_of(head->set, struct pid_namespace, set);
	int mode = table->mode;

	if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
	    uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
		mode = (mode & S_IRWXU) >> 6;
	else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
		mode = (mode & S_IRWXG) >> 3;
	else
		mode = mode & S_IROTH;
	return (mode << 6) | (mode << 3) | mode;
}

static void pid_table_root_set_ownership(struct ctl_table_header *head,
					 kuid_t *uid, kgid_t *gid)
{
	struct pid_namespace *pidns =
		container_of(head->set, struct pid_namespace, set);
	kuid_t ns_root_uid;
	kgid_t ns_root_gid;

	ns_root_uid = make_kuid(pidns->user_ns, 0);
	if (uid_valid(ns_root_uid))
		*uid = ns_root_uid;

	ns_root_gid = make_kgid(pidns->user_ns, 0);
	if (gid_valid(ns_root_gid))
		*gid = ns_root_gid;
}

static struct ctl_table_root pid_table_root = {
	.lookup = pid_table_root_lookup,
	.permissions = pid_table_root_permissions,
	.set_ownership = pid_table_root_set_ownership,
};

static const struct ctl_table pid_table[] = {
	{
		.procname = "pid_max",
		.data = &init_pid_ns.pid_max,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = &pid_max_min,
		.extra2 = &pid_max_max,
	},
};
#endif

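/*
 * Register the per-namespace "kernel" sysctl table (currently just pid_max)
 * for @pidns; a no-op when CONFIG_SYSCTL is disabled.
 */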
int register_pidns_sysctls(struct pid_namespace *pidns)
{
#ifdef CONFIG_SYSCTL
	struct ctl_table *tbl;

	setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);

	tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
	if (!tbl)
		return -ENOMEM;
	tbl->data = &pidns->pid_max;
	pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
			     PIDS_PER_CPU_DEFAULT * num_possible_cpus()));

	pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
						 ARRAY_SIZE(pid_table));
	if (!pidns->sysctls) {
		kfree(tbl);
		retire_sysctl_set(&pidns->set);
		return -ENOMEM;
	}
#endif
	return 0;
}

void unregister_pidns_sysctls(struct pid_namespace *pidns)
{
#ifdef CONFIG_SYSCTL
	const struct ctl_table *tbl;

	tbl = pidns->sysctls->ctl_table_arg;
	unregister_sysctl_table(pidns->sysctls);
	retire_sysctl_set(&pidns->set);
	kfree(tbl);
#endif
}

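/*
 * Boot-time setup: scale the default and minimum pid_max with the number of
 * possible CPUs, initialize the init namespace's IDR and create the kmem
 * cache used for single-level struct pids.
 */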
void __init pid_idr_init(void)
{
	/* Verify no one has done anything silly: */
	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);

	/* bump default and minimum pid_max based on number of cpus */
	init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
				  PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
	pid_max_min = max_t(int, pid_max_min,
			    PIDS_PER_CPU_MIN * num_possible_cpus());
	pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);

	idr_init(&init_pid_ns.idr);

	init_pid_ns.pid_cachep = kmem_cache_create("pid",
			struct_size_t(struct pid, numbers, 1),
			__alignof__(struct pid),
			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
			NULL);
}

static __init int pid_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
	/* "kernel" directory will have already been initialized. */
	BUG_ON(register_pidns_sysctls(&init_pid_ns));
#endif
	return 0;
}
subsys_initcall(pid_namespace_sysctl_init);

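/*
 * Fetch file @fd from @task's file table, provided the caller may ptrace
 * @task. exec_update_lock is taken so the permission check cannot race
 * with an exec of the target.
 */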
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
	struct file *file;
	int ret;

	ret = down_read_killable(&task->signal->exec_update_lock);
	if (ret)
		return ERR_PTR(ret);

	if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
		file = fget_task(task, fd);
	else
		file = ERR_PTR(-EPERM);

	up_read(&task->signal->exec_update_lock);

	if (!file) {
		/*
		 * It is possible that the target thread is exiting; it can be
		 * either:
		 * 1. before exit_signals(), which gives a real fd
		 * 2. before exit_files() takes the task_lock(), which also
		 *    gives a real fd
		 * 3. after exit_files() releases task_lock(), ->files is NULL;
		 *    the task has PF_EXITING set (done in exit_signals()) and
		 *    __pidfd_fget() returns EBADF.
		 * In case 3 we get EBADF, but that really means ESRCH, since
		 * the task is currently exiting and has freed its files
		 * struct, so we fix it up.
		 */
		if (task->flags & PF_EXITING)
			file = ERR_PTR(-ESRCH);
		else
			file = ERR_PTR(-EBADF);
	}

	return file;
}

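/*
 * Core of sys_pidfd_getfd(): duplicate @fd from the task identified by
 * @pid into the caller's file table with O_CLOEXEC set.
 */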
static int pidfd_getfd(struct pid *pid, int fd)
{
	struct task_struct *task;
	struct file *file;
	int ret;

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task)
		return -ESRCH;

	file = __pidfd_fget(task, fd);
	put_task_struct(task);
	if (IS_ERR(file))
		return PTR_ERR(file);

	ret = receive_fd(file, NULL, O_CLOEXEC);
	fput(file);

	return ret;
}

/**
 * sys_pidfd_getfd() - Get a file descriptor from another process
 *
 * @pidfd: the pidfd file descriptor of the process
 * @fd:    the file descriptor number to get
 * @flags: flags on how to get the fd (reserved)
 *
 * This syscall gets a copy of a file descriptor from another process
 * based on the pidfd, and file descriptor number. It requires that
 * the calling process has the ability to ptrace the process represented
 * by the pidfd. The process which is having its file descriptor copied
 * is otherwise unaffected.
 *
 * Return: On success, a cloexec file descriptor is returned.
 *         On error, a negative errno number will be returned.
 */
SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
		unsigned int, flags)
{
	struct pid *pid;

	/* flags is currently unused - make sure it's unset */
	if (flags)
		return -EINVAL;

	CLASS(fd, f)(pidfd);
	if (fd_empty(f))
		return -EBADF;

	pid = pidfd_pid(fd_file(f));
	if (IS_ERR(pid))
		return PTR_ERR(pid);

	return pidfd_getfd(pid, fd);
}