/* Copyright (C) 2002-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <dl-sysdep.h>
#include <dl-tls.h>
#include <tls.h>
#include <list.h>
#include <lowlevellock.h>
#include <futex-internal.h>
#include <kernel-features.h>
#include <nptl-stack.h>
#include <libc-lock.h>

/* Default alignment of stack.  */
#ifndef STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Default value for minimal stack size after allocating thread
   descriptor and guard.  */
#ifndef MINIMAL_REST_STACK
# define MINIMAL_REST_STACK 4096
#endif


/* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   a stack.  Use it when possible.  */
#ifndef MAP_STACK
# define MAP_STACK 0
#endif

/* Get a stack from the cache.  We have to match by size since
   some blocks might be too small or far too large.  */
static struct pthread *
get_cached_stack (size_t *sizep, void **memp)
{
  size_t size = *sizep;
  struct pthread *result = NULL;
  list_t *entry;

  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);

  /* Search the cache for a matching entry.  We search for the
     smallest stack which has at least the required size.  Note that
     in normal situations the size of all allocated stacks is the
     same.  At the very least there are only a few different sizes.
     Therefore this loop will exit early most of the time with an
     exact match.  */
  list_for_each (entry, &GL (dl_stack_cache))
    {
      struct pthread *curr;

      curr = list_entry (entry, struct pthread, list);
      if (__nptl_stack_in_use (curr) && curr->stackblock_size >= size)
        {
          if (curr->stackblock_size == size)
            {
              result = curr;
              break;
            }

          if (result == NULL
              || result->stackblock_size > curr->stackblock_size)
            result = curr;
        }
    }

  if (__builtin_expect (result == NULL, 0)
      /* Make sure the size difference is not excessive.  In that
         case we do not use the block.  */
      || __builtin_expect (result->stackblock_size > 4 * size, 0))
    {
      /* Release the lock.  */
      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);

      return NULL;
    }

  /* Don't allow setxid until cloned.  */
  result->setxid_futex = -1;

  /* Dequeue the entry.  */
  __nptl_stack_list_del (&result->list);

  /* And add to the list of stacks in use.  */
  __nptl_stack_list_add (&result->list, &GL (dl_stack_used));

  /* And decrease the cache size.  */
  GL (dl_stack_cache_actsize) -= result->stackblock_size;

  /* Release the lock early.  */
  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);

  /* Report size and location of the stack to the caller.  */
  *sizep = result->stackblock_size;
  *memp = result->stackblock;

  /* Cancellation handling is back to the default.  */
  result->cancelhandling = 0;
  result->cleanup = NULL;
  result->setup_failed = 0;

  /* No pending event.  */
  result->nextevent = NULL;

  result->exiting = false;
  __libc_lock_init (result->exit_lock);
  result->tls_state = (struct tls_internal_t) { 0 };

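  /* The loop below relies on the DTV layout as used here: dtv[-1].counter
     holds the number of per-module slots, dtv[0] is the generation counter,
     and the module entries start at dtv[1]; each entry's pointer.to_free
     records the (possibly unaligned) allocation that must be handed back to
     free, or NULL if nothing was dynamically allocated for that slot.  */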
  /* Clear the DTV.  */
  dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
  for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
    free (dtv[1 + cnt].pointer.to_free);
  memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));

  /* Re-initialize the TLS.  */
  _dl_allocate_tls_init (TLS_TPADJ (result), true);

  return result;
}

/* Return the guard page position on the allocated stack.  */
static inline char *
__attribute ((always_inline))
guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
                size_t pagesize_m1)
{
#ifdef NEED_SEPARATE_REGISTER_STACK
  return mem + (((size - guardsize) / 2) & ~pagesize_m1);
#elif _STACK_GROWS_DOWN
  return mem;
#elif _STACK_GROWS_UP
  return (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
#endif
}
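
/* Summary of the three cases above: with NEED_SEPARATE_REGISTER_STACK
   (e.g. IA-64) the guard sits roughly in the middle of the block so that it
   separates the conventional stack from the register backing store, the two
   growing toward each other from opposite ends; with _STACK_GROWS_DOWN the
   guard is at the very start (lowest address) of the allocation; with
   _STACK_GROWS_UP it is the page-aligned range just below the thread
   descriptor at the high end of the block.  */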

/* For a stack allocated with PROT_NONE, set up the required portions with
   'prot' flags based on the guard page position.  */
static inline int
setup_stack_prot (char *mem, size_t size, char *guard, size_t guardsize,
                  const int prot)
{
  char *guardend = guard + guardsize;
#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
  /* As defined at guard_position, for architectures with a downward-growing
     stack the guard page is always at the start of the allocated area.  */
  if (__mprotect (guardend, size - guardsize, prot) != 0)
    return errno;
#else
  size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
  if (__mprotect (mem, mprots1, prot) != 0)
    return errno;
  size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
  if (__mprotect (guardend, mprots2, prot) != 0)
    return errno;
#endif
  return 0;
}

/* Mark the memory of the stack as reusable by the kernel.  It frees
   everything except for the space used for the TCB itself.  */
static __always_inline void
advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
{
  uintptr_t sp = (uintptr_t) CURRENT_STACK_FRAME;
  size_t pagesize_m1 = __getpagesize () - 1;
#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
  size_t freesize = (sp - (uintptr_t) mem) & ~pagesize_m1;
  assert (freesize < size);
  if (freesize > PTHREAD_STACK_MIN)
    __madvise (mem, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
#else
  /* Page aligned start of memory to free (higher than or equal
     to current sp plus the minimum stack size).  */
  uintptr_t freeblock = (sp + PTHREAD_STACK_MIN + pagesize_m1) & ~pagesize_m1;
  uintptr_t free_end = (pd - guardsize) & ~pagesize_m1;
  if (free_end > freeblock)
    {
      size_t freesize = free_end - freeblock;
      assert (freesize < size);
      __madvise ((void*) freeblock, freesize, MADV_DONTNEED);
    }
#endif
}
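
/* MADV_DONTNEED on this private anonymous mapping lets the kernel reclaim
   the advised pages immediately; if they are touched again later they come
   back as fresh zero-filled pages.  Note that the code above leaves at least
   PTHREAD_STACK_MIN below the current stack pointer (and everything above
   it, including the TCB) untouched, so the caller still has a usable stack
   afterwards.  */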

/* Returns a usable stack for a new thread either by allocating a
   new stack or reusing a cached stack of sufficient size.
   ATTR must be non-NULL and point to a valid pthread_attr.
   PDP must be non-NULL.  */
static int
allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
                void **stack, size_t *stacksize)
{
  struct pthread *pd;
  size_t size;
  size_t pagesize_m1 = __getpagesize () - 1;
  size_t tls_static_size_for_stack = __nptl_tls_static_size_for_stack ();
  size_t tls_static_align_m1 = GLRO (dl_tls_static_align) - 1;

  assert (powerof2 (pagesize_m1 + 1));
  assert (TCB_ALIGNMENT >= STACK_ALIGN);

  /* Get the stack size from the attribute if it is set.  Otherwise we
     use the default we determined at start time.  */
  if (attr->stacksize != 0)
    size = attr->stacksize;
  else
    {
      lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
      size = __default_pthread_attr.internal.stacksize;
      lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
    }

  /* Get memory for the stack.  */
  if (__glibc_unlikely (attr->flags & ATTR_FLAG_STACKADDR))
    {
      uintptr_t adj;
      char *stackaddr = (char *) attr->stackaddr;

      /* Assume the same layout as the _STACK_GROWS_DOWN case, with struct
         pthread at the top of the stack block.  Later we adjust the guard
         location and stack address to match the _STACK_GROWS_UP case.  */
      if (_STACK_GROWS_UP)
        stackaddr += attr->stacksize;

      /* If the user also specified the size of the stack make sure it
         is large enough.  */
      if (attr->stacksize != 0
          && attr->stacksize < (tls_static_size_for_stack
                                + MINIMAL_REST_STACK))
        return EINVAL;

      /* Adjust stack size for alignment of the TLS block.  */
#if TLS_TCB_AT_TP
      adj = ((uintptr_t) stackaddr - TLS_TCB_SIZE)
            & tls_static_align_m1;
      assert (size > adj + TLS_TCB_SIZE);
#elif TLS_DTV_AT_TP
      adj = ((uintptr_t) stackaddr - tls_static_size_for_stack)
            & tls_static_align_m1;
      assert (size > adj);
#endif
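
      /* Background on the two TLS models used above: with TLS_TCB_AT_TP
         (e.g. x86_64) the TCB sits right at the thread pointer and the
         static TLS block is placed directly below it, so struct pthread ends
         up at the top of the stack block.  With TLS_DTV_AT_TP (e.g. AArch64
         or PowerPC) the static TLS block lives above the thread pointer and
         struct pthread is placed TLS_PRE_TCB_SIZE bytes before the TCB.
         ADJ is whatever is needed to give the TLS block its required static
         alignment inside the user-supplied memory.  */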

      /* The user provided some memory.  Let's hope it matches the
         size...  We do not allocate guard pages if the user provided
         the stack.  It is the user's responsibility to do this if it
         is wanted.  */
#if TLS_TCB_AT_TP
      pd = (struct pthread *) ((uintptr_t) stackaddr
                               - TLS_TCB_SIZE - adj);
#elif TLS_DTV_AT_TP
      pd = (struct pthread *) (((uintptr_t) stackaddr
                                - tls_static_size_for_stack - adj)
                               - TLS_PRE_TCB_SIZE);
#endif

      /* The user provided stack memory needs to be cleared.  */
      memset (pd, '\0', sizeof (struct pthread));

      /* The first TSD block is included in the TCB.  */
      pd->specific[0] = pd->specific_1stblock;

      /* Remember the stack-related values.  */
      pd->stackblock = (char *) stackaddr - size;
      pd->stackblock_size = size;

      /* This is a user-provided stack.  It will not be queued in the
         stack cache nor will the memory (except the TLS memory) be freed.  */
      pd->user_stack = true;

      /* This is at least the second thread.  */
      pd->header.multiple_threads = 1;
#ifndef TLS_MULTIPLE_THREADS_IN_TCB
      __libc_multiple_threads = 1;
#endif

#ifdef NEED_DL_SYSINFO
      SETUP_THREAD_SYSINFO (pd);
#endif

      /* Don't allow setxid until cloned.  */
      pd->setxid_futex = -1;

      /* Allocate the DTV for this thread.  */
      if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
        {
          /* Something went wrong.  */
          assert (errno == ENOMEM);
          return errno;
        }


      /* Prepare to modify global data.  */
      lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);

      /* And add to the list of stacks in use.  */
      list_add (&pd->list, &GL (dl_stack_user));

      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
    }
  else
    {
      /* Allocate some anonymous memory.  If possible use the cache.  */
      size_t guardsize;
      size_t reported_guardsize;
      size_t reqsize;
      void *mem;
      const int prot = (PROT_READ | PROT_WRITE
                        | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));

      /* Adjust the stack size for alignment.  */
      size &= ~tls_static_align_m1;
      assert (size != 0);

      /* Make sure the size of the stack is enough for the guard and
         possibly the thread descriptor.  On some targets there is
         a minimum guard size requirement, ARCH_MIN_GUARD_SIZE, so
         internally enforce it (unless the guard was disabled), but
         report the original guard size for backward compatibility:
         before POSIX 2008 the guardsize was specified to be one page
         by default which is observable via pthread_attr_getguardsize
         and pthread_getattr_np.  */
      guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
      reported_guardsize = guardsize;
      if (guardsize > 0 && guardsize < ARCH_MIN_GUARD_SIZE)
        guardsize = ARCH_MIN_GUARD_SIZE;
      if (guardsize < attr->guardsize || size + guardsize < guardsize)
        /* Arithmetic overflow.  */
        return EINVAL;
      size += guardsize;
      if (__builtin_expect (size < ((guardsize + tls_static_size_for_stack
                                     + MINIMAL_REST_STACK + pagesize_m1)
                                    & ~pagesize_m1),
                            0))
        /* The stack is too small (or the guard too large).  */
        return EINVAL;
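
      /* Worked example of the guard handling above (numbers are only
         illustrative): with 4 KiB pages a requested guardsize of 1 byte is
         rounded up to 4096; if the target happened to define
         ARCH_MIN_GUARD_SIZE as 64 KiB, the guard actually mapped would grow
         to 65536 bytes while pthread_attr_getguardsize still reports 4096.
         The guard is added on top of the requested stack size, so it does
         not eat into the usable stack.  */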

      /* Try to get a stack from the cache.  */
      reqsize = size;
      pd = get_cached_stack (&size, &mem);
      if (pd == NULL)
        {
          /* If a guard page is required, avoid committing memory by first
             allocating with PROT_NONE and then setting the required
             permissions on everything except the guard area.  */
          mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);

          if (__glibc_unlikely (mem == MAP_FAILED))
            return errno;

          /* SIZE is guaranteed to be greater than zero.
             So we can never get a null pointer back from mmap.  */
          assert (mem != NULL);

          /* Place the thread descriptor at the end of the stack.  */
#if TLS_TCB_AT_TP
          pd = (struct pthread *) ((((uintptr_t) mem + size)
                                    - TLS_TCB_SIZE)
                                   & ~tls_static_align_m1);
#elif TLS_DTV_AT_TP
          pd = (struct pthread *) ((((uintptr_t) mem + size
                                     - tls_static_size_for_stack)
                                    & ~tls_static_align_m1)
                                   - TLS_PRE_TCB_SIZE);
#endif
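
          /* A rough sketch of the resulting block for the common
             _STACK_GROWS_DOWN case (TLS_TCB_AT_TP shown; for TLS_DTV_AT_TP
             the descriptor sits below the static TLS instead):

               mem                                            mem + size
                +-------+--------------------------+------------+-----+
                | guard | usable stack, grows down | static TLS | TCB |
                +-------+--------------------------+------------+-----+
                                                                 ^ pd  */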

          /* Now mprotect the required region excluding the guard area.  */
          if (__glibc_likely (guardsize > 0))
            {
              char *guard = guard_position (mem, size, guardsize, pd,
                                            pagesize_m1);
              if (setup_stack_prot (mem, size, guard, guardsize, prot) != 0)
                {
                  __munmap (mem, size);
                  return errno;
                }
            }

          /* Remember the stack-related values.  */
          pd->stackblock = mem;
          pd->stackblock_size = size;
          /* Record the guard size of the newly allocated stack to avoid
             an extra mprotect in the guard resize code below.  */
          pd->guardsize = guardsize;

          /* We allocated the first block of the thread-specific data
             array.  This address will not change for the lifetime of
             this descriptor.  */
          pd->specific[0] = pd->specific_1stblock;

          /* This is at least the second thread.  */
          pd->header.multiple_threads = 1;
#ifndef TLS_MULTIPLE_THREADS_IN_TCB
          __libc_multiple_threads = 1;
#endif

#ifdef NEED_DL_SYSINFO
          SETUP_THREAD_SYSINFO (pd);
#endif

          /* Don't allow setxid until cloned.  */
          pd->setxid_futex = -1;

          /* Allocate the DTV for this thread.  */
          if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
            {
              /* Something went wrong.  */
              assert (errno == ENOMEM);

              /* Free the stack memory we just allocated.  */
              (void) __munmap (mem, size);

              return errno;
            }


          /* Prepare to modify global data.  */
          lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);

          /* And add to the list of stacks in use.  */
          __nptl_stack_list_add (&pd->list, &GL (dl_stack_used));

          lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);


          /* There might have been a race.  Another thread might have
             caused the stacks to get exec permission while this new
             stack was prepared.  Detect if this was possible and
             change the permission if necessary.  */
          if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
                                && (prot & PROT_EXEC) == 0, 0))
            {
              int err = __nptl_change_stack_perm (pd);
              if (err != 0)
                {
                  /* Free the stack memory we just allocated.  */
                  (void) __munmap (mem, size);

                  return err;
                }
            }


          /* Note that all of the stack and the thread descriptor are
             zeroed.  This means we do not have to initialize fields
             with initial value zero.  This is specifically true for
             the 'tid' field which is always set back to zero once the
             stack is not used anymore and for the 'guardsize' field
             which will be read next.  */
        }

      /* Create or resize the guard area if necessary.  */
      if (__glibc_unlikely (guardsize > pd->guardsize))
        {
          char *guard = guard_position (mem, size, guardsize, pd,
                                        pagesize_m1);
          if (__mprotect (guard, guardsize, PROT_NONE) != 0)
            {
            mprot_error:
              lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);

              /* Remove the thread from the list.  */
              __nptl_stack_list_del (&pd->list);

              lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);

              /* Get rid of the TLS block we allocated.  */
              _dl_deallocate_tls (TLS_TPADJ (pd), false);

              /* Free the stack memory regardless of whether the size
                 of the cache is over the limit or not.  If this piece
                 of memory caused problems we had better not use it
                 anymore.  Also, we ignore possible errors here; there
                 is nothing we could do about them.  */
              (void) __munmap (mem, size);

              return errno;
            }

          pd->guardsize = guardsize;
        }
      else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
                                 0))
        {
          /* The old guard area is too large.  */

#ifdef NEED_SEPARATE_REGISTER_STACK
          char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
          char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);

          if (oldguard < guard
              && __mprotect (oldguard, guard - oldguard, prot) != 0)
            goto mprot_error;

          if (__mprotect (guard + guardsize,
                          oldguard + pd->guardsize - guard - guardsize,
                          prot) != 0)
            goto mprot_error;
#elif _STACK_GROWS_DOWN
          if (__mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
                          prot) != 0)
            goto mprot_error;
#elif _STACK_GROWS_UP
          char *new_guard = (char *) (((uintptr_t) pd - guardsize)
                                      & ~pagesize_m1);
          char *old_guard = (char *) (((uintptr_t) pd - pd->guardsize)
                                      & ~pagesize_m1);
          /* The guard size difference might be > 0, but once rounded
             to the nearest page the size difference might be zero.  */
          if (new_guard > old_guard
              && __mprotect (old_guard, new_guard - old_guard, prot) != 0)
            goto mprot_error;
#endif

          pd->guardsize = guardsize;
        }
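
      /* Note the asymmetry above: enlarging the guard (which can happen
         when a cached stack with a smaller guard is reused) only needs the
         new range remapped PROT_NONE, while shrinking it has to restore the
         normal stack protection on the pages that are no longer part of the
         guard, handled separately for each stack-layout configuration.  */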
      /* The pthread_getattr_np() calls need to report the size
         requested in the attribute, regardless of how large the
         actually used guard is.  */
      pd->reported_guardsize = reported_guardsize;
    }

  /* Initialize the lock.  We have to do this unconditionally since the
     stillborn thread could be canceled while the lock is taken.  */
  pd->lock = LLL_LOCK_INITIALIZER;

  /* The robust mutex lists also need to be initialized
     unconditionally because the cleanup for the previous stack owner
     might have happened in the kernel.  */
  pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
                                  - offsetof (pthread_mutex_t,
                                              __data.__list.__next));
  pd->robust_head.list_op_pending = NULL;
#if __PTHREAD_MUTEX_HAVE_PREV
  pd->robust_prev = &pd->robust_head;
#endif
  pd->robust_head.list = &pd->robust_head;

  /* We place the thread descriptor at the end of the stack.  */
  *pdp = pd;

  void *stacktop;

#if TLS_TCB_AT_TP
  /* The stack begins before the TCB and the static TLS block.  */
  stacktop = ((char *) (pd + 1) - tls_static_size_for_stack);
#elif TLS_DTV_AT_TP
  stacktop = (char *) (pd - 1);
#endif

  *stacksize = stacktop - pd->stackblock;
  *stack = pd->stackblock;

  return 0;
}
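
/* allocate_stack is static: this file (nptl/allocatestack.c) is included
   textually by nptl/pthread_create.c, its only caller.  A rough, hedged
   sketch of how the caller uses it (not verbatim glibc code; the local
   variable names are illustrative):

     struct pthread *pd;
     void *stackaddr;
     size_t stacksize;
     int err = allocate_stack (iattr, &pd, &stackaddr, &stacksize);
     if (err != 0)
       return err;
     ... finish initializing PD and pass STACKADDR/STACKSIZE to the
     clone call that starts the new thread ...

   On success *PDP is the thread descriptor placed inside the stack block,
   and *STACK/*STACKSIZE describe the usable stack area below the thread
   descriptor and the static TLS area.  */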