// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include <linux/array_size.h>
#include <linux/minmax.h>
#include <vdso/datapage.h>
#include <vdso/getrandom.h>
#include <vdso/unaligned.h>
#include <asm/vdso/getrandom.h>
#include <uapi/linux/mman.h>
#include <uapi/linux/random.h>

/* Bring in default accessors */
#include <vdso/vsyscall.h>

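/*
 * Define PAGE_SIZE and PAGE_MASK directly from CONFIG_PAGE_SHIFT, so that this vDSO code uses
 * compile-time constants independent of whatever an earlier header may have provided.
 */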
#undef PAGE_SIZE
#undef PAGE_MASK
#define PAGE_SIZE (1UL << CONFIG_PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE - 1))

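/*
 * Copy @len bytes from @src to @dst in @type-sized chunks, zeroing each chunk of @src as it is
 * consumed and advancing @dst, @src, and @len accordingly. Both pointers may be unaligned.
 */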
#define MEMCPY_AND_ZERO_SRC(type, dst, src, len) do {				\
	while (len >= sizeof(type)) {						\
		__put_unaligned_t(type, __get_unaligned_t(type, src), dst);	\
		__put_unaligned_t(type, 0, src);				\
		dst += sizeof(type);						\
		src += sizeof(type);						\
		len -= sizeof(type);						\
	}									\
} while (0)

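/* Copy @len bytes from @src to @dst while erasing @src, using the widest efficient access size. */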
static void memcpy_and_zero_src(void *dst, void *src, size_t len)
{
	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
		if (IS_ENABLED(CONFIG_64BIT))
			MEMCPY_AND_ZERO_SRC(u64, dst, src, len);
		MEMCPY_AND_ZERO_SRC(u32, dst, src, len);
		MEMCPY_AND_ZERO_SRC(u16, dst, src, len);
	}
	MEMCPY_AND_ZERO_SRC(u8, dst, src, len);
}

/**
 * __cvdso_getrandom_data - Generic vDSO implementation of getrandom() syscall.
 * @rng_info: Describes state of kernel RNG, memory shared with kernel.
 * @buffer: Destination buffer to fill with random bytes.
 * @len: Size of @buffer in bytes.
 * @flags: Zero or more GRND_* flags.
 * @opaque_state: Pointer to an opaque state area.
 * @opaque_len: Length of opaque state area.
 *
 * This implements a "fast key erasure" RNG using ChaCha20, in the same way that the kernel's
 * getrandom() syscall does. It periodically reseeds its key from the kernel's RNG, at the same
 * schedule that the kernel's RNG is reseeded. If the kernel's RNG is not ready, then this always
 * calls into the syscall.
 *
 * If @buffer, @len, and @flags are 0, and @opaque_len is ~0UL, then @opaque_state is populated
 * with a struct vgetrandom_opaque_params and the function returns 0; if it does not return 0,
 * this function should not be used.
 *
 * @opaque_state *must* be allocated by calling mmap(2) using the mmap_prot and mmap_flags fields
 * from the struct vgetrandom_opaque_params, and states must not straddle pages. Unless external
 * locking is used, one state must be allocated per thread, as it is not safe to call this function
 * concurrently with the same @opaque_state. However, it is safe to call this using the same
 * @opaque_state that is shared between main code and signal handling code, within the same thread.
 *
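 * As a rough illustration only (the exported symbol name is architecture-specific, e.g.
 * __vdso_getrandom on x86_64), a caller might first query the allocation parameters and then map
 * a state with them:
 *
 *	struct vgetrandom_opaque_params params;
 *	void *state;
 *
 *	__vdso_getrandom(NULL, 0, 0, &params, ~0UL);
 *	state = mmap(NULL, params.size_of_opaque_state, params.mmap_prot,
 *		     params.mmap_flags, -1, 0);
 *	__vdso_getrandom(buf, len, 0, state, params.size_of_opaque_state);
 *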
 * Returns: The number of random bytes written to @buffer, or a negative value indicating an error.
 */
static __always_inline ssize_t
__cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_t len,
		       unsigned int flags, void *opaque_state, size_t opaque_len)
{
	ssize_t ret = min_t(size_t, INT_MAX & PAGE_MASK /* = MAX_RW_COUNT */, len);
	struct vgetrandom_state *state = opaque_state;
	size_t batch_len, nblocks, orig_len = len;
	bool in_use, have_retried = false;
	void *orig_buffer = buffer;
	u64 current_generation;
	u32 counter[2] = { 0 };

	if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) {
		struct vgetrandom_opaque_params *params = opaque_state;
		params->size_of_opaque_state = sizeof(*state);
		params->mmap_prot = PROT_READ | PROT_WRITE;
		params->mmap_flags = MAP_DROPPABLE | MAP_ANONYMOUS;
		for (size_t i = 0; i < ARRAY_SIZE(params->reserved); ++i)
			params->reserved[i] = 0;
		return 0;
	}

	/* The state must not straddle a page, since pages can be zeroed at any time. */
	if (unlikely(((unsigned long)opaque_state & ~PAGE_MASK) + sizeof(*state) > PAGE_SIZE))
		return -EFAULT;

	/* Handle unexpected flags by falling back to the kernel. */
	if (unlikely(flags & ~(GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE)))
		goto fallback_syscall;

	/* If the caller passes the wrong size, which might happen due to CRIU, fall back. */
	if (unlikely(opaque_len != sizeof(*state)))
		goto fallback_syscall;

	/*
	 * If the kernel's RNG is not yet ready, then it's not possible to provide random bytes from
	 * userspace, because A) the various @flags require this to block, or not, depending on
	 * various factors unavailable to userspace, and B) the kernel's behavior before the RNG is
	 * ready is to reseed from the entropy pool at every invocation.
	 */
	if (unlikely(!READ_ONCE(rng_info->is_ready)))
		goto fallback_syscall;

	/*
	 * This condition is checked after @rng_info->is_ready, because before the kernel's RNG is
	 * initialized, the @flags parameter may require this to block or return an error, even when
	 * @len is zero.
	 */
	if (unlikely(!len))
		return 0;

	/*
	 * @state->in_use is basic reentrancy protection against this running in a signal handler
	 * with the same @opaque_state, but obviously not atomic wrt multiple CPUs or more than one
	 * level of reentrancy. If a signal interrupts this after reading @state->in_use, but before
	 * writing @state->in_use, there is still no race, because the signal handler will run to
	 * its completion before returning execution.
	 */
	in_use = READ_ONCE(state->in_use);
	if (unlikely(in_use))
		/* The syscall simply fills the buffer and does not touch @state, so fall back. */
		goto fallback_syscall;
	WRITE_ONCE(state->in_use, true);

retry_generation:
	/*
	 * @rng_info->generation must always be read here, as it serializes @state->key with the
	 * kernel's RNG reseeding schedule.
	 */
	current_generation = READ_ONCE(rng_info->generation);

	/*
	 * If @state->generation doesn't match the kernel RNG's generation, then it means the
	 * kernel's RNG has reseeded, and so @state->key is reseeded as well.
	 */
	if (unlikely(state->generation != current_generation)) {
		/*
		 * Write the generation before filling the key, in case of fork. If there is a fork
		 * just after this line, the parent and child will get different random bytes from
		 * the syscall, which is good. However, were this line to occur after the getrandom
		 * syscall, then both child and parent could have the same bytes and the same
		 * generation counter, so the fork would not be detected. Therefore, write
		 * @state->generation before the call to the getrandom syscall.
		 */
		WRITE_ONCE(state->generation, current_generation);

		/*
		 * Prevent the syscall from being reordered wrt current_generation. Pairs with the
		 * smp_store_release(&vdso_k_rng_data->generation) in random.c.
		 */
		smp_rmb();

		/* Reseed @state->key using fresh bytes from the kernel. */
		if (getrandom_syscall(state->key, sizeof(state->key), 0) != sizeof(state->key)) {
			/*
			 * If the syscall failed to refresh the key, then @state->key is now
			 * invalid, so invalidate the generation so that it is not used again, and
			 * fall back to using the syscall entirely.
			 */
			WRITE_ONCE(state->generation, 0);

			/*
			 * Set @state->in_use to false only after the last write to @state in the
			 * line above.
			 */
			WRITE_ONCE(state->in_use, false);

			goto fallback_syscall;
		}

		/*
		 * Set @state->pos to beyond the end of the batch, so that the batch is refilled
		 * using the new key.
		 */
		state->pos = sizeof(state->batch);
	}

	/* Set len to the total amount of bytes that this function is allowed to read, which is ret. */
	len = ret;
more_batch:
	/*
	 * First use bytes out of @state->batch, which may have been filled by the last call to this
	 * function.
	 */
	batch_len = min_t(size_t, sizeof(state->batch) - state->pos, len);
	if (batch_len) {
		/* Zeroing at the same time as memcpying helps preserve forward secrecy. */
		memcpy_and_zero_src(buffer, state->batch + state->pos, batch_len);
		state->pos += batch_len;
		buffer += batch_len;
		len -= batch_len;
	}

	if (!len) {
		/* Prevent the loop from being reordered wrt ->generation. */
		barrier();

		/*
		 * Since @rng_info->generation will never be 0, re-read @state->generation, rather
		 * than using the local current_generation variable, to learn whether a fork
		 * occurred or if @state was zeroed due to memory pressure. Primarily, though, this
		 * indicates whether the kernel's RNG has reseeded, in which case generate a new key
		 * and start over.
		 */
		if (unlikely(READ_ONCE(state->generation) != READ_ONCE(rng_info->generation))) {
			/*
			 * Prevent this from looping forever in case of low memory or racing with a
			 * user force-reseeding the kernel's RNG using the ioctl.
			 */
			if (have_retried) {
				WRITE_ONCE(state->in_use, false);
				goto fallback_syscall;
			}

			have_retried = true;
			buffer = orig_buffer;
			goto retry_generation;
		}

		/*
		 * Set @state->in_use to false only when there will be no more reads or writes of
		 * @state.
		 */
		WRITE_ONCE(state->in_use, false);
		return ret;
	}

	/* Generate blocks of RNG output directly into @buffer while there's enough room left. */
	nblocks = len / CHACHA_BLOCK_SIZE;
	if (nblocks) {
		__arch_chacha20_blocks_nostack(buffer, state->key, counter, nblocks);
		buffer += nblocks * CHACHA_BLOCK_SIZE;
		len -= nblocks * CHACHA_BLOCK_SIZE;
	}

	BUILD_BUG_ON(sizeof(state->batch_key) % CHACHA_BLOCK_SIZE != 0);

	/* Refill the batch and overwrite the key, in order to preserve forward secrecy. */
	__arch_chacha20_blocks_nostack(state->batch_key, state->key, counter,
				       sizeof(state->batch_key) / CHACHA_BLOCK_SIZE);

	/* Since the batch was just refilled, set the position back to 0 to indicate a full batch. */
	state->pos = 0;
	goto more_batch;

fallback_syscall:
	return getrandom_syscall(orig_buffer, orig_len, flags);
}

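/*
 * Thin wrapper used by the architecture-specific vDSO entry points: it supplies the shared RNG
 * data and forwards the caller's arguments unchanged.
 */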
static __always_inline ssize_t
__cvdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
{
	return __cvdso_getrandom_data(__arch_get_vdso_u_rng_data(), buffer, len, flags,
				      opaque_state, opaque_len);
}