1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | |
3 | #include <linux/stat.h> |
4 | #include <linux/sysctl.h> |
5 | #include <linux/slab.h> |
6 | #include <linux/cred.h> |
7 | #include <linux/hash.h> |
8 | #include <linux/kmemleak.h> |
9 | #include <linux/user_namespace.h> |
10 | |
11 | struct ucounts init_ucounts = { |
12 | .ns = &init_user_ns, |
13 | .uid = GLOBAL_ROOT_UID, |
14 | .count = ATOMIC_INIT(1), |
15 | }; |
16 | |
17 | #define UCOUNTS_HASHTABLE_BITS 10 |
18 | static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; |
19 | static DEFINE_SPINLOCK(ucounts_lock); |
20 | |
21 | #define ucounts_hashfn(ns, uid) \ |
22 | hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \ |
23 | UCOUNTS_HASHTABLE_BITS) |
24 | #define ucounts_hashentry(ns, uid) \ |
25 | (ucounts_hashtable + ucounts_hashfn(ns, uid)) |
26 | |
27 | |
28 | #ifdef CONFIG_SYSCTL |
29 | static struct ctl_table_set * |
30 | set_lookup(struct ctl_table_root *root) |
31 | { |
32 | return ¤t_user_ns()->set; |
33 | } |
34 | |
35 | static int set_is_seen(struct ctl_table_set *set) |
36 | { |
37 | return ¤t_user_ns()->set == set; |
38 | } |
39 | |
40 | static int set_permissions(struct ctl_table_header *head, |
41 | struct ctl_table *table) |
42 | { |
43 | struct user_namespace *user_ns = |
44 | container_of(head->set, struct user_namespace, set); |
45 | int mode; |
46 | |
47 | /* Allow users with CAP_SYS_RESOURCE unrestrained access */ |
48 | if (ns_capable(ns: user_ns, CAP_SYS_RESOURCE)) |
49 | mode = (table->mode & S_IRWXU) >> 6; |
50 | else |
51 | /* Allow all others at most read-only access */ |
52 | mode = table->mode & S_IROTH; |
53 | return (mode << 6) | (mode << 3) | mode; |
54 | } |
55 | |
56 | static struct ctl_table_root set_root = { |
57 | .lookup = set_lookup, |
58 | .permissions = set_permissions, |
59 | }; |
60 | |
/* Lower and upper bounds referenced by every UCOUNT_ENTRY via extra1/extra2. */
static long ue_int_max = INT_MAX;
static long ue_zero = 0;

/*
 * Template for one "user" sysctl entry: a long-sized value, mode 0644,
 * handled by proc_doulongvec_minmax and bounded by ue_zero and ue_int_max.
 * The .data pointer is intentionally left unset here.
 */
#define UCOUNT_ENTRY(name)					\
	{							\
		.procname	= name,				\
		.proc_handler	= proc_doulongvec_minmax,	\
		.mode		= 0644,				\
		.maxlen		= sizeof(long),			\
		.extra1		= &ue_zero,			\
		.extra2		= &ue_int_max,			\
	}
73 | static struct ctl_table user_table[] = { |
74 | UCOUNT_ENTRY("max_user_namespaces" ), |
75 | UCOUNT_ENTRY("max_pid_namespaces" ), |
76 | UCOUNT_ENTRY("max_uts_namespaces" ), |
77 | UCOUNT_ENTRY("max_ipc_namespaces" ), |
78 | UCOUNT_ENTRY("max_net_namespaces" ), |
79 | UCOUNT_ENTRY("max_mnt_namespaces" ), |
80 | UCOUNT_ENTRY("max_cgroup_namespaces" ), |
81 | UCOUNT_ENTRY("max_time_namespaces" ), |
82 | #ifdef CONFIG_INOTIFY_USER |
83 | UCOUNT_ENTRY("max_inotify_instances" ), |
84 | UCOUNT_ENTRY("max_inotify_watches" ), |
85 | #endif |
86 | #ifdef CONFIG_FANOTIFY |
87 | UCOUNT_ENTRY("max_fanotify_groups" ), |
88 | UCOUNT_ENTRY("max_fanotify_marks" ), |
89 | #endif |
90 | { } |
91 | }; |
92 | #endif /* CONFIG_SYSCTL */ |
93 | |
/*
 * setup_userns_sysctls - register the "user" sysctl table for @ns
 * @ns: user namespace whose ucount_max[] limits are to be exposed
 *
 * With CONFIG_SYSCTL, initialises ns->set, duplicates user_table, points
 * entry i of the copy at ns->ucount_max[i] and registers the copy under
 * "user" in ns->set, storing the header in ns->sysctls.
 *
 * Returns true on success, and always when CONFIG_SYSCTL is disabled.
 * On failure the copy is freed, ns->set is retired and false is returned.
 */
bool setup_userns_sysctls(struct user_namespace *ns)
{
#ifdef CONFIG_SYSCTL
	struct ctl_table *tbl;

	/* user_table must hold one entry per counter plus the terminator. */
	BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS + 1);
	setup_sysctl_set(&ns->set, &set_root, set_is_seen);
	tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
	if (tbl) {
		int i;
		for (i = 0; i < UCOUNT_COUNTS; i++) {
			tbl[i].data = &ns->ucount_max[i];
		}
		ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl,
						      ARRAY_SIZE(user_table));
	}
	/*
	 * NOTE(review): when kmemdup() fails ns->sysctls is not written
	 * above, so this check presumably relies on @ns having been zeroed
	 * by its allocator - confirm against the callers.
	 */
	if (!ns->sysctls) {
		kfree(tbl);	/* tbl may be NULL here */
		retire_sysctl_set(&ns->set);
		return false;
	}
#endif
	return true;
}
118 | |
/*
 * retire_userns_sysctls - undo setup_userns_sysctls() for @ns
 * @ns: user namespace whose sysctl table is to be removed
 *
 * With CONFIG_SYSCTL, unregisters ns->sysctls, retires ns->set and frees
 * the table that was registered.  Does nothing otherwise.
 */
void retire_userns_sysctls(struct user_namespace *ns)
{
#ifdef CONFIG_SYSCTL
	struct ctl_table *tbl;

	/* Fetch the table pointer before the header it lives in is unregistered. */
	tbl = ns->sysctls->ctl_table_arg;
	unregister_sysctl_table(ns->sysctls);
	retire_sysctl_set(&ns->set);
	kfree(tbl);
#endif
}
130 | |
131 | static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent) |
132 | { |
133 | struct ucounts *ucounts; |
134 | |
135 | hlist_for_each_entry(ucounts, hashent, node) { |
136 | if (uid_eq(left: ucounts->uid, right: uid) && (ucounts->ns == ns)) |
137 | return ucounts; |
138 | } |
139 | return NULL; |
140 | } |
141 | |
142 | static void hlist_add_ucounts(struct ucounts *ucounts) |
143 | { |
144 | struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); |
145 | spin_lock_irq(lock: &ucounts_lock); |
146 | hlist_add_head(n: &ucounts->node, h: hashent); |
147 | spin_unlock_irq(lock: &ucounts_lock); |
148 | } |
149 | |
150 | static inline bool get_ucounts_or_wrap(struct ucounts *ucounts) |
151 | { |
152 | /* Returns true on a successful get, false if the count wraps. */ |
153 | return !atomic_add_negative(i: 1, v: &ucounts->count); |
154 | } |
155 | |
156 | struct ucounts *get_ucounts(struct ucounts *ucounts) |
157 | { |
158 | if (!get_ucounts_or_wrap(ucounts)) { |
159 | put_ucounts(ucounts); |
160 | ucounts = NULL; |
161 | } |
162 | return ucounts; |
163 | } |
164 | |
165 | struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) |
166 | { |
167 | struct hlist_head *hashent = ucounts_hashentry(ns, uid); |
168 | struct ucounts *ucounts, *new; |
169 | bool wrapped; |
170 | |
171 | spin_lock_irq(lock: &ucounts_lock); |
172 | ucounts = find_ucounts(ns, uid, hashent); |
173 | if (!ucounts) { |
174 | spin_unlock_irq(lock: &ucounts_lock); |
175 | |
176 | new = kzalloc(size: sizeof(*new), GFP_KERNEL); |
177 | if (!new) |
178 | return NULL; |
179 | |
180 | new->ns = ns; |
181 | new->uid = uid; |
182 | atomic_set(v: &new->count, i: 1); |
183 | |
184 | spin_lock_irq(lock: &ucounts_lock); |
185 | ucounts = find_ucounts(ns, uid, hashent); |
186 | if (ucounts) { |
187 | kfree(objp: new); |
188 | } else { |
189 | hlist_add_head(n: &new->node, h: hashent); |
190 | get_user_ns(ns: new->ns); |
191 | spin_unlock_irq(lock: &ucounts_lock); |
192 | return new; |
193 | } |
194 | } |
195 | wrapped = !get_ucounts_or_wrap(ucounts); |
196 | spin_unlock_irq(lock: &ucounts_lock); |
197 | if (wrapped) { |
198 | put_ucounts(ucounts); |
199 | return NULL; |
200 | } |
201 | return ucounts; |
202 | } |
203 | |
204 | void put_ucounts(struct ucounts *ucounts) |
205 | { |
206 | unsigned long flags; |
207 | |
208 | if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) { |
209 | hlist_del_init(n: &ucounts->node); |
210 | spin_unlock_irqrestore(lock: &ucounts_lock, flags); |
211 | put_user_ns(ns: ucounts->ns); |
212 | kfree(objp: ucounts); |
213 | } |
214 | } |
215 | |
216 | static inline bool atomic_long_inc_below(atomic_long_t *v, int u) |
217 | { |
218 | long c, old; |
219 | c = atomic_long_read(v); |
220 | for (;;) { |
221 | if (unlikely(c >= u)) |
222 | return false; |
223 | old = atomic_long_cmpxchg(v, old: c, new: c+1); |
224 | if (likely(old == c)) |
225 | return true; |
226 | c = old; |
227 | } |
228 | } |
229 | |
230 | struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, |
231 | enum ucount_type type) |
232 | { |
233 | struct ucounts *ucounts, *iter, *bad; |
234 | struct user_namespace *tns; |
235 | ucounts = alloc_ucounts(ns, uid); |
236 | for (iter = ucounts; iter; iter = tns->ucounts) { |
237 | long max; |
238 | tns = iter->ns; |
239 | max = READ_ONCE(tns->ucount_max[type]); |
240 | if (!atomic_long_inc_below(v: &iter->ucount[type], u: max)) |
241 | goto fail; |
242 | } |
243 | return ucounts; |
244 | fail: |
245 | bad = iter; |
246 | for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) |
247 | atomic_long_dec(v: &iter->ucount[type]); |
248 | |
249 | put_ucounts(ucounts); |
250 | return NULL; |
251 | } |
252 | |
253 | void dec_ucount(struct ucounts *ucounts, enum ucount_type type) |
254 | { |
255 | struct ucounts *iter; |
256 | for (iter = ucounts; iter; iter = iter->ns->ucounts) { |
257 | long dec = atomic_long_dec_if_positive(v: &iter->ucount[type]); |
258 | WARN_ON_ONCE(dec < 0); |
259 | } |
260 | put_ucounts(ucounts); |
261 | } |
262 | |
263 | long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) |
264 | { |
265 | struct ucounts *iter; |
266 | long max = LONG_MAX; |
267 | long ret = 0; |
268 | |
269 | for (iter = ucounts; iter; iter = iter->ns->ucounts) { |
270 | long new = atomic_long_add_return(i: v, v: &iter->rlimit[type]); |
271 | if (new < 0 || new > max) |
272 | ret = LONG_MAX; |
273 | else if (iter == ucounts) |
274 | ret = new; |
275 | max = get_userns_rlimit_max(ns: iter->ns, type); |
276 | } |
277 | return ret; |
278 | } |
279 | |
280 | bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) |
281 | { |
282 | struct ucounts *iter; |
283 | long new = -1; /* Silence compiler warning */ |
284 | for (iter = ucounts; iter; iter = iter->ns->ucounts) { |
285 | long dec = atomic_long_sub_return(i: v, v: &iter->rlimit[type]); |
286 | WARN_ON_ONCE(dec < 0); |
287 | if (iter == ucounts) |
288 | new = dec; |
289 | } |
290 | return (new == 0); |
291 | } |
292 | |
293 | static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts, |
294 | struct ucounts *last, enum rlimit_type type) |
295 | { |
296 | struct ucounts *iter, *next; |
297 | for (iter = ucounts; iter != last; iter = next) { |
298 | long dec = atomic_long_sub_return(i: 1, v: &iter->rlimit[type]); |
299 | WARN_ON_ONCE(dec < 0); |
300 | next = iter->ns->ucounts; |
301 | if (dec == 0) |
302 | put_ucounts(ucounts: iter); |
303 | } |
304 | } |
305 | |
306 | void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) |
307 | { |
308 | do_dec_rlimit_put_ucounts(ucounts, NULL, type); |
309 | } |
310 | |
311 | long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) |
312 | { |
313 | /* Caller must hold a reference to ucounts */ |
314 | struct ucounts *iter; |
315 | long max = LONG_MAX; |
316 | long dec, ret = 0; |
317 | |
318 | for (iter = ucounts; iter; iter = iter->ns->ucounts) { |
319 | long new = atomic_long_add_return(i: 1, v: &iter->rlimit[type]); |
320 | if (new < 0 || new > max) |
321 | goto unwind; |
322 | if (iter == ucounts) |
323 | ret = new; |
324 | max = get_userns_rlimit_max(ns: iter->ns, type); |
325 | /* |
326 | * Grab an extra ucount reference for the caller when |
327 | * the rlimit count was previously 0. |
328 | */ |
329 | if (new != 1) |
330 | continue; |
331 | if (!get_ucounts(ucounts: iter)) |
332 | goto dec_unwind; |
333 | } |
334 | return ret; |
335 | dec_unwind: |
336 | dec = atomic_long_sub_return(i: 1, v: &iter->rlimit[type]); |
337 | WARN_ON_ONCE(dec < 0); |
338 | unwind: |
339 | do_dec_rlimit_put_ucounts(ucounts, last: iter, type); |
340 | return 0; |
341 | } |
342 | |
343 | bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit) |
344 | { |
345 | struct ucounts *iter; |
346 | long max = rlimit; |
347 | if (rlimit > LONG_MAX) |
348 | max = LONG_MAX; |
349 | for (iter = ucounts; iter; iter = iter->ns->ucounts) { |
350 | long val = get_rlimit_value(ucounts: iter, type); |
351 | if (val < 0 || val > max) |
352 | return true; |
353 | max = get_userns_rlimit_max(ns: iter->ns, type); |
354 | } |
355 | return false; |
356 | } |
357 | |
358 | static __init int user_namespace_sysctl_init(void) |
359 | { |
360 | #ifdef CONFIG_SYSCTL |
361 | static struct ctl_table_header *; |
362 | static struct ctl_table empty[1]; |
363 | /* |
364 | * It is necessary to register the user directory in the |
365 | * default set so that registrations in the child sets work |
366 | * properly. |
367 | */ |
368 | user_header = register_sysctl_sz(path: "user" , table: empty, table_size: 0); |
369 | kmemleak_ignore(ptr: user_header); |
370 | BUG_ON(!user_header); |
371 | BUG_ON(!setup_userns_sysctls(&init_user_ns)); |
372 | #endif |
373 | hlist_add_ucounts(ucounts: &init_ucounts); |
374 | inc_rlimit_ucounts(ucounts: &init_ucounts, type: UCOUNT_RLIMIT_NPROC, v: 1); |
375 | return 0; |
376 | } |
377 | subsys_initcall(user_namespace_sysctl_init); |
378 | |