1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Routines having to do with the 'struct sk_buff' memory handlers. |
4 | * |
5 | * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> |
6 | * Florian La Roche <rzsfl@rz.uni-sb.de> |
7 | * |
8 | * Fixes: |
9 | * Alan Cox : Fixed the worst of the load |
10 | * balancer bugs. |
11 | * Dave Platt : Interrupt stacking fix. |
12 | * Richard Kooijman : Timestamp fixes. |
13 | * Alan Cox : Changed buffer format. |
14 | * Alan Cox : destructor hook for AF_UNIX etc. |
15 | * Linus Torvalds : Better skb_clone. |
16 | * Alan Cox : Added skb_copy. |
17 | * Alan Cox : Added all the changed routines Linus |
18 | * only put in the headers |
19 | * Ray VanTassle : Fixed --skb->lock in free |
20 | * Alan Cox : skb_copy copy arp field |
21 | * Andi Kleen : slabified it. |
22 | * Robert Olsson : Removed skb_head_pool |
23 | * |
24 | * NOTE: |
25 | * The __skb_ routines should be called with interrupts |
26 | * disabled, or you better be *real* sure that the operation is atomic |
27 | * with respect to whatever list is being frobbed (e.g. via lock_sock() |
28 | * or via disabling bottom half handlers, etc). |
29 | */ |
30 | |
31 | /* |
32 | * The functions in this file will not compile correctly with gcc 2.4.x |
33 | */ |
34 | |
35 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
36 | |
37 | #include <linux/module.h> |
38 | #include <linux/types.h> |
39 | #include <linux/kernel.h> |
40 | #include <linux/mm.h> |
41 | #include <linux/interrupt.h> |
42 | #include <linux/in.h> |
43 | #include <linux/inet.h> |
44 | #include <linux/slab.h> |
45 | #include <linux/tcp.h> |
46 | #include <linux/udp.h> |
47 | #include <linux/sctp.h> |
48 | #include <linux/netdevice.h> |
49 | #ifdef CONFIG_NET_CLS_ACT |
50 | #include <net/pkt_sched.h> |
51 | #endif |
52 | #include <linux/string.h> |
53 | #include <linux/skbuff.h> |
54 | #include <linux/splice.h> |
55 | #include <linux/cache.h> |
56 | #include <linux/rtnetlink.h> |
57 | #include <linux/init.h> |
58 | #include <linux/scatterlist.h> |
59 | #include <linux/errqueue.h> |
60 | #include <linux/prefetch.h> |
61 | #include <linux/bitfield.h> |
62 | #include <linux/if_vlan.h> |
63 | #include <linux/mpls.h> |
64 | #include <linux/kcov.h> |
65 | #include <linux/iov_iter.h> |
66 | |
67 | #include <net/protocol.h> |
68 | #include <net/dst.h> |
69 | #include <net/sock.h> |
70 | #include <net/checksum.h> |
71 | #include <net/gso.h> |
72 | #include <net/hotdata.h> |
73 | #include <net/ip6_checksum.h> |
74 | #include <net/xfrm.h> |
75 | #include <net/mpls.h> |
76 | #include <net/mptcp.h> |
77 | #include <net/mctp.h> |
78 | #include <net/page_pool/helpers.h> |
79 | #include <net/dropreason.h> |
80 | |
81 | #include <linux/uaccess.h> |
82 | #include <trace/events/skb.h> |
83 | #include <linux/highmem.h> |
84 | #include <linux/capability.h> |
85 | #include <linux/user_namespace.h> |
86 | #include <linux/indirect_call_wrapper.h> |
87 | #include <linux/textsearch.h> |
88 | |
89 | #include "dev.h" |
90 | #include "sock_destructor.h" |
91 | |
92 | #ifdef CONFIG_SKB_EXTENSIONS |
93 | static struct kmem_cache *skbuff_ext_cache __ro_after_init; |
94 | #endif |
95 | |
96 | #define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) |
97 | |
98 | /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. |
99 | * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique |
100 | * size, and we can differentiate heads from skb_small_head_cache |
101 | * vs system slabs by looking at their size (skb_end_offset()). |
102 | */ |
103 | #define SKB_SMALL_HEAD_CACHE_SIZE \ |
104 | (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \ |
105 | (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \ |
106 | SKB_SMALL_HEAD_SIZE) |
107 | |
108 | #define SKB_SMALL_HEAD_HEADROOM \ |
109 | SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) |
110 | |
111 | int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; |
112 | EXPORT_SYMBOL(sysctl_max_skb_frags); |
113 | |
114 | /* kcm_write_msgs() relies on casting paged frags to bio_vec to use |
115 | * iov_iter_bvec(). These static asserts ensure the cast is valid as long as the |
116 | * netmem is a page. |
117 | */ |
118 | static_assert(offsetof(struct bio_vec, bv_page) == |
119 | offsetof(skb_frag_t, netmem)); |
120 | static_assert(sizeof_field(struct bio_vec, bv_page) == |
121 | sizeof_field(skb_frag_t, netmem)); |
122 | |
123 | static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len)); |
124 | static_assert(sizeof_field(struct bio_vec, bv_len) == |
125 | sizeof_field(skb_frag_t, len)); |
126 | |
127 | static_assert(offsetof(struct bio_vec, bv_offset) == |
128 | offsetof(skb_frag_t, offset)); |
129 | static_assert(sizeof_field(struct bio_vec, bv_offset) == |
130 | sizeof_field(skb_frag_t, offset)); |
131 | |
132 | #undef FN |
133 | #define FN(reason) [SKB_DROP_REASON_##reason] = #reason, |
134 | static const char * const drop_reasons[] = { |
135 | [SKB_CONSUMED] = "CONSUMED", |
136 | DEFINE_DROP_REASON(FN, FN) |
137 | }; |
138 | |
139 | static const struct drop_reason_list drop_reasons_core = { |
140 | .reasons = drop_reasons, |
141 | .n_reasons = ARRAY_SIZE(drop_reasons), |
142 | }; |
143 | |
144 | const struct drop_reason_list __rcu * |
145 | drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = { |
146 | [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core), |
147 | }; |
148 | EXPORT_SYMBOL(drop_reasons_by_subsys); |
149 | |
150 | /** |
151 | * drop_reasons_register_subsys - register another drop reason subsystem |
152 | * @subsys: the subsystem to register, must not be the core |
153 | * @list: the list of drop reasons within the subsystem, must point to |
154 | * a statically initialized list |
155 | */ |
156 | void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys, |
157 | const struct drop_reason_list *list) |
158 | { |
159 | if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || |
160 | subsys >= ARRAY_SIZE(drop_reasons_by_subsys), |
161 | "invalid subsystem %d\n", subsys)) |
162 | return; |
163 | |
164 | /* must point to statically allocated memory, so INIT is OK */ |
165 | RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list); |
166 | } |
167 | EXPORT_SYMBOL_GPL(drop_reasons_register_subsys); |
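
/* Example (illustrative only; the "foo" names and the subsystem ID below are
 * hypothetical and not part of this file - any subsystem with its own drop
 * reason enum follows the same pattern):
 *
 *	static const char * const foo_drop_reasons[] = {
 *		[0] = "FOO_FIRST_REASON",
 *		[1] = "FOO_OTHER_REASON",
 *	};
 *	static const struct drop_reason_list foo_drop_reason_list = {
 *		.reasons = foo_drop_reasons,
 *		.n_reasons = ARRAY_SIZE(foo_drop_reasons),
 *	};
 *
 *	drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_FOO,
 *				     &foo_drop_reason_list);
 */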
168 | |
169 | /** |
170 | * drop_reasons_unregister_subsys - unregister a drop reason subsystem |
171 | * @subsys: the subsystem to remove, must not be the core |
172 | * |
173 | * Note: This will synchronize_rcu() to ensure no users when it returns. |
174 | */ |
175 | void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys) |
176 | { |
177 | if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE || |
178 | subsys >= ARRAY_SIZE(drop_reasons_by_subsys), |
179 | "invalid subsystem %d\n", subsys)) |
180 | return; |
181 | |
182 | RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL); |
183 | |
184 | synchronize_rcu(); |
185 | } |
186 | EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys); |
187 | |
188 | /** |
189 | * skb_panic - private function for out-of-line support |
190 | * @skb: buffer |
191 | * @sz: size |
192 | * @addr: address |
193 | * @msg: skb_over_panic or skb_under_panic |
194 | * |
195 | * Out-of-line support for skb_put() and skb_push(). |
196 | * Called via the wrapper skb_over_panic() or skb_under_panic(). |
197 | * Keep out of line to prevent kernel bloat. |
198 | * __builtin_return_address is not used because it is not always reliable. |
199 | */ |
200 | static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, |
201 | const char msg[]) |
202 | { |
203 | pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", |
204 | msg, addr, skb->len, sz, skb->head, skb->data, |
205 | (unsigned long)skb->tail, (unsigned long)skb->end, |
206 | skb->dev ? skb->dev->name : "<NULL>"); |
207 | BUG(); |
208 | } |
209 | |
210 | static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr) |
211 | { |
212 | skb_panic(skb, sz, addr, __func__); |
213 | } |
214 | |
215 | static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) |
216 | { |
217 | skb_panic(skb, sz, addr, __func__); |
218 | } |
219 | |
220 | #define NAPI_SKB_CACHE_SIZE 64 |
221 | #define NAPI_SKB_CACHE_BULK 16 |
222 | #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) |
223 | |
224 | #if PAGE_SIZE == SZ_4K |
225 | |
226 | #define NAPI_HAS_SMALL_PAGE_FRAG 1 |
227 | #define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) |
228 | |
229 | /* specialized page frag allocator using a single order 0 page |
230 | * and slicing it into 1K sized fragment. Constrained to systems |
231 | * with a very limited amount of 1K fragments fitting a single |
232 | * page - to avoid excessive truesize underestimation |
233 | */ |
234 | |
235 | struct page_frag_1k { |
236 | void *va; |
237 | u16 offset; |
238 | bool pfmemalloc; |
239 | }; |
240 | |
241 | static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) |
242 | { |
243 | struct page *page; |
244 | int offset; |
245 | |
246 | offset = nc->offset - SZ_1K; |
247 | if (likely(offset >= 0)) |
248 | goto use_frag; |
249 | |
250 | page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); |
251 | if (!page) |
252 | return NULL; |
253 | |
254 | nc->va = page_address(page); |
255 | nc->pfmemalloc = page_is_pfmemalloc(page); |
256 | offset = PAGE_SIZE - SZ_1K; |
257 | page_ref_add(page, offset / SZ_1K); |
258 | |
259 | use_frag: |
260 | nc->offset = offset; |
261 | return nc->va + offset; |
262 | } |
263 | #else |
264 | |
265 | /* the small page is actually unused in this build; add dummy helpers |
266 | * to please the compiler and avoid later preprocessor's conditionals |
267 | */ |
268 | #define NAPI_HAS_SMALL_PAGE_FRAG 0 |
269 | #define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false |
270 | |
271 | struct page_frag_1k { |
272 | }; |
273 | |
274 | static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) |
275 | { |
276 | return NULL; |
277 | } |
278 | |
279 | #endif |
280 | |
281 | struct napi_alloc_cache { |
282 | struct page_frag_cache page; |
283 | struct page_frag_1k page_small; |
284 | unsigned int skb_count; |
285 | void *skb_cache[NAPI_SKB_CACHE_SIZE]; |
286 | }; |
287 | |
288 | static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); |
289 | static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); |
290 | |
291 | /* Double check that napi_get_frags() allocates skbs with |
292 | * skb->head being backed by slab, not a page fragment. |
293 | * This is to make sure bug fixed in 3226b158e67c |
294 | * ("net: avoid 32 x truesize under-estimation for tiny skbs") |
295 | * does not accidentally come back. |
296 | */ |
297 | void napi_get_frags_check(struct napi_struct *napi) |
298 | { |
299 | struct sk_buff *skb; |
300 | |
301 | local_bh_disable(); |
302 | skb = napi_get_frags(napi); |
303 | WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); |
304 | napi_free_frags(napi); |
305 | local_bh_enable(); |
306 | } |
307 | |
308 | void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) |
309 | { |
310 | struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); |
311 | |
312 | fragsz = SKB_DATA_ALIGN(fragsz); |
313 | |
314 | return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, |
315 | align_mask); |
316 | } |
317 | EXPORT_SYMBOL(__napi_alloc_frag_align); |
318 | |
319 | void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) |
320 | { |
321 | void *data; |
322 | |
323 | fragsz = SKB_DATA_ALIGN(fragsz); |
324 | if (in_hardirq() || irqs_disabled()) { |
325 | struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); |
326 | |
327 | data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, |
328 | align_mask); |
329 | } else { |
330 | struct napi_alloc_cache *nc; |
331 | |
332 | local_bh_disable(); |
333 | nc = this_cpu_ptr(&napi_alloc_cache); |
334 | data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, |
335 | align_mask); |
336 | local_bh_enable(); |
337 | } |
338 | return data; |
339 | } |
340 | EXPORT_SYMBOL(__netdev_alloc_frag_align); |
341 | |
342 | static struct sk_buff *napi_skb_cache_get(void) |
343 | { |
344 | struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); |
345 | struct sk_buff *skb; |
346 | |
347 | if (unlikely(!nc->skb_count)) { |
348 | nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, |
349 | GFP_ATOMIC, |
350 | NAPI_SKB_CACHE_BULK, |
351 | nc->skb_cache); |
352 | if (unlikely(!nc->skb_count)) |
353 | return NULL; |
354 | } |
355 | |
356 | skb = nc->skb_cache[--nc->skb_count]; |
357 | kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache)); |
358 | |
359 | return skb; |
360 | } |
361 | |
362 | static inline void __finalize_skb_around(struct sk_buff *skb, void *data, |
363 | unsigned int size) |
364 | { |
365 | struct skb_shared_info *shinfo; |
366 | |
367 | size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); |
368 | |
369 | /* Assumes caller memset cleared SKB */ |
370 | skb->truesize = SKB_TRUESIZE(size); |
371 | refcount_set(&skb->users, 1); |
372 | skb->head = data; |
373 | skb->data = data; |
374 | skb_reset_tail_pointer(skb); |
375 | skb_set_end_offset(skb, size); |
376 | skb->mac_header = (typeof(skb->mac_header))~0U; |
377 | skb->transport_header = (typeof(skb->transport_header))~0U; |
378 | skb->alloc_cpu = raw_smp_processor_id(); |
379 | /* make sure we initialize shinfo sequentially */ |
380 | shinfo = skb_shinfo(skb); |
381 | memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); |
382 | atomic_set(&shinfo->dataref, 1); |
383 | |
384 | skb_set_kcov_handle(skb, kcov_common_handle()); |
385 | } |
386 | |
387 | static inline void *__slab_build_skb(struct sk_buff *skb, void *data, |
388 | unsigned int *size) |
389 | { |
390 | void *resized; |
391 | |
392 | /* Must find the allocation size (and grow it to match). */ |
393 | *size = ksize(data); |
394 | /* krealloc() will immediately return "data" when |
395 | * "ksize(data)" is requested: it is the existing upper |
396 | * bounds. As a result, GFP_ATOMIC will be ignored. Note |
397 | * that this "new" pointer needs to be passed back to the |
398 | * caller for use so the __alloc_size hinting will be |
399 | * tracked correctly. |
400 | */ |
401 | resized = krealloc(data, *size, GFP_ATOMIC); |
402 | WARN_ON_ONCE(resized != data); |
403 | return resized; |
404 | } |
405 | |
406 | /* build_skb() variant which can operate on slab buffers. |
407 | * Note that this should be used sparingly as slab buffers |
408 | * cannot be combined efficiently by GRO! |
409 | */ |
410 | struct sk_buff *slab_build_skb(void *data) |
411 | { |
412 | struct sk_buff *skb; |
413 | unsigned int size; |
414 | |
415 | skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); |
416 | if (unlikely(!skb)) |
417 | return NULL; |
418 | |
419 | memset(skb, 0, offsetof(struct sk_buff, tail)); |
420 | data = __slab_build_skb(skb, data, &size); |
421 | __finalize_skb_around(skb, data, size); |
422 | |
423 | return skb; |
424 | } |
425 | EXPORT_SYMBOL(slab_build_skb); |
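
/* Example (minimal sketch, not taken from a real caller; it assumes the
 * kmalloc()'ed buffer is sized for the payload plus struct skb_shared_info,
 * since slab_build_skb() derives the usable size from ksize(). "payload_len"
 * is illustrative.
 *
 *	void *buf = kmalloc(SKB_HEAD_ALIGN(payload_len), GFP_ATOMIC);
 *	struct sk_buff *skb = buf ? slab_build_skb(buf) : NULL;
 *
 *	if (!skb)
 *		kfree(buf);
 *
 * On failure the caller still owns the buffer, hence the kfree().
 */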
426 | |
427 | /* Caller must provide SKB that is memset cleared */ |
428 | static void __build_skb_around(struct sk_buff *skb, void *data, |
429 | unsigned int frag_size) |
430 | { |
431 | unsigned int size = frag_size; |
432 | |
433 | /* frag_size == 0 is considered deprecated now. Callers |
434 | * using slab buffer should use slab_build_skb() instead. |
435 | */ |
436 | if (WARN_ONCE(size == 0, "Use slab_build_skb() instead")) |
437 | data = __slab_build_skb(skb, data, &size); |
438 | |
439 | __finalize_skb_around(skb, data, size); |
440 | } |
441 | |
442 | /** |
443 | * __build_skb - build a network buffer |
444 | * @data: data buffer provided by caller |
445 | * @frag_size: size of data (must not be 0) |
446 | * |
447 | * Allocate a new &sk_buff. Caller provides space holding head and |
448 | * skb_shared_info. @data must have been allocated from the page |
449 | * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc() |
450 | * allocation is deprecated, and callers should use slab_build_skb() |
451 | * instead.) |
452 | * The return is the new skb buffer. |
453 | * On a failure the return is %NULL, and @data is not freed. |
454 | * Notes : |
455 | * Before IO, driver allocates only data buffer where NIC put incoming frame |
456 | * Driver should add room at head (NET_SKB_PAD) and |
457 | * MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info)) |
458 | * After IO, driver calls build_skb(), to allocate sk_buff and populate it |
459 | * before giving packet to stack. |
460 | * RX rings only contains data buffers, not full skbs. |
461 | */ |
462 | struct sk_buff *__build_skb(void *data, unsigned int frag_size) |
463 | { |
464 | struct sk_buff *skb; |
465 | |
466 | skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC); |
467 | if (unlikely(!skb)) |
468 | return NULL; |
469 | |
470 | memset(skb, 0, offsetof(struct sk_buff, tail)); |
471 | __build_skb_around(skb, data, frag_size); |
472 | |
473 | return skb; |
474 | } |
475 | |
476 | /* build_skb() is wrapper over __build_skb(), that specifically |
477 | * takes care of skb->head and skb->pfmemalloc |
478 | */ |
479 | struct sk_buff *build_skb(void *data, unsigned int frag_size) |
480 | { |
481 | struct sk_buff *skb = __build_skb(data, frag_size); |
482 | |
483 | if (likely(skb && frag_size)) { |
484 | skb->head_frag = 1; |
485 | skb_propagate_pfmemalloc(virt_to_head_page(data), skb); |
486 | } |
487 | return skb; |
488 | } |
489 | EXPORT_SYMBOL(build_skb); |
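
/* Example (hypothetical driver Rx path following the notes above; "frame_len"
 * and the receive buffer "rx_buf" are illustrative, not from a real driver):
 *
 *	unsigned int truesize = SKB_DATA_ALIGN(NET_SKB_PAD + frame_len) +
 *				SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 *	struct sk_buff *skb = build_skb(rx_buf, truesize);
 *
 *	if (skb) {
 *		skb_reserve(skb, NET_SKB_PAD);
 *		skb_put(skb, frame_len);
 *	}
 */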
490 | |
491 | /** |
492 | * build_skb_around - build a network buffer around provided skb |
493 | * @skb: sk_buff provided by caller, must be memset cleared |
494 | * @data: data buffer provided by caller |
495 | * @frag_size: size of data |
496 | */ |
497 | struct sk_buff *build_skb_around(struct sk_buff *skb, |
498 | void *data, unsigned int frag_size) |
499 | { |
500 | if (unlikely(!skb)) |
501 | return NULL; |
502 | |
503 | __build_skb_around(skb, data, frag_size); |
504 | |
505 | if (frag_size) { |
506 | skb->head_frag = 1; |
507 | skb_propagate_pfmemalloc(virt_to_head_page(data), skb); |
508 | } |
509 | return skb; |
510 | } |
511 | EXPORT_SYMBOL(build_skb_around); |
512 | |
513 | /** |
514 | * __napi_build_skb - build a network buffer |
515 | * @data: data buffer provided by caller |
516 | * @frag_size: size of data |
517 | * |
518 | * Version of __build_skb() that uses NAPI percpu caches to obtain |
519 | * skbuff_head instead of inplace allocation. |
520 | * |
521 | * Returns a new &sk_buff on success, %NULL on allocation failure. |
522 | */ |
523 | static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) |
524 | { |
525 | struct sk_buff *skb; |
526 | |
527 | skb = napi_skb_cache_get(); |
528 | if (unlikely(!skb)) |
529 | return NULL; |
530 | |
531 | memset(skb, 0, offsetof(struct sk_buff, tail)); |
532 | __build_skb_around(skb, data, frag_size); |
533 | |
534 | return skb; |
535 | } |
536 | |
537 | /** |
538 | * napi_build_skb - build a network buffer |
539 | * @data: data buffer provided by caller |
540 | * @frag_size: size of data |
541 | * |
542 | * Version of __napi_build_skb() that takes care of skb->head_frag |
543 | * and skb->pfmemalloc when the data is a page or page fragment. |
544 | * |
545 | * Returns a new &sk_buff on success, %NULL on allocation failure. |
546 | */ |
547 | struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) |
548 | { |
549 | struct sk_buff *skb = __napi_build_skb(data, frag_size); |
550 | |
551 | if (likely(skb) && frag_size) { |
552 | skb->head_frag = 1; |
553 | skb_propagate_pfmemalloc(virt_to_head_page(data), skb); |
554 | } |
555 | |
556 | return skb; |
557 | } |
558 | EXPORT_SYMBOL(napi_build_skb); |
559 | |
560 | /* |
561 | * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells |
562 | * the caller if emergency pfmemalloc reserves are being used. If it is and |
563 | * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves |
564 | * may be used. Otherwise, the packet data may be discarded until enough |
565 | * memory is free |
566 | */ |
567 | static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, |
568 | bool *pfmemalloc) |
569 | { |
570 | bool ret_pfmemalloc = false; |
571 | size_t obj_size; |
572 | void *obj; |
573 | |
574 | obj_size = SKB_HEAD_ALIGN(*size); |
575 | if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE && |
576 | !(flags & KMALLOC_NOT_NORMAL_BITS)) { |
577 | obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, |
578 | flags | __GFP_NOMEMALLOC | __GFP_NOWARN, |
579 | node); |
580 | *size = SKB_SMALL_HEAD_CACHE_SIZE; |
581 | if (obj || !(gfp_pfmemalloc_allowed(flags))) |
582 | goto out; |
583 | /* Try again but now we are using pfmemalloc reserves */ |
584 | ret_pfmemalloc = true; |
585 | obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node); |
586 | goto out; |
587 | } |
588 | |
589 | obj_size = kmalloc_size_roundup(obj_size); |
590 | /* The following cast might truncate high-order bits of obj_size, this |
591 | * is harmless because kmalloc(obj_size >= 2^32) will fail anyway. |
592 | */ |
593 | *size = (unsigned int)obj_size; |
594 | |
595 | /* |
596 | * Try a regular allocation, when that fails and we're not entitled |
597 | * to the reserves, fail. |
598 | */ |
599 | obj = kmalloc_node_track_caller(obj_size, |
600 | flags | __GFP_NOMEMALLOC | __GFP_NOWARN, |
601 | node); |
602 | if (obj || !(gfp_pfmemalloc_allowed(flags))) |
603 | goto out; |
604 | |
605 | /* Try again but now we are using pfmemalloc reserves */ |
606 | ret_pfmemalloc = true; |
607 | obj = kmalloc_node_track_caller(obj_size, flags, node); |
608 | |
609 | out: |
610 | if (pfmemalloc) |
611 | *pfmemalloc = ret_pfmemalloc; |
612 | |
613 | return obj; |
614 | } |
615 | |
616 | /* Allocate a new skbuff. We do this ourselves so we can fill in a few |
617 | * 'private' fields and also do memory statistics to find all the |
618 | * [BEEP] leaks. |
619 | * |
620 | */ |
621 | |
622 | /** |
623 | * __alloc_skb - allocate a network buffer |
624 | * @size: size to allocate |
625 | * @gfp_mask: allocation mask |
626 | * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache |
627 | * instead of head cache and allocate a cloned (child) skb. |
628 | * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for |
629 | * allocations in case the data is required for writeback |
630 | * @node: numa node to allocate memory on |
631 | * |
632 | * Allocate a new &sk_buff. The returned buffer has no headroom and a |
633 | * tail room of at least size bytes. The object has a reference count |
634 | * of one. The return is the buffer. On a failure the return is %NULL. |
635 | * |
636 | * Buffers may only be allocated from interrupts using a @gfp_mask of |
637 | * %GFP_ATOMIC. |
638 | */ |
639 | struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, |
640 | int flags, int node) |
641 | { |
642 | struct kmem_cache *cache; |
643 | struct sk_buff *skb; |
644 | bool pfmemalloc; |
645 | u8 *data; |
646 | |
647 | cache = (flags & SKB_ALLOC_FCLONE) |
648 | ? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache; |
649 | |
650 | if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) |
651 | gfp_mask |= __GFP_MEMALLOC; |
652 | |
653 | /* Get the HEAD */ |
654 | if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && |
655 | likely(node == NUMA_NO_NODE || node == numa_mem_id())) |
656 | skb = napi_skb_cache_get(); |
657 | else |
658 | skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); |
659 | if (unlikely(!skb)) |
660 | return NULL; |
661 | prefetchw(skb); |
662 | |
663 | /* We do our best to align skb_shared_info on a separate cache |
664 | * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives |
665 | * aligned memory blocks, unless SLUB/SLAB debug is enabled. |
666 | * Both skb->head and skb_shared_info are cache line aligned. |
667 | */ |
668 | data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc); |
669 | if (unlikely(!data)) |
670 | goto nodata; |
671 | /* kmalloc_size_roundup() might give us more room than requested. |
672 | * Put skb_shared_info exactly at the end of allocated zone, |
673 | * to allow max possible filling before reallocation. |
674 | */ |
675 | prefetchw(data + SKB_WITH_OVERHEAD(size)); |
676 | |
677 | /* |
678 | * Only clear those fields we need to clear, not those that we will |
679 | * actually initialise below. Hence, don't put any more fields after |
680 | * the tail pointer in struct sk_buff! |
681 | */ |
682 | memset(skb, 0, offsetof(struct sk_buff, tail)); |
683 | __build_skb_around(skb, data, size); |
684 | skb->pfmemalloc = pfmemalloc; |
685 | |
686 | if (flags & SKB_ALLOC_FCLONE) { |
687 | struct sk_buff_fclones *fclones; |
688 | |
689 | fclones = container_of(skb, struct sk_buff_fclones, skb1); |
690 | |
691 | skb->fclone = SKB_FCLONE_ORIG; |
692 | refcount_set(&fclones->fclone_ref, 1); |
693 | } |
694 | |
695 | return skb; |
696 | |
697 | nodata: |
698 | kmem_cache_free(cache, skb); |
699 | return NULL; |
700 | } |
701 | EXPORT_SYMBOL(__alloc_skb); |
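
/* Example (minimal sketch; "hlen" and "dlen" are illustrative sizes for the
 * protocol headers and payload - the returned skb has no headroom, so the
 * caller reserves its own before filling the tail room):
 *
 *	struct sk_buff *skb = alloc_skb(hlen + dlen, GFP_ATOMIC);
 *
 *	if (skb) {
 *		skb_reserve(skb, hlen);
 *		skb_put(skb, dlen);
 *	}
 */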
702 | |
703 | /** |
704 | * __netdev_alloc_skb - allocate an skbuff for rx on a specific device |
705 | * @dev: network device to receive on |
706 | * @len: length to allocate |
707 | * @gfp_mask: get_free_pages mask, passed to alloc_skb |
708 | * |
709 | * Allocate a new &sk_buff and assign it a usage count of one. The |
710 | * buffer has NET_SKB_PAD headroom built in. Users should allocate |
711 | * the headroom they think they need without accounting for the |
712 | * built in space. The built in space is used for optimisations. |
713 | * |
714 | * %NULL is returned if there is no free memory. |
715 | */ |
716 | struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, |
717 | gfp_t gfp_mask) |
718 | { |
719 | struct page_frag_cache *nc; |
720 | struct sk_buff *skb; |
721 | bool pfmemalloc; |
722 | void *data; |
723 | |
724 | len += NET_SKB_PAD; |
725 | |
726 | /* If requested length is either too small or too big, |
727 | * we use kmalloc() for skb->head allocation. |
728 | */ |
729 | if (len <= SKB_WITH_OVERHEAD(1024) || |
730 | len > SKB_WITH_OVERHEAD(PAGE_SIZE) || |
731 | (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { |
732 | skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); |
733 | if (!skb) |
734 | goto skb_fail; |
735 | goto skb_success; |
736 | } |
737 | |
738 | len = SKB_HEAD_ALIGN(len); |
739 | |
740 | if (sk_memalloc_socks()) |
741 | gfp_mask |= __GFP_MEMALLOC; |
742 | |
743 | if (in_hardirq() || irqs_disabled()) { |
744 | nc = this_cpu_ptr(&netdev_alloc_cache); |
745 | data = page_frag_alloc(nc, len, gfp_mask); |
746 | pfmemalloc = nc->pfmemalloc; |
747 | } else { |
748 | local_bh_disable(); |
749 | nc = this_cpu_ptr(&napi_alloc_cache.page); |
750 | data = page_frag_alloc(nc, len, gfp_mask); |
751 | pfmemalloc = nc->pfmemalloc; |
752 | local_bh_enable(); |
753 | } |
754 | |
755 | if (unlikely(!data)) |
756 | return NULL; |
757 | |
758 | skb = __build_skb(data, len); |
759 | if (unlikely(!skb)) { |
760 | skb_free_frag(data); |
761 | return NULL; |
762 | } |
763 | |
764 | if (pfmemalloc) |
765 | skb->pfmemalloc = 1; |
766 | skb->head_frag = 1; |
767 | |
768 | skb_success: |
769 | skb_reserve(skb, NET_SKB_PAD); |
770 | skb->dev = dev; |
771 | |
772 | skb_fail: |
773 | return skb; |
774 | } |
775 | EXPORT_SYMBOL(__netdev_alloc_skb); |
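
/* Example (hypothetical Rx refill path; "rx_buf_len" is illustrative.
 * NET_SKB_PAD headroom is already built in, any extra alignment is up to
 * the caller):
 *
 *	struct sk_buff *skb = netdev_alloc_skb(dev, rx_buf_len);
 *
 *	if (skb)
 *		skb_reserve(skb, NET_IP_ALIGN);
 */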
776 | |
777 | /** |
778 | * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance |
779 | * @napi: napi instance this buffer was allocated for |
780 | * @len: length to allocate |
781 | * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages |
782 | * |
783 | * Allocate a new sk_buff for use in NAPI receive. This buffer will |
784 | * attempt to allocate the head from a special reserved region used |
785 | * only for NAPI Rx allocation. By doing this we can save several |
786 | * CPU cycles by avoiding having to disable and re-enable IRQs. |
787 | * |
788 | * %NULL is returned if there is no free memory. |
789 | */ |
790 | struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, |
791 | gfp_t gfp_mask) |
792 | { |
793 | struct napi_alloc_cache *nc; |
794 | struct sk_buff *skb; |
795 | bool pfmemalloc; |
796 | void *data; |
797 | |
798 | DEBUG_NET_WARN_ON_ONCE(!in_softirq()); |
799 | len += NET_SKB_PAD + NET_IP_ALIGN; |
800 | |
801 | /* If requested length is either too small or too big, |
802 | * we use kmalloc() for skb->head allocation. |
803 | * When the small frag allocator is available, prefer it over kmalloc |
804 | * for small fragments |
805 | */ |
806 | if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || |
807 | len > SKB_WITH_OVERHEAD(PAGE_SIZE) || |
808 | (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { |
809 | skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, |
810 | NUMA_NO_NODE); |
811 | if (!skb) |
812 | goto skb_fail; |
813 | goto skb_success; |
814 | } |
815 | |
816 | nc = this_cpu_ptr(&napi_alloc_cache); |
817 | |
818 | if (sk_memalloc_socks()) |
819 | gfp_mask |= __GFP_MEMALLOC; |
820 | |
821 | if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { |
822 | /* we are artificially inflating the allocation size, but |
823 | * that is not as bad as it may look like, as: |
824 | * - 'len' less than GRO_MAX_HEAD makes little sense |
825 | * - On most systems, larger 'len' values lead to fragment |
826 | * size above 512 bytes |
827 | * - kmalloc would use the kmalloc-1k slab for such values |
828 | * - Builds with smaller GRO_MAX_HEAD will very likely do |
829 | * little networking, as that implies no WiFi and no |
830 | * tunnels support, and 32 bits arches. |
831 | */ |
832 | len = SZ_1K; |
833 | |
834 | data = page_frag_alloc_1k(&nc->page_small, gfp_mask); |
835 | pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); |
836 | } else { |
837 | len = SKB_HEAD_ALIGN(len); |
838 | |
839 | data = page_frag_alloc(&nc->page, len, gfp_mask); |
840 | pfmemalloc = nc->page.pfmemalloc; |
841 | } |
842 | |
843 | if (unlikely(!data)) |
844 | return NULL; |
845 | |
846 | skb = __napi_build_skb(data, len); |
847 | if (unlikely(!skb)) { |
848 | skb_free_frag(data); |
849 | return NULL; |
850 | } |
851 | |
852 | if (pfmemalloc) |
853 | skb->pfmemalloc = 1; |
854 | skb->head_frag = 1; |
855 | |
856 | skb_success: |
857 | skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); |
858 | skb->dev = napi->dev; |
859 | |
860 | skb_fail: |
861 | return skb; |
862 | } |
863 | EXPORT_SYMBOL(__napi_alloc_skb); |
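
/* Example (hypothetical NAPI poll handler; "rx_buf" and "pkt_len" are
 * illustrative - NET_SKB_PAD + NET_IP_ALIGN headroom is already reserved
 * on return):
 *
 *	struct sk_buff *skb = napi_alloc_skb(napi, pkt_len);
 *
 *	if (skb) {
 *		skb_put_data(skb, rx_buf, pkt_len);
 *		skb->protocol = eth_type_trans(skb, napi->dev);
 *		napi_gro_receive(napi, skb);
 *	}
 */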
864 | |
865 | void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, |
866 | int off, int size, unsigned int truesize) |
867 | { |
868 | DEBUG_NET_WARN_ON_ONCE(size > truesize); |
869 | |
870 | skb_fill_netmem_desc(skb, i, netmem, off, size); |
871 | skb->len += size; |
872 | skb->data_len += size; |
873 | skb->truesize += truesize; |
874 | } |
875 | EXPORT_SYMBOL(skb_add_rx_frag_netmem); |
876 | |
877 | void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, |
878 | unsigned int truesize) |
879 | { |
880 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; |
881 | |
882 | DEBUG_NET_WARN_ON_ONCE(size > truesize); |
883 | |
884 | skb_frag_size_add(frag, size); |
885 | skb->len += size; |
886 | skb->data_len += size; |
887 | skb->truesize += truesize; |
888 | } |
889 | EXPORT_SYMBOL(skb_coalesce_rx_frag); |
890 | |
891 | static void skb_drop_list(struct sk_buff **listp) |
892 | { |
893 | kfree_skb_list(*listp); |
894 | *listp = NULL; |
895 | } |
896 | |
897 | static inline void skb_drop_fraglist(struct sk_buff *skb) |
898 | { |
899 | skb_drop_list(&skb_shinfo(skb)->frag_list); |
900 | } |
901 | |
902 | static void skb_clone_fraglist(struct sk_buff *skb) |
903 | { |
904 | struct sk_buff *list; |
905 | |
906 | skb_walk_frags(skb, list) |
907 | skb_get(list); |
908 | } |
909 | |
910 | static bool is_pp_page(struct page *page) |
911 | { |
912 | return (page->pp_magic & ~0x3UL) == PP_SIGNATURE; |
913 | } |
914 | |
915 | int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, |
916 | unsigned int headroom) |
917 | { |
918 | #if IS_ENABLED(CONFIG_PAGE_POOL) |
919 | u32 size, truesize, len, max_head_size, off; |
920 | struct sk_buff *skb = *pskb, *nskb; |
921 | int err, i, head_off; |
922 | void *data; |
923 | |
924 | /* XDP does not support fraglist so we need to linearize |
925 | * the skb. |
926 | */ |
927 | if (skb_has_frag_list(skb)) |
928 | return -EOPNOTSUPP; |
929 | |
930 | max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom); |
931 | if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE) |
932 | return -ENOMEM; |
933 | |
934 | size = min_t(u32, skb->len, max_head_size); |
935 | truesize = SKB_HEAD_ALIGN(size) + headroom; |
936 | data = page_pool_dev_alloc_va(pool, &truesize); |
937 | if (!data) |
938 | return -ENOMEM; |
939 | |
940 | nskb = napi_build_skb(data, truesize); |
941 | if (!nskb) { |
942 | page_pool_free_va(pool, data, true); |
943 | return -ENOMEM; |
944 | } |
945 | |
946 | skb_reserve(nskb, headroom); |
947 | skb_copy_header(nskb, skb); |
948 | skb_mark_for_recycle(nskb); |
949 | |
950 | err = skb_copy_bits(skb, 0, nskb->data, size); |
951 | if (err) { |
952 | consume_skb(nskb); |
953 | return err; |
954 | } |
955 | skb_put(nskb, size); |
956 | |
957 | head_off = skb_headroom(nskb) - skb_headroom(skb); |
958 | skb_headers_offset_update(nskb, head_off); |
959 | |
960 | off = size; |
961 | len = skb->len - off; |
962 | for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { |
963 | struct page *page; |
964 | u32 page_off; |
965 | |
966 | size = min_t(u32, len, PAGE_SIZE); |
967 | truesize = size; |
968 | |
969 | page = page_pool_dev_alloc(pool, &page_off, &truesize); |
970 | if (!page) { |
971 | consume_skb(nskb); |
972 | return -ENOMEM; |
973 | } |
974 | |
975 | skb_add_rx_frag(nskb, i, page, page_off, size, truesize); |
976 | err = skb_copy_bits(skb, off, page_address(page) + page_off, |
977 | size); |
978 | if (err) { |
979 | consume_skb(nskb); |
980 | return err; |
981 | } |
982 | |
983 | len -= size; |
984 | off += size; |
985 | } |
986 | |
987 | consume_skb(skb); |
988 | *pskb = nskb; |
989 | |
990 | return 0; |
991 | #else |
992 | return -EOPNOTSUPP; |
993 | #endif |
994 | } |
995 | EXPORT_SYMBOL(skb_pp_cow_data); |
996 | |
997 | int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, |
998 | struct bpf_prog *prog) |
999 | { |
1000 | if (!prog->aux->xdp_has_frags) |
1001 | return -EINVAL; |
1002 | |
1003 | return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM); |
1004 | } |
1005 | EXPORT_SYMBOL(skb_cow_data_for_xdp); |
1006 | |
1007 | #if IS_ENABLED(CONFIG_PAGE_POOL) |
1008 | bool napi_pp_put_page(struct page *page, bool napi_safe) |
1009 | { |
1010 | bool allow_direct = false; |
1011 | struct page_pool *pp; |
1012 | |
1013 | page = compound_head(page); |
1014 | |
1015 | /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation |
1016 | * in order to preserve any existing bits, such as bit 0 for the |
1017 | * head page of compound page and bit 1 for pfmemalloc page, so |
1018 | * mask those bits for freeing side when doing below checking, |
1019 | * and page_is_pfmemalloc() is checked in __page_pool_put_page() |
1020 | * to avoid recycling the pfmemalloc page. |
1021 | */ |
1022 | if (unlikely(!is_pp_page(page))) |
1023 | return false; |
1024 | |
1025 | pp = page->pp; |
1026 | |
1027 | /* Allow direct recycle if we have reasons to believe that we are |
1028 | * in the same context as the consumer would run, so there's |
1029 | * no possible race. |
1030 | * __page_pool_put_page() makes sure we're not in hardirq context |
1031 | * and interrupts are enabled prior to accessing the cache. |
1032 | */ |
1033 | if (napi_safe || in_softirq()) { |
1034 | const struct napi_struct *napi = READ_ONCE(pp->p.napi); |
1035 | unsigned int cpuid = smp_processor_id(); |
1036 | |
1037 | allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid; |
1038 | allow_direct |= READ_ONCE(pp->cpuid) == cpuid; |
1039 | } |
1040 | |
1041 | /* Driver set this to memory recycling info. Reset it on recycle. |
1042 | * This will *not* work for NIC using a split-page memory model. |
1043 | * The page will be returned to the pool here regardless of the |
1044 | * 'flipped' fragment being in use or not. |
1045 | */ |
1046 | page_pool_put_full_page(pp, page, allow_direct); |
1047 | |
1048 | return true; |
1049 | } |
1050 | EXPORT_SYMBOL(napi_pp_put_page); |
1051 | #endif |
1052 | |
1053 | static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) |
1054 | { |
1055 | if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) |
1056 | return false; |
1057 | return napi_pp_put_page(virt_to_page(data), napi_safe); |
1058 | } |
1059 | |
1060 | /** |
1061 | * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb |
1062 | * @skb: page pool aware skb |
1063 | * |
1064 | * Increase the fragment reference count (pp_ref_count) of a skb. This is |
1065 | * intended to gain fragment references only for page pool aware skbs, |
1066 | * i.e. when skb->pp_recycle is true, and not for fragments in a |
1067 | * non-pp-recycling skb. It has a fallback to increase references on normal |
1068 | * pages, as page pool aware skbs may also have normal page fragments. |
1069 | */ |
1070 | static int skb_pp_frag_ref(struct sk_buff *skb) |
1071 | { |
1072 | struct skb_shared_info *shinfo; |
1073 | struct page *head_page; |
1074 | int i; |
1075 | |
1076 | if (!skb->pp_recycle) |
1077 | return -EINVAL; |
1078 | |
1079 | shinfo = skb_shinfo(skb); |
1080 | |
1081 | for (i = 0; i < shinfo->nr_frags; i++) { |
1082 | head_page = compound_head(skb_frag_page(&shinfo->frags[i])); |
1083 | if (likely(is_pp_page(head_page))) |
1084 | page_pool_ref_page(head_page); |
1085 | else |
1086 | page_ref_inc(head_page); |
1087 | } |
1088 | return 0; |
1089 | } |
1090 | |
1091 | static void skb_kfree_head(void *head, unsigned int end_offset) |
1092 | { |
1093 | if (end_offset == SKB_SMALL_HEAD_HEADROOM) |
1094 | kmem_cache_free(net_hotdata.skb_small_head_cache, head); |
1095 | else |
1096 | kfree(head); |
1097 | } |
1098 | |
1099 | static void skb_free_head(struct sk_buff *skb, bool napi_safe) |
1100 | { |
1101 | unsigned char *head = skb->head; |
1102 | |
1103 | if (skb->head_frag) { |
1104 | if (skb_pp_recycle(skb, head, napi_safe)) |
1105 | return; |
1106 | skb_free_frag(head); |
1107 | } else { |
1108 | skb_kfree_head(head, skb_end_offset(skb)); |
1109 | } |
1110 | } |
1111 | |
1112 | static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason, |
1113 | bool napi_safe) |
1114 | { |
1115 | struct skb_shared_info *shinfo = skb_shinfo(skb); |
1116 | int i; |
1117 | |
1118 | if (!skb_data_unref(skb, shinfo)) |
1119 | goto exit; |
1120 | |
1121 | if (skb_zcopy(skb)) { |
1122 | bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS; |
1123 | |
1124 | skb_zcopy_clear(skb, true); |
1125 | if (skip_unref) |
1126 | goto free_head; |
1127 | } |
1128 | |
1129 | for (i = 0; i < shinfo->nr_frags; i++) |
1130 | napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe); |
1131 | |
1132 | free_head: |
1133 | if (shinfo->frag_list) |
1134 | kfree_skb_list_reason(shinfo->frag_list, reason); |
1135 | |
1136 | skb_free_head(skb, napi_safe); |
1137 | exit: |
1138 | /* When we clone an SKB we copy the recycling bit. The pp_recycle |
1139 | * bit is only set on the head though, so in order to avoid races |
1140 | * while trying to recycle fragments on __skb_frag_unref() we need |
1141 | * to make one SKB responsible for triggering the recycle path. |
1142 | * So disable the recycling bit if an SKB is cloned and we have |
1143 | * additional references to the fragmented part of the SKB. |
1144 | * Eventually the last SKB will have the recycling bit set and it's |
1145 | * dataref set to 0, which will trigger the recycling |
1146 | */ |
1147 | skb->pp_recycle = 0; |
1148 | } |
1149 | |
1150 | /* |
1151 | * Free an skbuff by memory without cleaning the state. |
1152 | */ |
1153 | static void kfree_skbmem(struct sk_buff *skb) |
1154 | { |
1155 | struct sk_buff_fclones *fclones; |
1156 | |
1157 | switch (skb->fclone) { |
1158 | case SKB_FCLONE_UNAVAILABLE: |
1159 | kmem_cache_free(net_hotdata.skbuff_cache, skb); |
1160 | return; |
1161 | |
1162 | case SKB_FCLONE_ORIG: |
1163 | fclones = container_of(skb, struct sk_buff_fclones, skb1); |
1164 | |
1165 | /* We usually free the clone (TX completion) before original skb |
1166 | * This test would have no chance to be true for the clone, |
1167 | * while here, branch prediction will be good. |
1168 | */ |
1169 | if (refcount_read(&fclones->fclone_ref) == 1) |
1170 | goto fastpath; |
1171 | break; |
1172 | |
1173 | default: /* SKB_FCLONE_CLONE */ |
1174 | fclones = container_of(skb, struct sk_buff_fclones, skb2); |
1175 | break; |
1176 | } |
1177 | if (!refcount_dec_and_test(&fclones->fclone_ref)) |
1178 | return; |
1179 | fastpath: |
1180 | kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones); |
1181 | } |
1182 | |
1183 | void skb_release_head_state(struct sk_buff *skb) |
1184 | { |
1185 | skb_dst_drop(skb); |
1186 | if (skb->destructor) { |
1187 | DEBUG_NET_WARN_ON_ONCE(in_hardirq()); |
1188 | skb->destructor(skb); |
1189 | } |
1190 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
1191 | nf_conntrack_put(skb_nfct(skb)); |
1192 | #endif |
1193 | skb_ext_put(skb); |
1194 | } |
1195 | |
1196 | /* Free everything but the sk_buff shell. */ |
1197 | static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason, |
1198 | bool napi_safe) |
1199 | { |
1200 | skb_release_head_state(skb); |
1201 | if (likely(skb->head)) |
1202 | skb_release_data(skb, reason, napi_safe); |
1203 | } |
1204 | |
1205 | /** |
1206 | * __kfree_skb - private function |
1207 | * @skb: buffer |
1208 | * |
1209 | * Free an sk_buff. Release anything attached to the buffer. |
1210 | * Clean the state. This is an internal helper function. Users should |
1211 | * always call kfree_skb |
1212 | */ |
1213 | |
1214 | void __kfree_skb(struct sk_buff *skb) |
1215 | { |
1216 | skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false); |
1217 | kfree_skbmem(skb); |
1218 | } |
1219 | EXPORT_SYMBOL(__kfree_skb); |
1220 | |
1221 | static __always_inline |
1222 | bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) |
1223 | { |
1224 | if (unlikely(!skb_unref(skb))) |
1225 | return false; |
1226 | |
1227 | DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET || |
1228 | u32_get_bits(reason, |
1229 | SKB_DROP_REASON_SUBSYS_MASK) >= |
1230 | SKB_DROP_REASON_SUBSYS_NUM); |
1231 | |
1232 | if (reason == SKB_CONSUMED) |
1233 | trace_consume_skb(skb, __builtin_return_address(0)); |
1234 | else |
1235 | trace_kfree_skb(skb, __builtin_return_address(0), reason); |
1236 | return true; |
1237 | } |
1238 | |
1239 | /** |
1240 | * kfree_skb_reason - free an sk_buff with special reason |
1241 | * @skb: buffer to free |
1242 | * @reason: reason why this skb is dropped |
1243 | * |
1244 | * Drop a reference to the buffer and free it if the usage count has |
1245 | * hit zero. Meanwhile, pass the drop reason to 'kfree_skb' |
1246 | * tracepoint. |
1247 | */ |
1248 | void __fix_address |
1249 | kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) |
1250 | { |
1251 | if (__kfree_skb_reason(skb, reason)) |
1252 | __kfree_skb(skb); |
1253 | } |
1254 | EXPORT_SYMBOL(kfree_skb_reason); |
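
/* Example (illustrative; any reason from enum skb_drop_reason may be passed,
 * here for a failed socket lookup):
 *
 *	if (!sk) {
 *		kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET);
 *		return 0;
 *	}
 */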
1255 | |
1256 | #define KFREE_SKB_BULK_SIZE 16 |
1257 | |
1258 | struct skb_free_array { |
1259 | unsigned int skb_count; |
1260 | void *skb_array[KFREE_SKB_BULK_SIZE]; |
1261 | }; |
1262 | |
1263 | static void kfree_skb_add_bulk(struct sk_buff *skb, |
1264 | struct skb_free_array *sa, |
1265 | enum skb_drop_reason reason) |
1266 | { |
1267 | /* if SKB is a clone, don't handle this case */ |
1268 | if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) { |
1269 | __kfree_skb(skb); |
1270 | return; |
1271 | } |
1272 | |
1273 | skb_release_all(skb, reason, false); |
1274 | sa->skb_array[sa->skb_count++] = skb; |
1275 | |
1276 | if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) { |
1277 | kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE, |
1278 | sa->skb_array); |
1279 | sa->skb_count = 0; |
1280 | } |
1281 | } |
1282 | |
1283 | void __fix_address |
1284 | kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason) |
1285 | { |
1286 | struct skb_free_array sa; |
1287 | |
1288 | sa.skb_count = 0; |
1289 | |
1290 | while (segs) { |
1291 | struct sk_buff *next = segs->next; |
1292 | |
1293 | if (__kfree_skb_reason(segs, reason)) { |
1294 | skb_poison_list(segs); |
1295 | kfree_skb_add_bulk(segs, &sa, reason); |
1296 | } |
1297 | |
1298 | segs = next; |
1299 | } |
1300 | |
1301 | if (sa.skb_count) |
1302 | kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array); |
1303 | } |
1304 | EXPORT_SYMBOL(kfree_skb_list_reason); |
1305 | |
1306 | /* Dump skb information and contents. |
1307 | * |
1308 | * Must only be called from net_ratelimit()-ed paths. |
1309 | * |
1310 | * Dumps whole packets if full_pkt, only headers otherwise. |
1311 | */ |
1312 | void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) |
1313 | { |
1314 | struct skb_shared_info *sh = skb_shinfo(skb); |
1315 | struct net_device *dev = skb->dev; |
1316 | struct sock *sk = skb->sk; |
1317 | struct sk_buff *list_skb; |
1318 | bool has_mac, has_trans; |
1319 | int headroom, tailroom; |
1320 | int i, len, seg_len; |
1321 | |
1322 | if (full_pkt) |
1323 | len = skb->len; |
1324 | else |
1325 | len = min_t(int, skb->len, MAX_HEADER + 128); |
1326 | |
1327 | headroom = skb_headroom(skb); |
1328 | tailroom = skb_tailroom(skb); |
1329 | |
1330 | has_mac = skb_mac_header_was_set(skb); |
1331 | has_trans = skb_transport_header_was_set(skb); |
1332 | |
1333 | printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n" |
1334 | "mac=(%d,%d) net=(%d,%d) trans=%d\n" |
1335 | "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n" |
1336 | "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n" |
1337 | "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n", |
1338 | level, skb->len, headroom, skb_headlen(skb), tailroom, |
1339 | has_mac ? skb->mac_header : -1, |
1340 | has_mac ? skb_mac_header_len(skb) : -1, |
1341 | skb->network_header, |
1342 | has_trans ? skb_network_header_len(skb) : -1, |
1343 | has_trans ? skb->transport_header : -1, |
1344 | sh->tx_flags, sh->nr_frags, |
1345 | sh->gso_size, sh->gso_type, sh->gso_segs, |
1346 | skb->csum, skb->ip_summed, skb->csum_complete_sw, |
1347 | skb->csum_valid, skb->csum_level, |
1348 | skb->hash, skb->sw_hash, skb->l4_hash, |
1349 | ntohs(skb->protocol), skb->pkt_type, skb->skb_iif); |
1350 | |
1351 | if (dev) |
1352 | printk("%sdev name=%s feat=%pNF\n", |
1353 | level, dev->name, &dev->features); |
1354 | if (sk) |
1355 | printk("%ssk family=%hu type=%u proto=%u\n", |
1356 | level, sk->sk_family, sk->sk_type, sk->sk_protocol); |
1357 | |
1358 | if (full_pkt && headroom) |
1359 | print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET, |
1360 | 16, 1, skb->head, headroom, false); |
1361 | |
1362 | seg_len = min_t(int, skb_headlen(skb), len); |
1363 | if (seg_len) |
1364 | print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET, |
1365 | 16, 1, skb->data, seg_len, false); |
1366 | len -= seg_len; |
1367 | |
1368 | if (full_pkt && tailroom) |
1369 | print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET, |
1370 | 16, 1, skb_tail_pointer(skb), tailroom, false); |
1371 | |
1372 | for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) { |
1373 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; |
1374 | u32 p_off, p_len, copied; |
1375 | struct page *p; |
1376 | u8 *vaddr; |
1377 | |
1378 | skb_frag_foreach_page(frag, skb_frag_off(frag), |
1379 | skb_frag_size(frag), p, p_off, p_len, |
1380 | copied) { |
1381 | seg_len = min_t(int, p_len, len); |
1382 | vaddr = kmap_atomic(p); |
1383 | print_hex_dump(level, "skb frag: ", |
1384 | DUMP_PREFIX_OFFSET, |
1385 | 16, 1, vaddr + p_off, seg_len, false); |
1386 | kunmap_atomic(vaddr); |
1387 | len -= seg_len; |
1388 | if (!len) |
1389 | break; |
1390 | } |
1391 | } |
1392 | |
1393 | if (full_pkt && skb_has_frag_list(skb)) { |
1394 | printk("skb fraglist:\n"); |
1395 | skb_walk_frags(skb, list_skb) |
1396 | skb_dump(level, list_skb, true); |
1397 | } |
1398 | } |
1399 | EXPORT_SYMBOL(skb_dump); |
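
/* Example (illustrative; must stay behind net_ratelimit() as noted above,
 * here dumping headers only):
 *
 *	if (net_ratelimit())
 *		skb_dump(KERN_ERR, skb, false);
 */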
1400 | |
1401 | /** |
1402 | * skb_tx_error - report an sk_buff xmit error |
1403 | * @skb: buffer that triggered an error |
1404 | * |
1405 | * Report xmit error if a device callback is tracking this skb. |
1406 | * skb must be freed afterwards. |
1407 | */ |
1408 | void skb_tx_error(struct sk_buff *skb) |
1409 | { |
1410 | if (skb) { |
1411 | skb_zcopy_downgrade_managed(skb); |
1412 | skb_zcopy_clear(skb, true); |
1413 | } |
1414 | } |
1415 | EXPORT_SYMBOL(skb_tx_error); |
1416 | |
1417 | #ifdef CONFIG_TRACEPOINTS |
1418 | /** |
1419 | * consume_skb - free an skbuff |
1420 | * @skb: buffer to free |
1421 | * |
1422 | * Drop a ref to the buffer and free it if the usage count has hit zero |
1423 | * Functions identically to kfree_skb, but kfree_skb assumes that the frame |
1424 | * is being dropped after a failure and notes that |
1425 | */ |
1426 | void consume_skb(struct sk_buff *skb) |
1427 | { |
1428 | if (!skb_unref(skb)) |
1429 | return; |
1430 | |
1431 | trace_consume_skb(skb, __builtin_return_address(0)); |
1432 | __kfree_skb(skb); |
1433 | } |
1434 | EXPORT_SYMBOL(consume_skb); |
1435 | #endif |
1436 | |
1437 | /** |
1438 | * __consume_stateless_skb - free an skbuff, assuming it is stateless |
1439 | * @skb: buffer to free |
1440 | * |
1441 | * Alike consume_skb(), but this variant assumes that this is the last |
1442 | * skb reference and all the head states have been already dropped |
1443 | */ |
1444 | void __consume_stateless_skb(struct sk_buff *skb) |
1445 | { |
1446 | trace_consume_skb(skb, __builtin_return_address(0)); |
1447 | skb_release_data(skb, SKB_CONSUMED, false); |
1448 | kfree_skbmem(skb); |
1449 | } |
1450 | |
1451 | static void napi_skb_cache_put(struct sk_buff *skb) |
1452 | { |
1453 | struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); |
1454 | u32 i; |
1455 | |
1456 | if (!kasan_mempool_poison_object(skb)) |
1457 | return; |
1458 | |
1459 | nc->skb_cache[nc->skb_count++] = skb; |
1460 | |
1461 | if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { |
1462 | for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) |
1463 | kasan_mempool_unpoison_object(nc->skb_cache[i], |
1464 | kmem_cache_size(net_hotdata.skbuff_cache)); |
1465 | |
1466 | kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF, |
1467 | nc->skb_cache + NAPI_SKB_CACHE_HALF); |
1468 | nc->skb_count = NAPI_SKB_CACHE_HALF; |
1469 | } |
1470 | } |
1471 | |
1472 | void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason) |
1473 | { |
1474 | skb_release_all(skb, reason, true); |
1475 | napi_skb_cache_put(skb); |
1476 | } |
1477 | |
1478 | void napi_skb_free_stolen_head(struct sk_buff *skb) |
1479 | { |
1480 | if (unlikely(skb->slow_gro)) { |
1481 | nf_reset_ct(skb); |
1482 | skb_dst_drop(skb); |
1483 | skb_ext_put(skb); |
1484 | skb_orphan(skb); |
1485 | skb->slow_gro = 0; |
1486 | } |
1487 | napi_skb_cache_put(skb); |
1488 | } |
1489 | |
1490 | void napi_consume_skb(struct sk_buff *skb, int budget) |
1491 | { |
1492 | /* Zero budget indicates a non-NAPI context called us, like netpoll */ |
1493 | if (unlikely(!budget)) { |
1494 | dev_consume_skb_any(skb); |
1495 | return; |
1496 | } |
1497 | |
1498 | DEBUG_NET_WARN_ON_ONCE(!in_softirq()); |
1499 | |
1500 | if (!skb_unref(skb)) |
1501 | return; |
1502 | |
1503 | /* if reaching here SKB is ready to free */ |
1504 | trace_consume_skb(skb, __builtin_return_address(0)); |
1505 | |
1506 | /* if SKB is a clone, don't handle this case */ |
1507 | if (skb->fclone != SKB_FCLONE_UNAVAILABLE) { |
1508 | __kfree_skb(skb); |
1509 | return; |
1510 | } |
1511 | |
1512 | skb_release_all(skb, SKB_CONSUMED, !!budget); |
1513 | napi_skb_cache_put(skb); |
1514 | } |
1515 | EXPORT_SYMBOL(napi_consume_skb); |
1516 | |
1517 | /* Make sure a field is contained by headers group */ |
1518 | #define CHECK_SKB_FIELD(field) \ |
1519 | BUILD_BUG_ON(offsetof(struct sk_buff, field) != \ |
1520 | offsetof(struct sk_buff, headers.field)); \ |
1521 | |
1522 | static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) |
1523 | { |
1524 | new->tstamp = old->tstamp; |
1525 | /* We do not copy old->sk */ |
1526 | new->dev = old->dev; |
1527 | memcpy(new->cb, old->cb, sizeof(old->cb)); |
1528 | skb_dst_copy(new, old); |
1529 | __skb_ext_copy(new, old); |
1530 | __nf_copy(new, old, false); |
1531 | |
1532 | /* Note : this field could be in the headers group. |
1533 | * It is not yet because we do not want to have a 16 bit hole |
1534 | */ |
1535 | new->queue_mapping = old->queue_mapping; |
1536 | |
1537 | memcpy(&new->headers, &old->headers, sizeof(new->headers)); |
1538 | CHECK_SKB_FIELD(protocol); |
1539 | CHECK_SKB_FIELD(csum); |
1540 | CHECK_SKB_FIELD(hash); |
1541 | CHECK_SKB_FIELD(priority); |
1542 | CHECK_SKB_FIELD(skb_iif); |
1543 | CHECK_SKB_FIELD(vlan_proto); |
1544 | CHECK_SKB_FIELD(vlan_tci); |
1545 | CHECK_SKB_FIELD(transport_header); |
1546 | CHECK_SKB_FIELD(network_header); |
1547 | CHECK_SKB_FIELD(mac_header); |
1548 | CHECK_SKB_FIELD(inner_protocol); |
1549 | CHECK_SKB_FIELD(inner_transport_header); |
1550 | CHECK_SKB_FIELD(inner_network_header); |
1551 | CHECK_SKB_FIELD(inner_mac_header); |
1552 | CHECK_SKB_FIELD(mark); |
1553 | #ifdef CONFIG_NETWORK_SECMARK |
1554 | CHECK_SKB_FIELD(secmark); |
1555 | #endif |
1556 | #ifdef CONFIG_NET_RX_BUSY_POLL |
1557 | CHECK_SKB_FIELD(napi_id); |
1558 | #endif |
1559 | CHECK_SKB_FIELD(alloc_cpu); |
1560 | #ifdef CONFIG_XPS |
1561 | CHECK_SKB_FIELD(sender_cpu); |
1562 | #endif |
1563 | #ifdef CONFIG_NET_SCHED |
1564 | CHECK_SKB_FIELD(tc_index); |
1565 | #endif |
1566 | |
1567 | } |
1568 | |
1569 | /* |
1570 | * You should not add any new code to this function. Add it to |
1571 | * __copy_skb_header above instead. |
1572 | */ |
1573 | static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) |
1574 | { |
1575 | #define C(x) n->x = skb->x |
1576 | |
1577 | n->next = n->prev = NULL; |
1578 | n->sk = NULL; |
1579 | __copy_skb_header(n, skb); |
1580 | |
1581 | C(len); |
1582 | C(data_len); |
1583 | C(mac_len); |
1584 | n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; |
1585 | n->cloned = 1; |
1586 | n->nohdr = 0; |
1587 | n->peeked = 0; |
1588 | C(pfmemalloc); |
1589 | C(pp_recycle); |
1590 | n->destructor = NULL; |
1591 | C(tail); |
1592 | C(end); |
1593 | C(head); |
1594 | C(head_frag); |
1595 | C(data); |
1596 | C(truesize); |
1597 | refcount_set(&n->users, 1); |
1598 | |
1599 | atomic_inc(&(skb_shinfo(skb)->dataref)); |
1600 | skb->cloned = 1; |
1601 | |
1602 | return n; |
1603 | #undef C |
1604 | } |
1605 | |
1606 | /** |
1607 | * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg |
1608 | * @first: first sk_buff of the msg |
1609 | */ |
1610 | struct sk_buff *alloc_skb_for_msg(struct sk_buff *first) |
1611 | { |
1612 | struct sk_buff *n; |
1613 | |
1614 | n = alloc_skb(0, GFP_ATOMIC); |
1615 | if (!n) |
1616 | return NULL; |
1617 | |
1618 | n->len = first->len; |
1619 | n->data_len = first->len; |
1620 | n->truesize = first->truesize; |
1621 | |
1622 | skb_shinfo(n)->frag_list = first; |
1623 | |
1624 | __copy_skb_header(n, first); |
1625 | n->destructor = NULL; |
1626 | |
1627 | return n; |
1628 | } |
1629 | EXPORT_SYMBOL_GPL(alloc_skb_for_msg); |
1630 | |
1631 | /** |
1632 | * skb_morph - morph one skb into another |
1633 | * @dst: the skb to receive the contents |
1634 | * @src: the skb to supply the contents |
1635 | * |
1636 | * This is identical to skb_clone except that the target skb is |
1637 | * supplied by the user. |
1638 | * |
1639 | * The target skb is returned upon exit. |
1640 | */ |
1641 | struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) |
1642 | { |
	skb_release_all(dst, SKB_CONSUMED, false);
	return __skb_clone(dst, src);
1645 | } |
1646 | EXPORT_SYMBOL_GPL(skb_morph); |
1647 | |
1648 | int mm_account_pinned_pages(struct mmpin *mmp, size_t size) |
1649 | { |
1650 | unsigned long max_pg, num_pg, new_pg, old_pg, rlim; |
1651 | struct user_struct *user; |
1652 | |
1653 | if (capable(CAP_IPC_LOCK) || !size) |
1654 | return 0; |
1655 | |
1656 | rlim = rlimit(RLIMIT_MEMLOCK); |
1657 | if (rlim == RLIM_INFINITY) |
1658 | return 0; |
1659 | |
1660 | num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */ |
1661 | max_pg = rlim >> PAGE_SHIFT; |
1662 | user = mmp->user ? : current_user(); |
1663 | |
	old_pg = atomic_long_read(&user->locked_vm);
1665 | do { |
1666 | new_pg = old_pg + num_pg; |
1667 | if (new_pg > max_pg) |
1668 | return -ENOBUFS; |
	} while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg));
1670 | |
1671 | if (!mmp->user) { |
		mmp->user = get_uid(user);
1673 | mmp->num_pg = num_pg; |
1674 | } else { |
1675 | mmp->num_pg += num_pg; |
1676 | } |
1677 | |
1678 | return 0; |
1679 | } |
1680 | EXPORT_SYMBOL_GPL(mm_account_pinned_pages); |
1681 | |
1682 | void mm_unaccount_pinned_pages(struct mmpin *mmp) |
1683 | { |
1684 | if (mmp->user) { |
		atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
1686 | free_uid(mmp->user); |
1687 | } |
1688 | } |
1689 | EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); |
1690 | |
1691 | static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) |
1692 | { |
1693 | struct ubuf_info_msgzc *uarg; |
1694 | struct sk_buff *skb; |
1695 | |
1696 | WARN_ON_ONCE(!in_task()); |
1697 | |
	skb = sock_omalloc(sk, 0, GFP_KERNEL);
1699 | if (!skb) |
1700 | return NULL; |
1701 | |
1702 | BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); |
1703 | uarg = (void *)skb->cb; |
1704 | uarg->mmp.user = NULL; |
1705 | |
1706 | if (mm_account_pinned_pages(&uarg->mmp, size)) { |
1707 | kfree_skb(skb); |
1708 | return NULL; |
1709 | } |
1710 | |
1711 | uarg->ubuf.callback = msg_zerocopy_callback; |
	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
1713 | uarg->len = 1; |
1714 | uarg->bytelen = size; |
1715 | uarg->zerocopy = 1; |
1716 | uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; |
	refcount_set(&uarg->ubuf.refcnt, 1);
1718 | sock_hold(sk); |
1719 | |
1720 | return &uarg->ubuf; |
1721 | } |
1722 | |
1723 | static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) |
1724 | { |
1725 | return container_of((void *)uarg, struct sk_buff, cb); |
1726 | } |
1727 | |
1728 | struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, |
1729 | struct ubuf_info *uarg) |
1730 | { |
1731 | if (uarg) { |
1732 | struct ubuf_info_msgzc *uarg_zc; |
1733 | const u32 byte_limit = 1 << 19; /* limit to a few TSO */ |
1734 | u32 bytelen, next; |
1735 | |
1736 | /* there might be non MSG_ZEROCOPY users */ |
1737 | if (uarg->callback != msg_zerocopy_callback) |
1738 | return NULL; |
1739 | |
1740 | /* realloc only when socket is locked (TCP, UDP cork), |
1741 | * so uarg->len and sk_zckey access is serialized |
1742 | */ |
1743 | if (!sock_owned_by_user(sk)) { |
1744 | WARN_ON_ONCE(1); |
1745 | return NULL; |
1746 | } |
1747 | |
1748 | uarg_zc = uarg_to_msgzc(uarg); |
1749 | bytelen = uarg_zc->bytelen + size; |
1750 | if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) { |
1751 | /* TCP can create new skb to attach new uarg */ |
1752 | if (sk->sk_type == SOCK_STREAM) |
1753 | goto new_alloc; |
1754 | return NULL; |
1755 | } |
1756 | |
		next = (u32)atomic_read(&sk->sk_zckey);
1758 | if ((u32)(uarg_zc->id + uarg_zc->len) == next) { |
1759 | if (mm_account_pinned_pages(&uarg_zc->mmp, size)) |
1760 | return NULL; |
1761 | uarg_zc->len++; |
1762 | uarg_zc->bytelen = bytelen; |
			atomic_set(&sk->sk_zckey, ++next);
1764 | |
1765 | /* no extra ref when appending to datagram (MSG_MORE) */ |
1766 | if (sk->sk_type == SOCK_STREAM) |
1767 | net_zcopy_get(uarg); |
1768 | |
1769 | return uarg; |
1770 | } |
1771 | } |
1772 | |
1773 | new_alloc: |
1774 | return msg_zerocopy_alloc(sk, size); |
1775 | } |
1776 | EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); |
1777 | |
1778 | static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) |
1779 | { |
1780 | struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); |
1781 | u32 old_lo, old_hi; |
1782 | u64 sum_len; |
1783 | |
1784 | old_lo = serr->ee.ee_info; |
1785 | old_hi = serr->ee.ee_data; |
1786 | sum_len = old_hi - old_lo + 1ULL + len; |
1787 | |
1788 | if (sum_len >= (1ULL << 32)) |
1789 | return false; |
1790 | |
1791 | if (lo != old_hi + 1) |
1792 | return false; |
1793 | |
1794 | serr->ee.ee_data += len; |
1795 | return true; |
1796 | } |
1797 | |
1798 | static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg) |
1799 | { |
1800 | struct sk_buff *tail, *skb = skb_from_uarg(uarg); |
1801 | struct sock_exterr_skb *serr; |
1802 | struct sock *sk = skb->sk; |
1803 | struct sk_buff_head *q; |
1804 | unsigned long flags; |
1805 | bool is_zerocopy; |
1806 | u32 lo, hi; |
1807 | u16 len; |
1808 | |
1809 | mm_unaccount_pinned_pages(&uarg->mmp); |
1810 | |
1811 | /* if !len, there was only 1 call, and it was aborted |
1812 | * so do not queue a completion notification |
1813 | */ |
	if (!uarg->len || sock_flag(sk, SOCK_DEAD))
1815 | goto release; |
1816 | |
1817 | len = uarg->len; |
1818 | lo = uarg->id; |
1819 | hi = uarg->id + len - 1; |
1820 | is_zerocopy = uarg->zerocopy; |
1821 | |
1822 | serr = SKB_EXT_ERR(skb); |
1823 | memset(serr, 0, sizeof(*serr)); |
1824 | serr->ee.ee_errno = 0; |
1825 | serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; |
1826 | serr->ee.ee_data = hi; |
1827 | serr->ee.ee_info = lo; |
1828 | if (!is_zerocopy) |
1829 | serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; |
1830 | |
1831 | q = &sk->sk_error_queue; |
1832 | spin_lock_irqsave(&q->lock, flags); |
	tail = skb_peek_tail(q);
1834 | if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY || |
	    !skb_zerocopy_notify_extend(tail, lo, len)) {
		__skb_queue_tail(q, skb);
1837 | skb = NULL; |
1838 | } |
	spin_unlock_irqrestore(&q->lock, flags);
1840 | |
1841 | sk_error_report(sk); |
1842 | |
1843 | release: |
1844 | consume_skb(skb); |
1845 | sock_put(sk); |
1846 | } |
1847 | |
1848 | void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, |
1849 | bool success) |
1850 | { |
1851 | struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); |
1852 | |
1853 | uarg_zc->zerocopy = uarg_zc->zerocopy & success; |
1854 | |
	if (refcount_dec_and_test(&uarg->refcnt))
		__msg_zerocopy_callback(uarg_zc);
1857 | } |
1858 | EXPORT_SYMBOL_GPL(msg_zerocopy_callback); |
1859 | |
1860 | void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) |
1861 | { |
1862 | struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk; |
1863 | |
	atomic_dec(&sk->sk_zckey);
1865 | uarg_to_msgzc(uarg)->len--; |
1866 | |
1867 | if (have_uref) |
1868 | msg_zerocopy_callback(NULL, uarg, true); |
1869 | } |
1870 | EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); |
1871 | |
1872 | int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, |
1873 | struct msghdr *msg, int len, |
1874 | struct ubuf_info *uarg) |
1875 | { |
1876 | struct ubuf_info *orig_uarg = skb_zcopy(skb); |
1877 | int err, orig_len = skb->len; |
1878 | |
1879 | /* An skb can only point to one uarg. This edge case happens when |
1880 | * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. |
1881 | */ |
1882 | if (orig_uarg && uarg != orig_uarg) |
1883 | return -EEXIST; |
1884 | |
	err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
1886 | if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { |
1887 | struct sock *save_sk = skb->sk; |
1888 | |
1889 | /* Streams do not free skb on error. Reset to prev state. */ |
		iov_iter_revert(&msg->msg_iter, skb->len - orig_len);
		skb->sk = sk;
		___pskb_trim(skb, orig_len);
1893 | skb->sk = save_sk; |
1894 | return err; |
1895 | } |
1896 | |
1897 | skb_zcopy_set(skb, uarg, NULL); |
1898 | return skb->len - orig_len; |
1899 | } |
1900 | EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); |
1901 | |
1902 | void __skb_zcopy_downgrade_managed(struct sk_buff *skb) |
1903 | { |
1904 | int i; |
1905 | |
1906 | skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS; |
1907 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) |
		skb_frag_ref(skb, i);
1909 | } |
1910 | EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed); |
1911 | |
1912 | static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, |
1913 | gfp_t gfp_mask) |
1914 | { |
	if (skb_zcopy(orig)) {
		if (skb_zcopy(nskb)) {
1917 | /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ |
1918 | if (!gfp_mask) { |
1919 | WARN_ON_ONCE(1); |
1920 | return -ENOMEM; |
1921 | } |
1922 | if (skb_uarg(nskb) == skb_uarg(orig)) |
1923 | return 0; |
			if (skb_copy_ubufs(nskb, GFP_ATOMIC))
1925 | return -EIO; |
1926 | } |
		skb_zcopy_set(nskb, skb_uarg(orig), NULL);
1928 | } |
1929 | return 0; |
1930 | } |
1931 | |
1932 | /** |
1933 | * skb_copy_ubufs - copy userspace skb frags buffers to kernel |
1934 | * @skb: the skb to modify |
1935 | * @gfp_mask: allocation priority |
1936 | * |
1937 | * This must be called on skb with SKBFL_ZEROCOPY_ENABLE. |
1938 | * It will copy all frags into kernel and drop the reference |
1939 | * to userspace pages. |
1940 | * |
1941 | * If this function is called from an interrupt gfp_mask() must be |
1942 | * %GFP_ATOMIC. |
1943 | * |
1944 | * Returns 0 on success or a negative error code on failure |
1945 | * to allocate kernel memory to copy to. |
1946 | */ |
1947 | int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) |
1948 | { |
1949 | int num_frags = skb_shinfo(skb)->nr_frags; |
1950 | struct page *page, *head = NULL; |
1951 | int i, order, psize, new_frags; |
1952 | u32 d_off; |
1953 | |
	if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
1955 | return -EINVAL; |
1956 | |
1957 | if (!num_frags) |
1958 | goto release; |
1959 | |
1960 | /* We might have to allocate high order pages, so compute what minimum |
1961 | * page order is needed. |
1962 | */ |
1963 | order = 0; |
1964 | while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb)) |
1965 | order++; |
1966 | psize = (PAGE_SIZE << order); |
1967 | |
1968 | new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order); |
1969 | for (i = 0; i < new_frags; i++) { |
		page = alloc_pages(gfp_mask | __GFP_COMP, order);
1971 | if (!page) { |
1972 | while (head) { |
1973 | struct page *next = (struct page *)page_private(head); |
				put_page(head);
1975 | head = next; |
1976 | } |
1977 | return -ENOMEM; |
1978 | } |
		set_page_private(page, (unsigned long)head);
1980 | head = page; |
1981 | } |
1982 | |
1983 | page = head; |
1984 | d_off = 0; |
1985 | for (i = 0; i < num_frags; i++) { |
1986 | skb_frag_t *f = &skb_shinfo(skb)->frags[i]; |
1987 | u32 p_off, p_len, copied; |
1988 | struct page *p; |
1989 | u8 *vaddr; |
1990 | |
1991 | skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), |
1992 | p, p_off, p_len, copied) { |
1993 | u32 copy, done = 0; |
			vaddr = kmap_atomic(p);
1995 | |
1996 | while (done < p_len) { |
1997 | if (d_off == psize) { |
1998 | d_off = 0; |
1999 | page = (struct page *)page_private(page); |
2000 | } |
2001 | copy = min_t(u32, psize - d_off, p_len - done); |
2002 | memcpy(page_address(page) + d_off, |
2003 | vaddr + p_off + done, copy); |
2004 | done += copy; |
2005 | d_off += copy; |
2006 | } |
2007 | kunmap_atomic(vaddr); |
2008 | } |
2009 | } |
2010 | |
2011 | /* skb frags release userspace buffers */ |
2012 | for (i = 0; i < num_frags; i++) |
		skb_frag_unref(skb, i);
2014 | |
2015 | /* skb frags point to kernel buffers */ |
2016 | for (i = 0; i < new_frags - 1; i++) { |
		__skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize);
2018 | head = (struct page *)page_private(head); |
2019 | } |
	__skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0,
			       d_off);
2022 | skb_shinfo(skb)->nr_frags = new_frags; |
2023 | |
2024 | release: |
	skb_zcopy_clear(skb, false);
2026 | return 0; |
2027 | } |
2028 | EXPORT_SYMBOL_GPL(skb_copy_ubufs); |
2029 | |
2030 | /** |
2031 | * skb_clone - duplicate an sk_buff |
2032 | * @skb: buffer to clone |
2033 | * @gfp_mask: allocation priority |
2034 | * |
2035 | * Duplicate an &sk_buff. The new one is not owned by a socket. Both |
2036 | * copies share the same packet data but not structure. The new |
2037 | * buffer has a reference count of 1. If the allocation fails the |
2038 | * function returns %NULL otherwise the new buffer is returned. |
2039 | * |
2040 | * If this function is called from an interrupt gfp_mask() must be |
2041 | * %GFP_ATOMIC. |
2042 | */ |
2043 | |
2044 | struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) |
2045 | { |
2046 | struct sk_buff_fclones *fclones = container_of(skb, |
2047 | struct sk_buff_fclones, |
2048 | skb1); |
2049 | struct sk_buff *n; |
2050 | |
2051 | if (skb_orphan_frags(skb, gfp_mask)) |
2052 | return NULL; |
2053 | |
2054 | if (skb->fclone == SKB_FCLONE_ORIG && |
	    refcount_read(&fclones->fclone_ref) == 1) {
2056 | n = &fclones->skb2; |
		refcount_set(&fclones->fclone_ref, 2);
2058 | n->fclone = SKB_FCLONE_CLONE; |
2059 | } else { |
2060 | if (skb_pfmemalloc(skb)) |
2061 | gfp_mask |= __GFP_MEMALLOC; |
2062 | |
		n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask);
2064 | if (!n) |
2065 | return NULL; |
2066 | |
2067 | n->fclone = SKB_FCLONE_UNAVAILABLE; |
2068 | } |
2069 | |
2070 | return __skb_clone(n, skb); |
2071 | } |
2072 | EXPORT_SYMBOL(skb_clone); |
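
/* Usage sketch (illustrative only, hypothetical caller, not from the
 * original file): a path that needs a second, independent sk_buff sharing
 * the same packet data might do:
 *
 *	struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
 *
 *	if (clone)
 *		netif_rx(clone);
 *
 * netif_rx() is just one possible consumer that takes ownership of the
 * clone; the original skb remains usable by the caller.
 */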
2073 | |
void skb_headers_offset_update(struct sk_buff *skb, int off)
2075 | { |
2076 | /* Only adjust this if it actually is csum_start rather than csum */ |
2077 | if (skb->ip_summed == CHECKSUM_PARTIAL) |
2078 | skb->csum_start += off; |
2079 | /* {transport,network,mac}_header and tail are relative to skb->head */ |
2080 | skb->transport_header += off; |
2081 | skb->network_header += off; |
2082 | if (skb_mac_header_was_set(skb)) |
2083 | skb->mac_header += off; |
2084 | skb->inner_transport_header += off; |
2085 | skb->inner_network_header += off; |
2086 | skb->inner_mac_header += off; |
2087 | } |
2088 | EXPORT_SYMBOL(skb_headers_offset_update); |
2089 | |
void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
2091 | { |
2092 | __copy_skb_header(new, old); |
2093 | |
2094 | skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size; |
2095 | skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs; |
2096 | skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; |
2097 | } |
2098 | EXPORT_SYMBOL(skb_copy_header); |
2099 | |
2100 | static inline int skb_alloc_rx_flag(const struct sk_buff *skb) |
2101 | { |
2102 | if (skb_pfmemalloc(skb)) |
2103 | return SKB_ALLOC_RX; |
2104 | return 0; |
2105 | } |
2106 | |
2107 | /** |
2108 | * skb_copy - create private copy of an sk_buff |
2109 | * @skb: buffer to copy |
2110 | * @gfp_mask: allocation priority |
2111 | * |
2112 | * Make a copy of both an &sk_buff and its data. This is used when the |
2113 | * caller wishes to modify the data and needs a private copy of the |
2114 | * data to alter. Returns %NULL on failure or the pointer to the buffer |
2115 | * on success. The returned buffer has a reference count of 1. |
2116 | * |
2117 | * As by-product this function converts non-linear &sk_buff to linear |
2118 | * one, so that &sk_buff becomes completely private and caller is allowed |
2119 | * to modify all the data of returned buffer. This means that this |
2120 | * function is not recommended for use in circumstances when only |
2121 | * header is going to be modified. Use pskb_copy() instead. |
2122 | */ |
2123 | |
2124 | struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) |
2125 | { |
	int headerlen = skb_headroom(skb);
2127 | unsigned int size = skb_end_offset(skb) + skb->data_len; |
2128 | struct sk_buff *n = __alloc_skb(size, gfp_mask, |
2129 | skb_alloc_rx_flag(skb), NUMA_NO_NODE); |
2130 | |
2131 | if (!n) |
2132 | return NULL; |
2133 | |
2134 | /* Set the data pointer */ |
	skb_reserve(n, headerlen);
2136 | /* Set the tail pointer and length */ |
	skb_put(n, skb->len);
2138 | |
2139 | BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); |
2140 | |
2141 | skb_copy_header(n, skb); |
2142 | return n; |
2143 | } |
2144 | EXPORT_SYMBOL(skb_copy); |
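
/* Usage sketch (illustrative only, hypothetical caller): taking a fully
 * private, linear copy before rewriting payload bytes in place:
 *
 *	struct sk_buff *priv = skb_copy(skb, GFP_ATOMIC);
 *
 *	if (!priv)
 *		return -ENOMEM;
 *	memset(priv->data, 0, priv->len);
 *
 * The whole copy may be modified; the original skb is left untouched and
 * still owned by its current user.
 */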
2145 | |
2146 | /** |
2147 | * __pskb_copy_fclone - create copy of an sk_buff with private head. |
2148 | * @skb: buffer to copy |
2149 | * @headroom: headroom of new skb |
2150 | * @gfp_mask: allocation priority |
2151 | * @fclone: if true allocate the copy of the skb from the fclone |
2152 | * cache instead of the head cache; it is recommended to set this |
2153 | * to true for the cases where the copy will likely be cloned |
2154 | * |
2155 | * Make a copy of both an &sk_buff and part of its data, located |
2156 | * in header. Fragmented data remain shared. This is used when |
2157 | * the caller wishes to modify only header of &sk_buff and needs |
2158 | * private copy of the header to alter. Returns %NULL on failure |
2159 | * or the pointer to the buffer on success. |
2160 | * The returned buffer has a reference count of 1. |
2161 | */ |
2162 | |
2163 | struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, |
2164 | gfp_t gfp_mask, bool fclone) |
2165 | { |
2166 | unsigned int size = skb_headlen(skb) + headroom; |
2167 | int flags = skb_alloc_rx_flag(skb) | (fclone ? SKB_ALLOC_FCLONE : 0); |
2168 | struct sk_buff *n = __alloc_skb(size, gfp_mask, flags, NUMA_NO_NODE); |
2169 | |
2170 | if (!n) |
2171 | goto out; |
2172 | |
2173 | /* Set the data pointer */ |
	skb_reserve(n, headroom);
	/* Set the tail pointer and length */
	skb_put(n, skb_headlen(skb));
	/* Copy the bytes */
	skb_copy_from_linear_data(skb, n->data, n->len);
2179 | |
2180 | n->truesize += skb->data_len; |
2181 | n->data_len = skb->data_len; |
2182 | n->len = skb->len; |
2183 | |
2184 | if (skb_shinfo(skb)->nr_frags) { |
2185 | int i; |
2186 | |
2187 | if (skb_orphan_frags(skb, gfp_mask) || |
		    skb_zerocopy_clone(n, skb, gfp_mask)) {
			kfree_skb(n);
2190 | n = NULL; |
2191 | goto out; |
2192 | } |
2193 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
2194 | skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; |
			skb_frag_ref(skb, i);
2196 | } |
2197 | skb_shinfo(n)->nr_frags = i; |
2198 | } |
2199 | |
2200 | if (skb_has_frag_list(skb)) { |
2201 | skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; |
		skb_clone_fraglist(n);
2203 | } |
2204 | |
2205 | skb_copy_header(n, skb); |
2206 | out: |
2207 | return n; |
2208 | } |
2209 | EXPORT_SYMBOL(__pskb_copy_fclone); |
2210 | |
2211 | /** |
2212 | * pskb_expand_head - reallocate header of &sk_buff |
2213 | * @skb: buffer to reallocate |
2214 | * @nhead: room to add at head |
2215 | * @ntail: room to add at tail |
2216 | * @gfp_mask: allocation priority |
2217 | * |
2218 | * Expands (or creates identical copy, if @nhead and @ntail are zero) |
2219 | * header of @skb. &sk_buff itself is not changed. &sk_buff MUST have |
 *	reference count of 1. Returns zero on success or a negative error
 *	code if expansion failed. In the latter case, &sk_buff is not changed.
2222 | * |
2223 | * All the pointers pointing into skb header may change and must be |
2224 | * reloaded after call to this function. |
2225 | */ |
2226 | |
2227 | int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, |
2228 | gfp_t gfp_mask) |
2229 | { |
2230 | unsigned int osize = skb_end_offset(skb); |
2231 | unsigned int size = osize + nhead + ntail; |
2232 | long off; |
2233 | u8 *data; |
2234 | int i; |
2235 | |
2236 | BUG_ON(nhead < 0); |
2237 | |
2238 | BUG_ON(skb_shared(skb)); |
2239 | |
2240 | skb_zcopy_downgrade_managed(skb); |
2241 | |
2242 | if (skb_pfmemalloc(skb)) |
2243 | gfp_mask |= __GFP_MEMALLOC; |
2244 | |
	data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
2246 | if (!data) |
2247 | goto nodata; |
2248 | size = SKB_WITH_OVERHEAD(size); |
2249 | |
2250 | /* Copy only real data... and, alas, header. This should be |
2251 | * optimized for the cases when header is void. |
2252 | */ |
2253 | memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); |
2254 | |
2255 | memcpy((struct skb_shared_info *)(data + size), |
2256 | skb_shinfo(skb), |
2257 | offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); |
2258 | |
2259 | /* |
2260 | * if shinfo is shared we must drop the old head gracefully, but if it |
2261 | * is not we can just drop the old head and let the existing refcount |
2262 | * be since all we did is relocate the values |
2263 | */ |
2264 | if (skb_cloned(skb)) { |
2265 | if (skb_orphan_frags(skb, gfp_mask)) |
2266 | goto nofrags; |
2267 | if (skb_zcopy(skb)) |
			refcount_inc(&skb_uarg(skb)->refcnt);
2269 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) |
			skb_frag_ref(skb, i);
2271 | |
2272 | if (skb_has_frag_list(skb)) |
2273 | skb_clone_fraglist(skb); |
2274 | |
		skb_release_data(skb, SKB_CONSUMED, false);
2276 | } else { |
		skb_free_head(skb, false);
2278 | } |
2279 | off = (data + nhead) - skb->head; |
2280 | |
2281 | skb->head = data; |
2282 | skb->head_frag = 0; |
2283 | skb->data += off; |
2284 | |
	skb_set_end_offset(skb, size);
2286 | #ifdef NET_SKBUFF_DATA_USES_OFFSET |
2287 | off = nhead; |
2288 | #endif |
2289 | skb->tail += off; |
2290 | skb_headers_offset_update(skb, nhead); |
2291 | skb->cloned = 0; |
2292 | skb->hdr_len = 0; |
2293 | skb->nohdr = 0; |
	atomic_set(&skb_shinfo(skb)->dataref, 1);
2295 | |
2296 | skb_metadata_clear(skb); |
2297 | |
2298 | /* It is not generally safe to change skb->truesize. |
2299 | * For the moment, we really care of rx path, or |
2300 | * when skb is orphaned (not attached to a socket). |
2301 | */ |
2302 | if (!skb->sk || skb->destructor == sock_edemux) |
2303 | skb->truesize += size - osize; |
2304 | |
2305 | return 0; |
2306 | |
2307 | nofrags: |
	skb_kfree_head(data, size);
2309 | nodata: |
2310 | return -ENOMEM; |
2311 | } |
2312 | EXPORT_SYMBOL(pskb_expand_head); |
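
/* Usage sketch (illustrative only, hypothetical caller): growing headroom
 * on an exclusively-owned skb before pushing a new header:
 *
 *	if (skb_headroom(skb) < needed &&
 *	    pskb_expand_head(skb, needed - skb_headroom(skb), 0, GFP_ATOMIC))
 *		goto drop;
 *	hdr = skb_push(skb, needed);
 *
 * "needed" and "hdr" are hypothetical locals; any cached pointers into the
 * old header must be re-derived after the call, as noted above.
 */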
2313 | |
2314 | /* Make private copy of skb with writable head and some headroom */ |
2315 | |
2316 | struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) |
2317 | { |
2318 | struct sk_buff *skb2; |
2319 | int delta = headroom - skb_headroom(skb); |
2320 | |
2321 | if (delta <= 0) |
2322 | skb2 = pskb_copy(skb, GFP_ATOMIC); |
2323 | else { |
2324 | skb2 = skb_clone(skb, GFP_ATOMIC); |
2325 | if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, |
2326 | GFP_ATOMIC)) { |
			kfree_skb(skb2);
2328 | skb2 = NULL; |
2329 | } |
2330 | } |
2331 | return skb2; |
2332 | } |
2333 | EXPORT_SYMBOL(skb_realloc_headroom); |
2334 | |
2335 | /* Note: We plan to rework this in linux-6.4 */ |
2336 | int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) |
2337 | { |
2338 | unsigned int saved_end_offset, saved_truesize; |
2339 | struct skb_shared_info *shinfo; |
2340 | int res; |
2341 | |
2342 | saved_end_offset = skb_end_offset(skb); |
2343 | saved_truesize = skb->truesize; |
2344 | |
2345 | res = pskb_expand_head(skb, 0, 0, pri); |
2346 | if (res) |
2347 | return res; |
2348 | |
2349 | skb->truesize = saved_truesize; |
2350 | |
2351 | if (likely(skb_end_offset(skb) == saved_end_offset)) |
2352 | return 0; |
2353 | |
2354 | /* We can not change skb->end if the original or new value |
2355 | * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head(). |
2356 | */ |
2357 | if (saved_end_offset == SKB_SMALL_HEAD_HEADROOM || |
2358 | skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM) { |
2359 | /* We think this path should not be taken. |
2360 | * Add a temporary trace to warn us just in case. |
2361 | */ |
		pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n",
			    saved_end_offset, skb_end_offset(skb));
2364 | WARN_ON_ONCE(1); |
2365 | return 0; |
2366 | } |
2367 | |
2368 | shinfo = skb_shinfo(skb); |
2369 | |
2370 | /* We are about to change back skb->end, |
2371 | * we need to move skb_shinfo() to its new location. |
2372 | */ |
2373 | memmove(skb->head + saved_end_offset, |
2374 | shinfo, |
2375 | offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); |
2376 | |
	skb_set_end_offset(skb, saved_end_offset);
2378 | |
2379 | return 0; |
2380 | } |
2381 | |
2382 | /** |
2383 | * skb_expand_head - reallocate header of &sk_buff |
2384 | * @skb: buffer to reallocate |
2385 | * @headroom: needed headroom |
2386 | * |
2387 | * Unlike skb_realloc_headroom, this one does not allocate a new skb |
2388 | * if possible; copies skb->sk to new skb as needed |
2389 | * and frees original skb in case of failures. |
2390 | * |
 *	It expects increased headroom and generates a warning otherwise.
2392 | */ |
2393 | |
2394 | struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) |
2395 | { |
2396 | int delta = headroom - skb_headroom(skb); |
2397 | int osize = skb_end_offset(skb); |
2398 | struct sock *sk = skb->sk; |
2399 | |
2400 | if (WARN_ONCE(delta <= 0, |
2401 | "%s is expecting an increase in the headroom" , __func__)) |
2402 | return skb; |
2403 | |
2404 | delta = SKB_DATA_ALIGN(delta); |
2405 | /* pskb_expand_head() might crash, if skb is shared. */ |
2406 | if (skb_shared(skb) || !is_skb_wmem(skb)) { |
2407 | struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); |
2408 | |
2409 | if (unlikely(!nskb)) |
2410 | goto fail; |
2411 | |
2412 | if (sk) |
			skb_set_owner_w(nskb, sk);
2414 | consume_skb(skb); |
2415 | skb = nskb; |
2416 | } |
2417 | if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) |
2418 | goto fail; |
2419 | |
2420 | if (sk && is_skb_wmem(skb)) { |
2421 | delta = skb_end_offset(skb) - osize; |
		refcount_add(delta, &sk->sk_wmem_alloc);
2423 | skb->truesize += delta; |
2424 | } |
2425 | return skb; |
2426 | |
2427 | fail: |
2428 | kfree_skb(skb); |
2429 | return NULL; |
2430 | } |
2431 | EXPORT_SYMBOL(skb_expand_head); |
2432 | |
2433 | /** |
2434 | * skb_copy_expand - copy and expand sk_buff |
2435 | * @skb: buffer to copy |
2436 | * @newheadroom: new free bytes at head |
2437 | * @newtailroom: new free bytes at tail |
2438 | * @gfp_mask: allocation priority |
2439 | * |
2440 | * Make a copy of both an &sk_buff and its data and while doing so |
2441 | * allocate additional space. |
2442 | * |
2443 | * This is used when the caller wishes to modify the data and needs a |
2444 | * private copy of the data to alter as well as more space for new fields. |
2445 | * Returns %NULL on failure or the pointer to the buffer |
2446 | * on success. The returned buffer has a reference count of 1. |
2447 | * |
2448 | * You must pass %GFP_ATOMIC as the allocation priority if this function |
2449 | * is called from an interrupt. |
2450 | */ |
2451 | struct sk_buff *skb_copy_expand(const struct sk_buff *skb, |
2452 | int newheadroom, int newtailroom, |
2453 | gfp_t gfp_mask) |
2454 | { |
2455 | /* |
2456 | * Allocate the copy buffer |
2457 | */ |
2458 | struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, |
2459 | gfp_mask, skb_alloc_rx_flag(skb), |
2460 | NUMA_NO_NODE); |
2461 | int oldheadroom = skb_headroom(skb); |
2462 | int head_copy_len, head_copy_off; |
2463 | |
2464 | if (!n) |
2465 | return NULL; |
2466 | |
	skb_reserve(n, newheadroom);
2468 | |
2469 | /* Set the tail pointer and length */ |
	skb_put(n, skb->len);
2471 | |
2472 | head_copy_len = oldheadroom; |
2473 | head_copy_off = 0; |
2474 | if (newheadroom <= head_copy_len) |
2475 | head_copy_len = newheadroom; |
2476 | else |
2477 | head_copy_off = newheadroom - head_copy_len; |
2478 | |
2479 | /* Copy the linear header and data. */ |
2480 | BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, |
2481 | skb->len + head_copy_len)); |
2482 | |
2483 | skb_copy_header(n, skb); |
2484 | |
2485 | skb_headers_offset_update(n, newheadroom - oldheadroom); |
2486 | |
2487 | return n; |
2488 | } |
2489 | EXPORT_SYMBOL(skb_copy_expand); |
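
/* Usage sketch (illustrative only, hypothetical caller): making a private
 * copy with room for an extra encapsulation header in front:
 *
 *	struct sk_buff *n = skb_copy_expand(skb, encap_len, 0, GFP_ATOMIC);
 *
 *	if (!n)
 *		return -ENOMEM;
 *	skb_push(n, encap_len);
 *
 * "encap_len" is a hypothetical caller-side value; the new headroom is
 * guaranteed by the skb_reserve() done above.
 */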
2490 | |
2491 | /** |
2492 | * __skb_pad - zero pad the tail of an skb |
2493 | * @skb: buffer to pad |
2494 | * @pad: space to pad |
2495 | * @free_on_error: free buffer on error |
2496 | * |
2497 | * Ensure that a buffer is followed by a padding area that is zero |
2498 | * filled. Used by network drivers which may DMA or transfer data |
2499 | * beyond the buffer end onto the wire. |
2500 | * |
2501 | * May return error in out of memory cases. The skb is freed on error |
2502 | * if @free_on_error is true. |
2503 | */ |
2504 | |
2505 | int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error) |
2506 | { |
2507 | int err; |
2508 | int ntail; |
2509 | |
	/* If the skbuff is non-linear, tailroom is always zero. */
2511 | if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) { |
2512 | memset(skb->data+skb->len, 0, pad); |
2513 | return 0; |
2514 | } |
2515 | |
2516 | ntail = skb->data_len + pad - (skb->end - skb->tail); |
2517 | if (likely(skb_cloned(skb) || ntail > 0)) { |
2518 | err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC); |
2519 | if (unlikely(err)) |
2520 | goto free_skb; |
2521 | } |
2522 | |
2523 | /* FIXME: The use of this function with non-linear skb's really needs |
2524 | * to be audited. |
2525 | */ |
2526 | err = skb_linearize(skb); |
2527 | if (unlikely(err)) |
2528 | goto free_skb; |
2529 | |
2530 | memset(skb->data + skb->len, 0, pad); |
2531 | return 0; |
2532 | |
2533 | free_skb: |
2534 | if (free_on_error) |
2535 | kfree_skb(skb); |
2536 | return err; |
2537 | } |
2538 | EXPORT_SYMBOL(__skb_pad); |
2539 | |
2540 | /** |
2541 | * pskb_put - add data to the tail of a potentially fragmented buffer |
2542 | * @skb: start of the buffer to use |
2543 | * @tail: tail fragment of the buffer to use |
2544 | * @len: amount of data to add |
2545 | * |
2546 | * This function extends the used data area of the potentially |
2547 | * fragmented buffer. @tail must be the last fragment of @skb -- or |
2548 | * @skb itself. If this would exceed the total buffer size the kernel |
2549 | * will panic. A pointer to the first byte of the extra data is |
2550 | * returned. |
2551 | */ |
2552 | |
2553 | void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) |
2554 | { |
2555 | if (tail != skb) { |
2556 | skb->data_len += len; |
2557 | skb->len += len; |
2558 | } |
	return skb_put(tail, len);
2560 | } |
2561 | EXPORT_SYMBOL_GPL(pskb_put); |
2562 | |
2563 | /** |
2564 | * skb_put - add data to a buffer |
2565 | * @skb: buffer to use |
2566 | * @len: amount of data to add |
2567 | * |
2568 | * This function extends the used data area of the buffer. If this would |
2569 | * exceed the total buffer size the kernel will panic. A pointer to the |
2570 | * first byte of the extra data is returned. |
2571 | */ |
2572 | void *skb_put(struct sk_buff *skb, unsigned int len) |
2573 | { |
2574 | void *tmp = skb_tail_pointer(skb); |
2575 | SKB_LINEAR_ASSERT(skb); |
2576 | skb->tail += len; |
2577 | skb->len += len; |
2578 | if (unlikely(skb->tail > skb->end)) |
		skb_over_panic(skb, len, __builtin_return_address(0));
2580 | return tmp; |
2581 | } |
2582 | EXPORT_SYMBOL(skb_put); |
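
/* Usage sketch (illustrative only, hypothetical caller): the common
 * allocate / reserve / fill pattern:
 *
 *	skb = alloc_skb(hlen + dlen, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_reserve(skb, hlen);
 *	memcpy(skb_put(skb, dlen), payload, dlen);
 *
 * "hlen", "dlen" and "payload" are hypothetical caller-side values.
 */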
2583 | |
2584 | /** |
2585 | * skb_push - add data to the start of a buffer |
2586 | * @skb: buffer to use |
2587 | * @len: amount of data to add |
2588 | * |
2589 | * This function extends the used data area of the buffer at the buffer |
2590 | * start. If this would exceed the total buffer headroom the kernel will |
2591 | * panic. A pointer to the first byte of the extra data is returned. |
2592 | */ |
2593 | void *skb_push(struct sk_buff *skb, unsigned int len) |
2594 | { |
2595 | skb->data -= len; |
2596 | skb->len += len; |
2597 | if (unlikely(skb->data < skb->head)) |
		skb_under_panic(skb, len, __builtin_return_address(0));
2599 | return skb->data; |
2600 | } |
2601 | EXPORT_SYMBOL(skb_push); |
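
/* Usage sketch (illustrative only, hypothetical caller): prepending a
 * header once the payload has been placed with skb_put():
 *
 *	struct my_proto_hdr *h = skb_push(skb, sizeof(*h));
 *
 *	h->len = htons(skb->len);
 *
 * "struct my_proto_hdr" is a hypothetical header type; the caller must
 * have reserved enough headroom beforehand.
 */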
2602 | |
2603 | /** |
2604 | * skb_pull - remove data from the start of a buffer |
2605 | * @skb: buffer to use |
2606 | * @len: amount of data to remove |
2607 | * |
2608 | * This function removes data from the start of a buffer, returning |
2609 | * the memory to the headroom. A pointer to the next data in the buffer |
2610 | * is returned. Once the data has been pulled future pushes will overwrite |
2611 | * the old data. |
2612 | */ |
2613 | void *skb_pull(struct sk_buff *skb, unsigned int len) |
2614 | { |
2615 | return skb_pull_inline(skb, len); |
2616 | } |
2617 | EXPORT_SYMBOL(skb_pull); |
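
/* Usage sketch (illustrative only, hypothetical caller): stripping an
 * outer header before handing the packet to the next layer:
 *
 *	if (!pskb_may_pull(skb, sizeof(struct ethhdr)))
 *		goto drop;
 *	skb_pull(skb, sizeof(struct ethhdr));
 *
 * pskb_may_pull() is used first so the pulled bytes are known to sit in
 * the linear area; skb_pull() itself does not linearize anything.
 */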
2618 | |
2619 | /** |
2620 | * skb_pull_data - remove data from the start of a buffer returning its |
2621 | * original position. |
2622 | * @skb: buffer to use |
2623 | * @len: amount of data to remove |
2624 | * |
2625 | * This function removes data from the start of a buffer, returning |
2626 | * the memory to the headroom. A pointer to the original data in the buffer |
2627 | * is returned after checking if there is enough data to pull. Once the |
2628 | * data has been pulled future pushes will overwrite the old data. |
2629 | */ |
2630 | void *skb_pull_data(struct sk_buff *skb, size_t len) |
2631 | { |
2632 | void *data = skb->data; |
2633 | |
2634 | if (skb->len < len) |
2635 | return NULL; |
2636 | |
2637 | skb_pull(skb, len); |
2638 | |
2639 | return data; |
2640 | } |
2641 | EXPORT_SYMBOL(skb_pull_data); |
2642 | |
2643 | /** |
2644 | * skb_trim - remove end from a buffer |
2645 | * @skb: buffer to alter |
2646 | * @len: new length |
2647 | * |
2648 | * Cut the length of a buffer down by removing data from the tail. If |
2649 | * the buffer is already under the length specified it is not modified. |
2650 | * The skb must be linear. |
2651 | */ |
2652 | void skb_trim(struct sk_buff *skb, unsigned int len) |
2653 | { |
2654 | if (skb->len > len) |
2655 | __skb_trim(skb, len); |
2656 | } |
2657 | EXPORT_SYMBOL(skb_trim); |
2658 | |
2659 | /* Trims skb to length len. It can change skb pointers. |
2660 | */ |
2661 | |
2662 | int ___pskb_trim(struct sk_buff *skb, unsigned int len) |
2663 | { |
2664 | struct sk_buff **fragp; |
2665 | struct sk_buff *frag; |
2666 | int offset = skb_headlen(skb); |
2667 | int nfrags = skb_shinfo(skb)->nr_frags; |
2668 | int i; |
2669 | int err; |
2670 | |
2671 | if (skb_cloned(skb) && |
2672 | unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))) |
2673 | return err; |
2674 | |
2675 | i = 0; |
2676 | if (offset >= len) |
2677 | goto drop_pages; |
2678 | |
2679 | for (; i < nfrags; i++) { |
		int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
2681 | |
2682 | if (end < len) { |
2683 | offset = end; |
2684 | continue; |
2685 | } |
2686 | |
		skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
2688 | |
2689 | drop_pages: |
2690 | skb_shinfo(skb)->nr_frags = i; |
2691 | |
2692 | for (; i < nfrags; i++) |
			skb_frag_unref(skb, i);
2694 | |
2695 | if (skb_has_frag_list(skb)) |
2696 | skb_drop_fraglist(skb); |
2697 | goto done; |
2698 | } |
2699 | |
2700 | for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp); |
2701 | fragp = &frag->next) { |
2702 | int end = offset + frag->len; |
2703 | |
		if (skb_shared(frag)) {
2705 | struct sk_buff *nfrag; |
2706 | |
2707 | nfrag = skb_clone(frag, GFP_ATOMIC); |
2708 | if (unlikely(!nfrag)) |
2709 | return -ENOMEM; |
2710 | |
2711 | nfrag->next = frag->next; |
2712 | consume_skb(frag); |
2713 | frag = nfrag; |
2714 | *fragp = frag; |
2715 | } |
2716 | |
2717 | if (end < len) { |
2718 | offset = end; |
2719 | continue; |
2720 | } |
2721 | |
2722 | if (end > len && |
2723 | unlikely((err = pskb_trim(frag, len - offset)))) |
2724 | return err; |
2725 | |
2726 | if (frag->next) |
			skb_drop_list(&frag->next);
2728 | break; |
2729 | } |
2730 | |
2731 | done: |
2732 | if (len > skb_headlen(skb)) { |
2733 | skb->data_len -= skb->len - len; |
2734 | skb->len = len; |
2735 | } else { |
2736 | skb->len = len; |
2737 | skb->data_len = 0; |
		skb_set_tail_pointer(skb, len);
2739 | } |
2740 | |
2741 | if (!skb->sk || skb->destructor == sock_edemux) |
2742 | skb_condense(skb); |
2743 | return 0; |
2744 | } |
2745 | EXPORT_SYMBOL(___pskb_trim); |
2746 | |
2747 | /* Note : use pskb_trim_rcsum() instead of calling this directly |
2748 | */ |
2749 | int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) |
2750 | { |
2751 | if (skb->ip_summed == CHECKSUM_COMPLETE) { |
2752 | int delta = skb->len - len; |
2753 | |
		skb->csum = csum_block_sub(skb->csum,
					   skb_checksum(skb, len, delta, 0),
					   len);
2757 | } else if (skb->ip_summed == CHECKSUM_PARTIAL) { |
2758 | int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; |
2759 | int offset = skb_checksum_start_offset(skb) + skb->csum_offset; |
2760 | |
2761 | if (offset + sizeof(__sum16) > hdlen) |
2762 | return -EINVAL; |
2763 | } |
2764 | return __pskb_trim(skb, len); |
2765 | } |
2766 | EXPORT_SYMBOL(pskb_trim_rcsum_slow); |
2767 | |
2768 | /** |
2769 | * __pskb_pull_tail - advance tail of skb header |
2770 | * @skb: buffer to reallocate |
2771 | * @delta: number of bytes to advance tail |
2772 | * |
 *	The function makes sense only on a fragmented &sk_buff,
2774 | * it expands header moving its tail forward and copying necessary |
2775 | * data from fragmented part. |
2776 | * |
2777 | * &sk_buff MUST have reference count of 1. |
2778 | * |
2779 | * Returns %NULL (and &sk_buff does not change) if pull failed |
2780 | * or value of new tail of skb in the case of success. |
2781 | * |
2782 | * All the pointers pointing into skb header may change and must be |
2783 | * reloaded after call to this function. |
2784 | */ |
2785 | |
2786 | /* Moves tail of skb head forward, copying data from fragmented part, |
2787 | * when it is necessary. |
2788 | * 1. It may fail due to malloc failure. |
2789 | * 2. It may change skb pointers. |
2790 | * |
2791 | * It is pretty complicated. Luckily, it is called only in exceptional cases. |
2792 | */ |
2793 | void *__pskb_pull_tail(struct sk_buff *skb, int delta) |
2794 | { |
2795 | /* If skb has not enough free space at tail, get new one |
2796 | * plus 128 bytes for future expansions. If we have enough |
2797 | * room at tail, reallocate without expansion only if skb is cloned. |
2798 | */ |
2799 | int i, k, eat = (skb->tail + delta) - skb->end; |
2800 | |
2801 | if (eat > 0 || skb_cloned(skb)) { |
2802 | if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, |
2803 | GFP_ATOMIC)) |
2804 | return NULL; |
2805 | } |
2806 | |
2807 | BUG_ON(skb_copy_bits(skb, skb_headlen(skb), |
2808 | skb_tail_pointer(skb), delta)); |
2809 | |
2810 | /* Optimization: no fragments, no reasons to preestimate |
2811 | * size of pulled pages. Superb. |
2812 | */ |
2813 | if (!skb_has_frag_list(skb)) |
2814 | goto pull_pages; |
2815 | |
2816 | /* Estimate size of pulled pages. */ |
2817 | eat = delta; |
2818 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
2820 | |
2821 | if (size >= eat) |
2822 | goto pull_pages; |
2823 | eat -= size; |
2824 | } |
2825 | |
	/* If we need to update the frag list, we are in trouble.
2827 | * Certainly, it is possible to add an offset to skb data, |
2828 | * but taking into account that pulling is expected to |
2829 | * be very rare operation, it is worth to fight against |
2830 | * further bloating skb head and crucify ourselves here instead. |
	 * Pure masochism, indeed. 8)8)
2832 | */ |
2833 | if (eat) { |
2834 | struct sk_buff *list = skb_shinfo(skb)->frag_list; |
2835 | struct sk_buff *clone = NULL; |
2836 | struct sk_buff *insp = NULL; |
2837 | |
2838 | do { |
2839 | if (list->len <= eat) { |
2840 | /* Eaten as whole. */ |
2841 | eat -= list->len; |
2842 | list = list->next; |
2843 | insp = list; |
2844 | } else { |
2845 | /* Eaten partially. */ |
2846 | if (skb_is_gso(skb) && !list->head_frag && |
				    skb_headlen(list))
2848 | skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; |
2849 | |
				if (skb_shared(list)) {
2851 | /* Sucks! We need to fork list. :-( */ |
2852 | clone = skb_clone(list, GFP_ATOMIC); |
2853 | if (!clone) |
2854 | return NULL; |
2855 | insp = list->next; |
2856 | list = clone; |
2857 | } else { |
2858 | /* This may be pulled without |
2859 | * problems. */ |
2860 | insp = list; |
2861 | } |
				if (!pskb_pull(list, eat)) {
					kfree_skb(clone);
2864 | return NULL; |
2865 | } |
2866 | break; |
2867 | } |
2868 | } while (eat); |
2869 | |
2870 | /* Free pulled out fragments. */ |
2871 | while ((list = skb_shinfo(skb)->frag_list) != insp) { |
2872 | skb_shinfo(skb)->frag_list = list->next; |
2873 | consume_skb(list); |
2874 | } |
2875 | /* And insert new clone at head. */ |
2876 | if (clone) { |
2877 | clone->next = list; |
2878 | skb_shinfo(skb)->frag_list = clone; |
2879 | } |
2880 | } |
2881 | /* Success! Now we may commit changes to skb data. */ |
2882 | |
2883 | pull_pages: |
2884 | eat = delta; |
2885 | k = 0; |
2886 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
2888 | |
2889 | if (size <= eat) { |
			skb_frag_unref(skb, i);
2891 | eat -= size; |
2892 | } else { |
2893 | skb_frag_t *frag = &skb_shinfo(skb)->frags[k]; |
2894 | |
2895 | *frag = skb_shinfo(skb)->frags[i]; |
2896 | if (eat) { |
				skb_frag_off_add(frag, eat);
				skb_frag_size_sub(frag, eat);
2899 | if (!i) |
2900 | goto end; |
2901 | eat = 0; |
2902 | } |
2903 | k++; |
2904 | } |
2905 | } |
2906 | skb_shinfo(skb)->nr_frags = k; |
2907 | |
2908 | end: |
2909 | skb->tail += delta; |
2910 | skb->data_len -= delta; |
2911 | |
2912 | if (!skb->data_len) |
		skb_zcopy_clear(skb, false);
2914 | |
2915 | return skb_tail_pointer(skb); |
2916 | } |
2917 | EXPORT_SYMBOL(__pskb_pull_tail); |
2918 | |
2919 | /** |
2920 | * skb_copy_bits - copy bits from skb to kernel buffer |
2921 | * @skb: source skb |
2922 | * @offset: offset in source |
2923 | * @to: destination buffer |
2924 | * @len: number of bytes to copy |
2925 | * |
2926 | * Copy the specified number of bytes from the source skb to the |
2927 | * destination buffer. |
2928 | * |
2929 | * CAUTION ! : |
2930 | * If its prototype is ever changed, |
2931 | * check arch/{*}/net/{*}.S files, |
2932 | * since it is called from BPF assembly code. |
2933 | */ |
2934 | int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) |
2935 | { |
2936 | int start = skb_headlen(skb); |
2937 | struct sk_buff *frag_iter; |
2938 | int i, copy; |
2939 | |
2940 | if (offset > (int)skb->len - len) |
2941 | goto fault; |
2942 | |
2943 | /* Copy header. */ |
2944 | if ((copy = start - offset) > 0) { |
2945 | if (copy > len) |
2946 | copy = len; |
		skb_copy_from_linear_data_offset(skb, offset, to, copy);
2948 | if ((len -= copy) == 0) |
2949 | return 0; |
2950 | offset += copy; |
2951 | to += copy; |
2952 | } |
2953 | |
2954 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
2955 | int end; |
2956 | skb_frag_t *f = &skb_shinfo(skb)->frags[i]; |
2957 | |
2958 | WARN_ON(start > offset + len); |
2959 | |
		end = start + skb_frag_size(f);
2961 | if ((copy = end - offset) > 0) { |
2962 | u32 p_off, p_len, copied; |
2963 | struct page *p; |
2964 | u8 *vaddr; |
2965 | |
2966 | if (copy > len) |
2967 | copy = len; |
2968 | |
2969 | skb_frag_foreach_page(f, |
2970 | skb_frag_off(f) + offset - start, |
2971 | copy, p, p_off, p_len, copied) { |
				vaddr = kmap_atomic(p);
2973 | memcpy(to + copied, vaddr + p_off, p_len); |
2974 | kunmap_atomic(vaddr); |
2975 | } |
2976 | |
2977 | if ((len -= copy) == 0) |
2978 | return 0; |
2979 | offset += copy; |
2980 | to += copy; |
2981 | } |
2982 | start = end; |
2983 | } |
2984 | |
2985 | skb_walk_frags(skb, frag_iter) { |
2986 | int end; |
2987 | |
2988 | WARN_ON(start > offset + len); |
2989 | |
2990 | end = start + frag_iter->len; |
2991 | if ((copy = end - offset) > 0) { |
2992 | if (copy > len) |
2993 | copy = len; |
			if (skb_copy_bits(frag_iter, offset - start, to, copy))
2995 | goto fault; |
2996 | if ((len -= copy) == 0) |
2997 | return 0; |
2998 | offset += copy; |
2999 | to += copy; |
3000 | } |
3001 | start = end; |
3002 | } |
3003 | |
3004 | if (!len) |
3005 | return 0; |
3006 | |
3007 | fault: |
3008 | return -EFAULT; |
3009 | } |
3010 | EXPORT_SYMBOL(skb_copy_bits); |
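
/* Usage sketch (illustrative only, hypothetical caller): extracting a
 * possibly non-linear header into a local buffer for inspection:
 *
 *	struct udphdr uh;
 *
 *	if (skb_copy_bits(skb, thoff, &uh, sizeof(uh)) < 0)
 *		goto drop;
 *
 * "thoff" stands for the transport header offset in the hypothetical
 * caller; the copy works regardless of how the skb data is fragmented.
 */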
3011 | |
3012 | /* |
3013 | * Callback from splice_to_pipe(), if we need to release some pages |
3014 | * at the end of the spd in case we error'ed out in filling the pipe. |
3015 | */ |
3016 | static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) |
3017 | { |
	put_page(spd->pages[i]);
3019 | } |
3020 | |
3021 | static struct page *linear_to_page(struct page *page, unsigned int *len, |
3022 | unsigned int *offset, |
3023 | struct sock *sk) |
3024 | { |
3025 | struct page_frag *pfrag = sk_page_frag(sk); |
3026 | |
3027 | if (!sk_page_frag_refill(sk, pfrag)) |
3028 | return NULL; |
3029 | |
3030 | *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset); |
3031 | |
3032 | memcpy(page_address(pfrag->page) + pfrag->offset, |
3033 | page_address(page) + *offset, *len); |
3034 | *offset = pfrag->offset; |
3035 | pfrag->offset += *len; |
3036 | |
3037 | return pfrag->page; |
3038 | } |
3039 | |
3040 | static bool spd_can_coalesce(const struct splice_pipe_desc *spd, |
3041 | struct page *page, |
3042 | unsigned int offset) |
3043 | { |
3044 | return spd->nr_pages && |
3045 | spd->pages[spd->nr_pages - 1] == page && |
3046 | (spd->partial[spd->nr_pages - 1].offset + |
3047 | spd->partial[spd->nr_pages - 1].len == offset); |
3048 | } |
3049 | |
3050 | /* |
3051 | * Fill page/offset/length into spd, if it can hold more pages. |
3052 | */ |
3053 | static bool spd_fill_page(struct splice_pipe_desc *spd, |
3054 | struct pipe_inode_info *pipe, struct page *page, |
3055 | unsigned int *len, unsigned int offset, |
3056 | bool linear, |
3057 | struct sock *sk) |
3058 | { |
3059 | if (unlikely(spd->nr_pages == MAX_SKB_FRAGS)) |
3060 | return true; |
3061 | |
3062 | if (linear) { |
		page = linear_to_page(page, len, &offset, sk);
3064 | if (!page) |
3065 | return true; |
3066 | } |
3067 | if (spd_can_coalesce(spd, page, offset)) { |
3068 | spd->partial[spd->nr_pages - 1].len += *len; |
3069 | return false; |
3070 | } |
3071 | get_page(page); |
3072 | spd->pages[spd->nr_pages] = page; |
3073 | spd->partial[spd->nr_pages].len = *len; |
3074 | spd->partial[spd->nr_pages].offset = offset; |
3075 | spd->nr_pages++; |
3076 | |
3077 | return false; |
3078 | } |
3079 | |
3080 | static bool __splice_segment(struct page *page, unsigned int poff, |
3081 | unsigned int plen, unsigned int *off, |
3082 | unsigned int *len, |
3083 | struct splice_pipe_desc *spd, bool linear, |
3084 | struct sock *sk, |
3085 | struct pipe_inode_info *pipe) |
3086 | { |
3087 | if (!*len) |
3088 | return true; |
3089 | |
3090 | /* skip this segment if already processed */ |
3091 | if (*off >= plen) { |
3092 | *off -= plen; |
3093 | return false; |
3094 | } |
3095 | |
3096 | /* ignore any bits we already processed */ |
3097 | poff += *off; |
3098 | plen -= *off; |
3099 | *off = 0; |
3100 | |
3101 | do { |
3102 | unsigned int flen = min(*len, plen); |
3103 | |
		if (spd_fill_page(spd, pipe, page, &flen, poff,
3105 | linear, sk)) |
3106 | return true; |
3107 | poff += flen; |
3108 | plen -= flen; |
3109 | *len -= flen; |
3110 | } while (*len && plen); |
3111 | |
3112 | return false; |
3113 | } |
3114 | |
3115 | /* |
3116 | * Map linear and fragment data from the skb to spd. It reports true if the |
3117 | * pipe is full or if we already spliced the requested length. |
3118 | */ |
3119 | static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, |
3120 | unsigned int *offset, unsigned int *len, |
3121 | struct splice_pipe_desc *spd, struct sock *sk) |
3122 | { |
3123 | int seg; |
3124 | struct sk_buff *iter; |
3125 | |
3126 | /* map the linear part : |
3127 | * If skb->head_frag is set, this 'linear' part is backed by a |
3128 | * fragment, and if the head is not shared with any clones then |
3129 | * we can avoid a copy since we own the head portion of this page. |
3130 | */ |
3131 | if (__splice_segment(virt_to_page(skb->data), |
			     (unsigned long) skb->data & (PAGE_SIZE - 1),
			     skb_headlen(skb),
			     offset, len, spd,
			     skb_head_is_locked(skb),
3136 | sk, pipe)) |
3137 | return true; |
3138 | |
3139 | /* |
3140 | * then map the fragments |
3141 | */ |
3142 | for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) { |
3143 | const skb_frag_t *f = &skb_shinfo(skb)->frags[seg]; |
3144 | |
		if (__splice_segment(skb_frag_page(f),
				     skb_frag_off(f), skb_frag_size(f),
				     offset, len, spd, false, sk, pipe))
3148 | return true; |
3149 | } |
3150 | |
3151 | skb_walk_frags(skb, iter) { |
3152 | if (*offset >= iter->len) { |
3153 | *offset -= iter->len; |
3154 | continue; |
3155 | } |
3156 | /* __skb_splice_bits() only fails if the output has no room |
3157 | * left, so no point in going over the frag_list for the error |
3158 | * case. |
3159 | */ |
		if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
3161 | return true; |
3162 | } |
3163 | |
3164 | return false; |
3165 | } |
3166 | |
3167 | /* |
3168 | * Map data from the skb to a pipe. Should handle both the linear part, |
3169 | * the fragments, and the frag list. |
3170 | */ |
3171 | int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, |
3172 | struct pipe_inode_info *pipe, unsigned int tlen, |
3173 | unsigned int flags) |
3174 | { |
3175 | struct partial_page partial[MAX_SKB_FRAGS]; |
3176 | struct page *pages[MAX_SKB_FRAGS]; |
3177 | struct splice_pipe_desc spd = { |
3178 | .pages = pages, |
3179 | .partial = partial, |
3180 | .nr_pages_max = MAX_SKB_FRAGS, |
3181 | .ops = &nosteal_pipe_buf_ops, |
3182 | .spd_release = sock_spd_release, |
3183 | }; |
3184 | int ret = 0; |
3185 | |
	__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
3187 | |
3188 | if (spd.nr_pages) |
		ret = splice_to_pipe(pipe, &spd);
3190 | |
3191 | return ret; |
3192 | } |
3193 | EXPORT_SYMBOL_GPL(skb_splice_bits); |
3194 | |
3195 | static int sendmsg_locked(struct sock *sk, struct msghdr *msg) |
3196 | { |
3197 | struct socket *sock = sk->sk_socket; |
3198 | size_t size = msg_data_left(msg); |
3199 | |
3200 | if (!sock) |
3201 | return -EINVAL; |
3202 | |
3203 | if (!sock->ops->sendmsg_locked) |
		return sock_no_sendmsg_locked(sk, msg, size);
3205 | |
3206 | return sock->ops->sendmsg_locked(sk, msg, size); |
3207 | } |
3208 | |
3209 | static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) |
3210 | { |
3211 | struct socket *sock = sk->sk_socket; |
3212 | |
3213 | if (!sock) |
3214 | return -EINVAL; |
3215 | return sock_sendmsg(sock, msg); |
3216 | } |
3217 | |
3218 | typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); |
3219 | static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, |
3220 | int len, sendmsg_func sendmsg) |
3221 | { |
3222 | unsigned int orig_len = len; |
3223 | struct sk_buff *head = skb; |
3224 | unsigned short fragidx; |
3225 | int slen, ret; |
3226 | |
3227 | do_frag_list: |
3228 | |
3229 | /* Deal with head data */ |
3230 | while (offset < skb_headlen(skb) && len) { |
3231 | struct kvec kv; |
3232 | struct msghdr msg; |
3233 | |
3234 | slen = min_t(int, len, skb_headlen(skb) - offset); |
3235 | kv.iov_base = skb->data + offset; |
3236 | kv.iov_len = slen; |
3237 | memset(&msg, 0, sizeof(msg)); |
3238 | msg.msg_flags = MSG_DONTWAIT; |
3239 | |
		iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
3241 | ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, |
3242 | sendmsg_unlocked, sk, &msg); |
3243 | if (ret <= 0) |
3244 | goto error; |
3245 | |
3246 | offset += ret; |
3247 | len -= ret; |
3248 | } |
3249 | |
3250 | /* All the data was skb head? */ |
3251 | if (!len) |
3252 | goto out; |
3253 | |
3254 | /* Make offset relative to start of frags */ |
3255 | offset -= skb_headlen(skb); |
3256 | |
3257 | /* Find where we are in frag list */ |
3258 | for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { |
3259 | skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; |
3260 | |
3261 | if (offset < skb_frag_size(frag)) |
3262 | break; |
3263 | |
3264 | offset -= skb_frag_size(frag); |
3265 | } |
3266 | |
3267 | for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) { |
3268 | skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx]; |
3269 | |
3270 | slen = min_t(size_t, len, skb_frag_size(frag) - offset); |
3271 | |
3272 | while (slen) { |
3273 | struct bio_vec bvec; |
3274 | struct msghdr msg = { |
3275 | .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT, |
3276 | }; |
3277 | |
			bvec_set_page(&bvec, skb_frag_page(frag), slen,
				      skb_frag_off(frag) + offset);
			iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1,
				      slen);
3282 | |
3283 | ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, |
3284 | sendmsg_unlocked, sk, &msg); |
3285 | if (ret <= 0) |
3286 | goto error; |
3287 | |
3288 | len -= ret; |
3289 | offset += ret; |
3290 | slen -= ret; |
3291 | } |
3292 | |
3293 | offset = 0; |
3294 | } |
3295 | |
3296 | if (len) { |
3297 | /* Process any frag lists */ |
3298 | |
3299 | if (skb == head) { |
3300 | if (skb_has_frag_list(skb)) { |
3301 | skb = skb_shinfo(skb)->frag_list; |
3302 | goto do_frag_list; |
3303 | } |
3304 | } else if (skb->next) { |
3305 | skb = skb->next; |
3306 | goto do_frag_list; |
3307 | } |
3308 | } |
3309 | |
3310 | out: |
3311 | return orig_len - len; |
3312 | |
3313 | error: |
3314 | return orig_len == len ? ret : orig_len - len; |
3315 | } |
3316 | |
3317 | /* Send skb data on a socket. Socket must be locked. */ |
3318 | int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, |
3319 | int len) |
3320 | { |
	return __skb_send_sock(sk, skb, offset, len, sendmsg_locked);
3322 | } |
3323 | EXPORT_SYMBOL_GPL(skb_send_sock_locked); |
3324 | |
3325 | /* Send skb data on a socket. Socket must be unlocked. */ |
3326 | int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) |
3327 | { |
	return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked);
3329 | } |
3330 | |
3331 | /** |
3332 | * skb_store_bits - store bits from kernel buffer to skb |
3333 | * @skb: destination buffer |
3334 | * @offset: offset in destination |
3335 | * @from: source buffer |
3336 | * @len: number of bytes to copy |
3337 | * |
3338 | * Copy the specified number of bytes from the source buffer to the |
3339 | * destination skb. This function handles all the messy bits of |
3340 | * traversing fragment lists and such. |
3341 | */ |
3342 | |
3343 | int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) |
3344 | { |
3345 | int start = skb_headlen(skb); |
3346 | struct sk_buff *frag_iter; |
3347 | int i, copy; |
3348 | |
3349 | if (offset > (int)skb->len - len) |
3350 | goto fault; |
3351 | |
3352 | if ((copy = start - offset) > 0) { |
3353 | if (copy > len) |
3354 | copy = len; |
		skb_copy_to_linear_data_offset(skb, offset, from, copy);
3356 | if ((len -= copy) == 0) |
3357 | return 0; |
3358 | offset += copy; |
3359 | from += copy; |
3360 | } |
3361 | |
3362 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
3363 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; |
3364 | int end; |
3365 | |
3366 | WARN_ON(start > offset + len); |
3367 | |
3368 | end = start + skb_frag_size(frag); |
3369 | if ((copy = end - offset) > 0) { |
3370 | u32 p_off, p_len, copied; |
3371 | struct page *p; |
3372 | u8 *vaddr; |
3373 | |
3374 | if (copy > len) |
3375 | copy = len; |
3376 | |
3377 | skb_frag_foreach_page(frag, |
3378 | skb_frag_off(frag) + offset - start, |
3379 | copy, p, p_off, p_len, copied) { |
				vaddr = kmap_atomic(p);
3381 | memcpy(vaddr + p_off, from + copied, p_len); |
3382 | kunmap_atomic(vaddr); |
3383 | } |
3384 | |
3385 | if ((len -= copy) == 0) |
3386 | return 0; |
3387 | offset += copy; |
3388 | from += copy; |
3389 | } |
3390 | start = end; |
3391 | } |
3392 | |
3393 | skb_walk_frags(skb, frag_iter) { |
3394 | int end; |
3395 | |
3396 | WARN_ON(start > offset + len); |
3397 | |
3398 | end = start + frag_iter->len; |
3399 | if ((copy = end - offset) > 0) { |
3400 | if (copy > len) |
3401 | copy = len; |
			if (skb_store_bits(frag_iter, offset - start,
					   from, copy))
3404 | goto fault; |
3405 | if ((len -= copy) == 0) |
3406 | return 0; |
3407 | offset += copy; |
3408 | from += copy; |
3409 | } |
3410 | start = end; |
3411 | } |
3412 | if (!len) |
3413 | return 0; |
3414 | |
3415 | fault: |
3416 | return -EFAULT; |
3417 | } |
3418 | EXPORT_SYMBOL(skb_store_bits); |
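
/* Usage sketch (illustrative only): overwrite a few bytes at a known offset
 * in a possibly non-linear skb.  skb_store_bits() returns -EFAULT if the
 * requested range does not fit inside skb->len.
 *
 *	u8 patch[4] = { 0xde, 0xad, 0xbe, 0xef };
 *
 *	if (skb_store_bits(skb, 16, patch, sizeof(patch)))
 *		pr_debug("offset 16 is beyond the packet\n");
 */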
3419 | |
3420 | /* Checksum skb data. */ |
3421 | __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, |
3422 | __wsum csum, const struct skb_checksum_ops *ops) |
3423 | { |
3424 | int start = skb_headlen(skb); |
3425 | int i, copy = start - offset; |
3426 | struct sk_buff *frag_iter; |
3427 | int pos = 0; |
3428 | |
3429 | /* Checksum header. */ |
3430 | if (copy > 0) { |
3431 | if (copy > len) |
3432 | copy = len; |
3433 | csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, |
3434 | skb->data + offset, copy, csum); |
3435 | if ((len -= copy) == 0) |
3436 | return csum; |
3437 | offset += copy; |
3438 | pos = copy; |
3439 | } |
3440 | |
3441 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
3442 | int end; |
3443 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; |
3444 | |
3445 | WARN_ON(start > offset + len); |
3446 | |
3447 | end = start + skb_frag_size(frag); |
3448 | if ((copy = end - offset) > 0) { |
3449 | u32 p_off, p_len, copied; |
3450 | struct page *p; |
3451 | __wsum csum2; |
3452 | u8 *vaddr; |
3453 | |
3454 | if (copy > len) |
3455 | copy = len; |
3456 | |
3457 | skb_frag_foreach_page(frag, |
3458 | skb_frag_off(frag) + offset - start, |
3459 | copy, p, p_off, p_len, copied) { |
				vaddr = kmap_atomic(p);
3461 | csum2 = INDIRECT_CALL_1(ops->update, |
3462 | csum_partial_ext, |
3463 | vaddr + p_off, p_len, 0); |
3464 | kunmap_atomic(vaddr); |
3465 | csum = INDIRECT_CALL_1(ops->combine, |
3466 | csum_block_add_ext, csum, |
3467 | csum2, pos, p_len); |
3468 | pos += p_len; |
3469 | } |
3470 | |
3471 | if (!(len -= copy)) |
3472 | return csum; |
3473 | offset += copy; |
3474 | } |
3475 | start = end; |
3476 | } |
3477 | |
3478 | skb_walk_frags(skb, frag_iter) { |
3479 | int end; |
3480 | |
3481 | WARN_ON(start > offset + len); |
3482 | |
3483 | end = start + frag_iter->len; |
3484 | if ((copy = end - offset) > 0) { |
3485 | __wsum csum2; |
3486 | if (copy > len) |
3487 | copy = len; |
			csum2 = __skb_checksum(frag_iter, offset - start,
					       copy, 0, ops);
3490 | csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, |
3491 | csum, csum2, pos, copy); |
3492 | if ((len -= copy) == 0) |
3493 | return csum; |
3494 | offset += copy; |
3495 | pos += copy; |
3496 | } |
3497 | start = end; |
3498 | } |
3499 | BUG_ON(len); |
3500 | |
3501 | return csum; |
3502 | } |
3503 | EXPORT_SYMBOL(__skb_checksum); |
3504 | |
3505 | __wsum skb_checksum(const struct sk_buff *skb, int offset, |
3506 | int len, __wsum csum) |
3507 | { |
3508 | const struct skb_checksum_ops ops = { |
3509 | .update = csum_partial_ext, |
3510 | .combine = csum_block_add_ext, |
3511 | }; |
3512 | |
3513 | return __skb_checksum(skb, offset, len, csum, &ops); |
3514 | } |
3515 | EXPORT_SYMBOL(skb_checksum); |
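
/* Usage sketch (illustrative only): checksum the whole packet and fold the
 * 32-bit partial sum into its final 16-bit form with csum_fold(), as the
 * validation helpers further down in this file do.
 *
 *	__wsum partial = skb_checksum(skb, 0, skb->len, 0);
 *	__sum16 folded = csum_fold(partial);
 */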
3516 | |
3517 | /* Both of above in one bottle. */ |
3518 | |
3519 | __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, |
3520 | u8 *to, int len) |
3521 | { |
3522 | int start = skb_headlen(skb); |
3523 | int i, copy = start - offset; |
3524 | struct sk_buff *frag_iter; |
3525 | int pos = 0; |
3526 | __wsum csum = 0; |
3527 | |
3528 | /* Copy header. */ |
3529 | if (copy > 0) { |
3530 | if (copy > len) |
3531 | copy = len; |
		csum = csum_partial_copy_nocheck(skb->data + offset, to,
						 copy);
3534 | if ((len -= copy) == 0) |
3535 | return csum; |
3536 | offset += copy; |
3537 | to += copy; |
3538 | pos = copy; |
3539 | } |
3540 | |
3541 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
3542 | int end; |
3543 | |
3544 | WARN_ON(start > offset + len); |
3545 | |
		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
3547 | if ((copy = end - offset) > 0) { |
3548 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; |
3549 | u32 p_off, p_len, copied; |
3550 | struct page *p; |
3551 | __wsum csum2; |
3552 | u8 *vaddr; |
3553 | |
3554 | if (copy > len) |
3555 | copy = len; |
3556 | |
3557 | skb_frag_foreach_page(frag, |
3558 | skb_frag_off(frag) + offset - start, |
3559 | copy, p, p_off, p_len, copied) { |
				vaddr = kmap_atomic(p);
				csum2 = csum_partial_copy_nocheck(vaddr + p_off,
								  to + copied,
								  p_len);
				kunmap_atomic(vaddr);
				csum = csum_block_add(csum, csum2, pos);
3566 | pos += p_len; |
3567 | } |
3568 | |
3569 | if (!(len -= copy)) |
3570 | return csum; |
3571 | offset += copy; |
3572 | to += copy; |
3573 | } |
3574 | start = end; |
3575 | } |
3576 | |
3577 | skb_walk_frags(skb, frag_iter) { |
3578 | __wsum csum2; |
3579 | int end; |
3580 | |
3581 | WARN_ON(start > offset + len); |
3582 | |
3583 | end = start + frag_iter->len; |
3584 | if ((copy = end - offset) > 0) { |
3585 | if (copy > len) |
3586 | copy = len; |
			csum2 = skb_copy_and_csum_bits(frag_iter,
						       offset - start,
						       to, copy);
			csum = csum_block_add(csum, csum2, pos);
3591 | if ((len -= copy) == 0) |
3592 | return csum; |
3593 | offset += copy; |
3594 | to += copy; |
3595 | pos += copy; |
3596 | } |
3597 | start = end; |
3598 | } |
3599 | BUG_ON(len); |
3600 | return csum; |
3601 | } |
3602 | EXPORT_SYMBOL(skb_copy_and_csum_bits); |
3603 | |
3604 | __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) |
3605 | { |
3606 | __sum16 sum; |
3607 | |
	sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
3609 | /* See comments in __skb_checksum_complete(). */ |
3610 | if (likely(!sum)) { |
3611 | if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && |
3612 | !skb->csum_complete_sw) |
			netdev_rx_csum_fault(skb->dev, skb);
3614 | } |
3615 | if (!skb_shared(skb)) |
3616 | skb->csum_valid = !sum; |
3617 | return sum; |
3618 | } |
3619 | EXPORT_SYMBOL(__skb_checksum_complete_head); |
3620 | |
3621 | /* This function assumes skb->csum already holds pseudo header's checksum, |
3622 | * which has been changed from the hardware checksum, for example, by |
3623 | * __skb_checksum_validate_complete(). And, the original skb->csum must |
3624 | * have been validated unsuccessfully for CHECKSUM_COMPLETE case. |
3625 | * |
3626 | * It returns non-zero if the recomputed checksum is still invalid, otherwise |
3627 | * zero. The new checksum is stored back into skb->csum unless the skb is |
3628 | * shared. |
3629 | */ |
3630 | __sum16 __skb_checksum_complete(struct sk_buff *skb) |
3631 | { |
3632 | __wsum csum; |
3633 | __sum16 sum; |
3634 | |
3635 | csum = skb_checksum(skb, 0, skb->len, 0); |
3636 | |
	sum = csum_fold(csum_add(skb->csum, csum));
3638 | /* This check is inverted, because we already knew the hardware |
3639 | * checksum is invalid before calling this function. So, if the |
3640 | * re-computed checksum is valid instead, then we have a mismatch |
3641 | * between the original skb->csum and skb_checksum(). This means either |
3642 | * the original hardware checksum is incorrect or we screw up skb->csum |
3643 | * when moving skb->data around. |
3644 | */ |
3645 | if (likely(!sum)) { |
3646 | if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && |
3647 | !skb->csum_complete_sw) |
			netdev_rx_csum_fault(skb->dev, skb);
3649 | } |
3650 | |
3651 | if (!skb_shared(skb)) { |
3652 | /* Save full packet checksum */ |
3653 | skb->csum = csum; |
3654 | skb->ip_summed = CHECKSUM_COMPLETE; |
3655 | skb->csum_complete_sw = 1; |
3656 | skb->csum_valid = !sum; |
3657 | } |
3658 | |
3659 | return sum; |
3660 | } |
3661 | EXPORT_SYMBOL(__skb_checksum_complete); |
3662 | |
3663 | static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) |
3664 | { |
	net_warn_ratelimited(
		"%s: attempt to compute crc32c without libcrc32c.ko\n",
		__func__);
3668 | return 0; |
3669 | } |
3670 | |
3671 | static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, |
3672 | int offset, int len) |
3673 | { |
	net_warn_ratelimited(
		"%s: attempt to compute crc32c without libcrc32c.ko\n",
		__func__);
3677 | return 0; |
3678 | } |
3679 | |
3680 | static const struct skb_checksum_ops default_crc32c_ops = { |
3681 | .update = warn_crc32c_csum_update, |
3682 | .combine = warn_crc32c_csum_combine, |
3683 | }; |
3684 | |
3685 | const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = |
3686 | &default_crc32c_ops; |
3687 | EXPORT_SYMBOL(crc32c_csum_stub); |
3688 | |
3689 | /** |
3690 | * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() |
3691 | * @from: source buffer |
3692 | * |
3693 | * Calculates the amount of linear headroom needed in the 'to' skb passed |
3694 | * into skb_zerocopy(). |
3695 | */ |
3696 | unsigned int |
3697 | skb_zerocopy_headlen(const struct sk_buff *from) |
3698 | { |
3699 | unsigned int hlen = 0; |
3700 | |
3701 | if (!from->head_frag || |
	    skb_headlen(from) < L1_CACHE_BYTES ||
3703 | skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { |
		hlen = skb_headlen(from);
3705 | if (!hlen) |
3706 | hlen = from->len; |
3707 | } |
3708 | |
	if (skb_has_frag_list(from))
3710 | hlen = from->len; |
3711 | |
3712 | return hlen; |
3713 | } |
3714 | EXPORT_SYMBOL_GPL(skb_zerocopy_headlen); |
3715 | |
3716 | /** |
3717 | * skb_zerocopy - Zero copy skb to skb |
3718 | * @to: destination buffer |
3719 | * @from: source buffer |
3720 | * @len: number of bytes to copy from source buffer |
3721 | * @hlen: size of linear headroom in destination buffer |
3722 | * |
3723 | * Copies up to `len` bytes from `from` to `to` by creating references |
3724 | * to the frags in the source buffer. |
3725 | * |
3726 | * The `hlen` as calculated by skb_zerocopy_headlen() specifies the |
3727 | * headroom in the `to` buffer. |
3728 | * |
3729 | * Return value: |
3730 | * 0: everything is OK |
3731 | * -ENOMEM: couldn't orphan frags of @from due to lack of memory |
3732 | * -EFAULT: skb_copy_bits() found some problem with skb geometry |
3733 | */ |
3734 | int |
3735 | skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) |
3736 | { |
3737 | int i, j = 0; |
3738 | int plen = 0; /* length of skb->head fragment */ |
3739 | int ret; |
3740 | struct page *page; |
3741 | unsigned int offset; |
3742 | |
3743 | BUG_ON(!from->head_frag && !hlen); |
3744 | |
	/* don't bother with small payloads */
	if (len <= skb_tailroom(to))
3747 | return skb_copy_bits(from, 0, skb_put(to, len), len); |
3748 | |
3749 | if (hlen) { |
3750 | ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); |
3751 | if (unlikely(ret)) |
3752 | return ret; |
3753 | len -= hlen; |
3754 | } else { |
3755 | plen = min_t(int, skb_headlen(from), len); |
3756 | if (plen) { |
			page = virt_to_head_page(from->head);
			offset = from->data - (unsigned char *)page_address(page);
			__skb_fill_netmem_desc(to, 0, page_to_netmem(page),
					       offset, plen);
3761 | get_page(page); |
3762 | j = 1; |
3763 | len -= plen; |
3764 | } |
3765 | } |
3766 | |
	skb_len_add(to, len + plen);
3768 | |
3769 | if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { |
3770 | skb_tx_error(from); |
3771 | return -ENOMEM; |
3772 | } |
	skb_zerocopy_clone(to, from, GFP_ATOMIC);
3774 | |
3775 | for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { |
3776 | int size; |
3777 | |
3778 | if (!len) |
3779 | break; |
3780 | skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; |
3781 | size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]), |
3782 | len); |
		skb_frag_size_set(&skb_shinfo(to)->frags[j], size);
		len -= size;
		skb_frag_ref(to, j);
3786 | j++; |
3787 | } |
3788 | skb_shinfo(to)->nr_frags = j; |
3789 | |
3790 | return 0; |
3791 | } |
3792 | EXPORT_SYMBOL_GPL(skb_zerocopy); |
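
/* Usage sketch (illustrative only): pair skb_zerocopy_headlen() with
 * skb_zerocopy() - size the destination's linear part first, then let
 * skb_zerocopy() take references on the source frags.  'alloc_dst_skb'
 * stands in for whatever allocation the caller actually performs.
 *
 *	unsigned int hlen = skb_zerocopy_headlen(from);
 *	struct sk_buff *to = alloc_dst_skb(hlen);	// hypothetical helper
 *
 *	if (to && skb_zerocopy(to, from, from->len, hlen) < 0)
 *		kfree_skb(to);
 */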
3793 | |
3794 | void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) |
3795 | { |
3796 | __wsum csum; |
3797 | long csstart; |
3798 | |
3799 | if (skb->ip_summed == CHECKSUM_PARTIAL) |
3800 | csstart = skb_checksum_start_offset(skb); |
3801 | else |
3802 | csstart = skb_headlen(skb); |
3803 | |
3804 | BUG_ON(csstart > skb_headlen(skb)); |
3805 | |
	skb_copy_from_linear_data(skb, to, csstart);
3807 | |
3808 | csum = 0; |
3809 | if (csstart != skb->len) |
3810 | csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, |
3811 | skb->len - csstart); |
3812 | |
3813 | if (skb->ip_summed == CHECKSUM_PARTIAL) { |
3814 | long csstuff = csstart + skb->csum_offset; |
3815 | |
		*((__sum16 *)(to + csstuff)) = csum_fold(csum);
3817 | } |
3818 | } |
3819 | EXPORT_SYMBOL(skb_copy_and_csum_dev); |
3820 | |
3821 | /** |
3822 | * skb_dequeue - remove from the head of the queue |
3823 | * @list: list to dequeue from |
3824 | * |
3825 | * Remove the head of the list. The list lock is taken so the function |
3826 | * may be used safely with other locking list functions. The head item is |
3827 | * returned or %NULL if the list is empty. |
3828 | */ |
3829 | |
3830 | struct sk_buff *skb_dequeue(struct sk_buff_head *list) |
3831 | { |
3832 | unsigned long flags; |
3833 | struct sk_buff *result; |
3834 | |
3835 | spin_lock_irqsave(&list->lock, flags); |
3836 | result = __skb_dequeue(list); |
	spin_unlock_irqrestore(&list->lock, flags);
3838 | return result; |
3839 | } |
3840 | EXPORT_SYMBOL(skb_dequeue); |
3841 | |
3842 | /** |
3843 | * skb_dequeue_tail - remove from the tail of the queue |
3844 | * @list: list to dequeue from |
3845 | * |
3846 | * Remove the tail of the list. The list lock is taken so the function |
3847 | * may be used safely with other locking list functions. The tail item is |
3848 | * returned or %NULL if the list is empty. |
3849 | */ |
3850 | struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) |
3851 | { |
3852 | unsigned long flags; |
3853 | struct sk_buff *result; |
3854 | |
3855 | spin_lock_irqsave(&list->lock, flags); |
3856 | result = __skb_dequeue_tail(list); |
	spin_unlock_irqrestore(&list->lock, flags);
3858 | return result; |
3859 | } |
3860 | EXPORT_SYMBOL(skb_dequeue_tail); |
3861 | |
3862 | /** |
3863 | * skb_queue_purge_reason - empty a list |
3864 | * @list: list to empty |
3865 | * @reason: drop reason |
3866 | * |
3867 | * Delete all buffers on an &sk_buff list. Each buffer is removed from |
3868 | * the list and one reference dropped. This function takes the list |
3869 | * lock and is atomic with respect to other list locking functions. |
3870 | */ |
3871 | void skb_queue_purge_reason(struct sk_buff_head *list, |
3872 | enum skb_drop_reason reason) |
3873 | { |
3874 | struct sk_buff_head tmp; |
3875 | unsigned long flags; |
3876 | |
3877 | if (skb_queue_empty_lockless(list)) |
3878 | return; |
3879 | |
	__skb_queue_head_init(&tmp);

	spin_lock_irqsave(&list->lock, flags);
	skb_queue_splice_init(list, &tmp);
	spin_unlock_irqrestore(&list->lock, flags);

	__skb_queue_purge_reason(&tmp, reason);
3887 | } |
3888 | EXPORT_SYMBOL(skb_queue_purge_reason); |
3889 | |
3890 | /** |
3891 | * skb_rbtree_purge - empty a skb rbtree |
3892 | * @root: root of the rbtree to empty |
3893 | * Return value: the sum of truesizes of all purged skbs. |
3894 | * |
3895 | * Delete all buffers on an &sk_buff rbtree. Each buffer is removed from |
3896 | * the list and one reference dropped. This function does not take |
3897 | * any lock. Synchronization should be handled by the caller (e.g., TCP |
3898 | * out-of-order queue is protected by the socket lock). |
3899 | */ |
3900 | unsigned int skb_rbtree_purge(struct rb_root *root) |
3901 | { |
3902 | struct rb_node *p = rb_first(root); |
3903 | unsigned int sum = 0; |
3904 | |
3905 | while (p) { |
3906 | struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); |
3907 | |
3908 | p = rb_next(p); |
3909 | rb_erase(&skb->rbnode, root); |
3910 | sum += skb->truesize; |
3911 | kfree_skb(skb); |
3912 | } |
3913 | return sum; |
3914 | } |
3915 | |
3916 | void skb_errqueue_purge(struct sk_buff_head *list) |
3917 | { |
3918 | struct sk_buff *skb, *next; |
3919 | struct sk_buff_head kill; |
3920 | unsigned long flags; |
3921 | |
	__skb_queue_head_init(&kill);
3923 | |
3924 | spin_lock_irqsave(&list->lock, flags); |
3925 | skb_queue_walk_safe(list, skb, next) { |
3926 | if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY || |
3927 | SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) |
3928 | continue; |
3929 | __skb_unlink(skb, list); |
		__skb_queue_tail(&kill, skb);
	}
	spin_unlock_irqrestore(&list->lock, flags);
	__skb_queue_purge(&kill);
3934 | } |
3935 | EXPORT_SYMBOL(skb_errqueue_purge); |
3936 | |
3937 | /** |
3938 | * skb_queue_head - queue a buffer at the list head |
3939 | * @list: list to use |
3940 | * @newsk: buffer to queue |
3941 | * |
3942 | * Queue a buffer at the start of the list. This function takes the |
3943 | * list lock and can be used safely with other locking &sk_buff functions |
3944 | * safely. |
3945 | * |
3946 | * A buffer cannot be placed on two lists at the same time. |
3947 | */ |
3948 | void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) |
3949 | { |
3950 | unsigned long flags; |
3951 | |
3952 | spin_lock_irqsave(&list->lock, flags); |
3953 | __skb_queue_head(list, newsk); |
	spin_unlock_irqrestore(&list->lock, flags);
3955 | } |
3956 | EXPORT_SYMBOL(skb_queue_head); |
3957 | |
3958 | /** |
3959 | * skb_queue_tail - queue a buffer at the list tail |
3960 | * @list: list to use |
3961 | * @newsk: buffer to queue |
3962 | * |
3963 | * Queue a buffer at the tail of the list. This function takes the |
3964 | * list lock and can be used safely with other locking &sk_buff functions |
3965 | * safely. |
3966 | * |
3967 | * A buffer cannot be placed on two lists at the same time. |
3968 | */ |
3969 | void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) |
3970 | { |
3971 | unsigned long flags; |
3972 | |
3973 | spin_lock_irqsave(&list->lock, flags); |
3974 | __skb_queue_tail(list, newsk); |
	spin_unlock_irqrestore(&list->lock, flags);
3976 | } |
3977 | EXPORT_SYMBOL(skb_queue_tail); |
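
/* Usage sketch (illustrative only): a classic producer/consumer pattern on a
 * private sk_buff_head.  The _irqsave locking inside skb_queue_tail() and
 * skb_dequeue() makes this safe even when the producer runs from interrupt
 * context.
 *
 *	static struct sk_buff_head rxq;		// hypothetical queue
 *
 *	skb_queue_head_init(&rxq);
 *	skb_queue_tail(&rxq, skb);		// producer
 *
 *	while ((skb = skb_dequeue(&rxq)))	// consumer
 *		consume_skb(skb);
 */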
3978 | |
3979 | /** |
3980 | * skb_unlink - remove a buffer from a list |
3981 | * @skb: buffer to remove |
3982 | * @list: list to use |
3983 | * |
3984 | * Remove a packet from a list. The list locks are taken and this |
3985 | * function is atomic with respect to other list locked calls |
3986 | * |
3987 | * You must know what list the SKB is on. |
3988 | */ |
3989 | void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) |
3990 | { |
3991 | unsigned long flags; |
3992 | |
3993 | spin_lock_irqsave(&list->lock, flags); |
3994 | __skb_unlink(skb, list); |
	spin_unlock_irqrestore(&list->lock, flags);
3996 | } |
3997 | EXPORT_SYMBOL(skb_unlink); |
3998 | |
3999 | /** |
4000 | * skb_append - append a buffer |
4001 | * @old: buffer to insert after |
4002 | * @newsk: buffer to insert |
4003 | * @list: list to use |
4004 | * |
4005 | * Place a packet after a given packet in a list. The list locks are taken |
4006 | * and this function is atomic with respect to other list locked calls. |
4007 | * A buffer cannot be placed on two lists at the same time. |
4008 | */ |
4009 | void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list) |
4010 | { |
4011 | unsigned long flags; |
4012 | |
4013 | spin_lock_irqsave(&list->lock, flags); |
	__skb_queue_after(list, old, newsk);
	spin_unlock_irqrestore(&list->lock, flags);
4016 | } |
4017 | EXPORT_SYMBOL(skb_append); |
4018 | |
static inline void skb_split_inside_header(struct sk_buff *skb,
					   struct sk_buff* skb1,
					   const u32 len, const int pos)
4022 | { |
4023 | int i; |
4024 | |
	skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, pos - len),
					 pos - len);
4027 | /* And move data appendix as is. */ |
4028 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) |
4029 | skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; |
4030 | |
4031 | skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; |
4032 | skb_shinfo(skb)->nr_frags = 0; |
4033 | skb1->data_len = skb->data_len; |
4034 | skb1->len += skb1->data_len; |
4035 | skb->data_len = 0; |
4036 | skb->len = len; |
	skb_set_tail_pointer(skb, len);
4038 | } |
4039 | |
static inline void skb_split_no_header(struct sk_buff *skb,
				       struct sk_buff* skb1,
				       const u32 len, int pos)
4043 | { |
4044 | int i, k = 0; |
4045 | const int nfrags = skb_shinfo(skb)->nr_frags; |
4046 | |
4047 | skb_shinfo(skb)->nr_frags = 0; |
4048 | skb1->len = skb1->data_len = skb->len - len; |
4049 | skb->len = len; |
4050 | skb->data_len = len - pos; |
4051 | |
4052 | for (i = 0; i < nfrags; i++) { |
		int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
4054 | |
4055 | if (pos + size > len) { |
4056 | skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; |
4057 | |
4058 | if (pos < len) { |
4059 | /* Split frag. |
4060 | * We have two variants in this case: |
4061 | * 1. Move all the frag to the second |
4062 | * part, if it is possible. F.e. |
4063 | * this approach is mandatory for TUX, |
4064 | * where splitting is expensive. |
4065 | * 2. Split is accurately. We make this. |
4066 | */ |
				skb_frag_ref(skb, i);
				skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos);
				skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
				skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
4071 | skb_shinfo(skb)->nr_frags++; |
4072 | } |
4073 | k++; |
4074 | } else |
4075 | skb_shinfo(skb)->nr_frags++; |
4076 | pos += size; |
4077 | } |
4078 | skb_shinfo(skb1)->nr_frags = k; |
4079 | } |
4080 | |
4081 | /** |
4082 | * skb_split - Split fragmented skb to two parts at length len. |
4083 | * @skb: the buffer to split |
4084 | * @skb1: the buffer to receive the second part |
4085 | * @len: new length for skb |
4086 | */ |
4087 | void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) |
4088 | { |
4089 | int pos = skb_headlen(skb); |
4090 | const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY; |
4091 | |
4092 | skb_zcopy_downgrade_managed(skb); |
4093 | |
4094 | skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags; |
	skb_zerocopy_clone(skb1, skb, 0);
4096 | if (len < pos) /* Split line is inside header. */ |
4097 | skb_split_inside_header(skb, skb1, len, pos); |
4098 | else /* Second chunk has no header, nothing to copy. */ |
4099 | skb_split_no_header(skb, skb1, len, pos); |
4100 | } |
4101 | EXPORT_SYMBOL(skb_split); |
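
/* Usage sketch (illustrative only): TCP-style splitting of an oversized
 * buffer.  The second skb must be allocated by the caller with enough
 * tailroom for any inside-header split; skb_split() only moves data and
 * frags, it does not copy headers into @skb1.  'mss' is hypothetical.
 *
 *	struct sk_buff *tail = alloc_skb(skb_headlen(skb), GFP_ATOMIC);
 *
 *	if (tail)
 *		skb_split(skb, tail, mss);	// skb keeps the first mss bytes
 */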
4102 | |
4103 | /* Shifting from/to a cloned skb is a no-go. |
4104 | * |
4105 | * Caller cannot keep skb_shinfo related pointers past calling here! |
4106 | */ |
4107 | static int skb_prepare_for_shift(struct sk_buff *skb) |
4108 | { |
4109 | return skb_unclone_keeptruesize(skb, GFP_ATOMIC); |
4110 | } |
4111 | |
4112 | /** |
4113 | * skb_shift - Shifts paged data partially from skb to another |
4114 | * @tgt: buffer into which tail data gets added |
4115 | * @skb: buffer from which the paged data comes from |
4116 | * @shiftlen: shift up to this many bytes |
4117 | * |
4118 | * Attempts to shift up to shiftlen worth of bytes, which may be less than |
 * the length of the skb, from skb to tgt. Returns the number of bytes shifted.
4120 | * It's up to caller to free skb if everything was shifted. |
4121 | * |
4122 | * If @tgt runs out of frags, the whole operation is aborted. |
4123 | * |
4124 | * Skb cannot include anything else but paged data while tgt is allowed |
4125 | * to have non-paged data as well. |
4126 | * |
4127 | * TODO: full sized shift could be optimized but that would need |
4128 | * specialized skb free'er to handle frags without up-to-date nr_frags. |
4129 | */ |
4130 | int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen) |
4131 | { |
4132 | int from, to, merge, todo; |
4133 | skb_frag_t *fragfrom, *fragto; |
4134 | |
4135 | BUG_ON(shiftlen > skb->len); |
4136 | |
4137 | if (skb_headlen(skb)) |
4138 | return 0; |
	if (skb_zcopy(tgt) || skb_zcopy(skb))
4140 | return 0; |
4141 | |
4142 | todo = shiftlen; |
4143 | from = 0; |
4144 | to = skb_shinfo(tgt)->nr_frags; |
4145 | fragfrom = &skb_shinfo(skb)->frags[from]; |
4146 | |
4147 | /* Actual merge is delayed until the point when we know we can |
4148 | * commit all, so that we don't have to undo partial changes |
4149 | */ |
4150 | if (!to || |
	    !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
			      skb_frag_off(fragfrom))) {
4153 | merge = -1; |
4154 | } else { |
4155 | merge = to - 1; |
4156 | |
		todo -= skb_frag_size(fragfrom);
4158 | if (todo < 0) { |
			if (skb_prepare_for_shift(skb) ||
			    skb_prepare_for_shift(tgt))
4161 | return 0; |
4162 | |
4163 | /* All previous frag pointers might be stale! */ |
4164 | fragfrom = &skb_shinfo(skb)->frags[from]; |
4165 | fragto = &skb_shinfo(tgt)->frags[merge]; |
4166 | |
			skb_frag_size_add(fragto, shiftlen);
			skb_frag_size_sub(fragfrom, shiftlen);
			skb_frag_off_add(fragfrom, shiftlen);
4170 | |
4171 | goto onlymerged; |
4172 | } |
4173 | |
4174 | from++; |
4175 | } |
4176 | |
4177 | /* Skip full, not-fitting skb to avoid expensive operations */ |
4178 | if ((shiftlen == skb->len) && |
4179 | (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to)) |
4180 | return 0; |
4181 | |
	if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
4183 | return 0; |
4184 | |
4185 | while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { |
4186 | if (to == MAX_SKB_FRAGS) |
4187 | return 0; |
4188 | |
4189 | fragfrom = &skb_shinfo(skb)->frags[from]; |
4190 | fragto = &skb_shinfo(tgt)->frags[to]; |
4191 | |
		if (todo >= skb_frag_size(fragfrom)) {
			*fragto = *fragfrom;
			todo -= skb_frag_size(fragfrom);
			from++;
			to++;

		} else {
			__skb_frag_ref(fragfrom);
			skb_frag_page_copy(fragto, fragfrom);
			skb_frag_off_copy(fragto, fragfrom);
			skb_frag_size_set(fragto, todo);

			skb_frag_off_add(fragfrom, todo);
			skb_frag_size_sub(fragfrom, todo);
4206 | todo = 0; |
4207 | |
4208 | to++; |
4209 | break; |
4210 | } |
4211 | } |
4212 | |
4213 | /* Ready to "commit" this state change to tgt */ |
4214 | skb_shinfo(tgt)->nr_frags = to; |
4215 | |
4216 | if (merge >= 0) { |
4217 | fragfrom = &skb_shinfo(skb)->frags[0]; |
4218 | fragto = &skb_shinfo(tgt)->frags[merge]; |
4219 | |
		skb_frag_size_add(fragto, skb_frag_size(fragfrom));
		__skb_frag_unref(fragfrom, skb->pp_recycle);
4222 | } |
4223 | |
4224 | /* Reposition in the original skb */ |
4225 | to = 0; |
4226 | while (from < skb_shinfo(skb)->nr_frags) |
4227 | skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; |
4228 | skb_shinfo(skb)->nr_frags = to; |
4229 | |
4230 | BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); |
4231 | |
4232 | onlymerged: |
4233 | /* Most likely the tgt won't ever need its checksum anymore, skb on |
4234 | * the other hand might need it if it needs to be resent |
4235 | */ |
4236 | tgt->ip_summed = CHECKSUM_PARTIAL; |
4237 | skb->ip_summed = CHECKSUM_PARTIAL; |
4238 | |
	skb_len_add(skb, -shiftlen);
	skb_len_add(tgt, shiftlen);
4241 | |
4242 | return shiftlen; |
4243 | } |
4244 | |
4245 | /** |
4246 | * skb_prepare_seq_read - Prepare a sequential read of skb data |
4247 | * @skb: the buffer to read |
4248 | * @from: lower offset of data to be read |
4249 | * @to: upper offset of data to be read |
4250 | * @st: state variable |
4251 | * |
4252 | * Initializes the specified state variable. Must be called before |
4253 | * invoking skb_seq_read() for the first time. |
4254 | */ |
4255 | void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, |
4256 | unsigned int to, struct skb_seq_state *st) |
4257 | { |
4258 | st->lower_offset = from; |
4259 | st->upper_offset = to; |
4260 | st->root_skb = st->cur_skb = skb; |
4261 | st->frag_idx = st->stepped_offset = 0; |
4262 | st->frag_data = NULL; |
4263 | st->frag_off = 0; |
4264 | } |
4265 | EXPORT_SYMBOL(skb_prepare_seq_read); |
4266 | |
4267 | /** |
4268 | * skb_seq_read - Sequentially read skb data |
4269 | * @consumed: number of bytes consumed by the caller so far |
4270 | * @data: destination pointer for data to be returned |
4271 | * @st: state variable |
4272 | * |
4273 | * Reads a block of skb data at @consumed relative to the |
4274 | * lower offset specified to skb_prepare_seq_read(). Assigns |
4275 | * the head of the data block to @data and returns the length |
4276 | * of the block or 0 if the end of the skb data or the upper |
4277 | * offset has been reached. |
4278 | * |
4279 | * The caller is not required to consume all of the data |
4280 | * returned, i.e. @consumed is typically set to the number |
4281 | * of bytes already consumed and the next call to |
4282 | * skb_seq_read() will return the remaining part of the block. |
4283 | * |
4284 | * Note 1: The size of each block of data returned can be arbitrary, |
4285 | * this limitation is the cost for zerocopy sequential |
4286 | * reads of potentially non linear data. |
4287 | * |
4288 | * Note 2: Fragment lists within fragments are not implemented |
4289 | * at the moment, state->root_skb could be replaced with |
4290 | * a stack for this purpose. |
4291 | */ |
4292 | unsigned int skb_seq_read(unsigned int consumed, const u8 **data, |
4293 | struct skb_seq_state *st) |
4294 | { |
4295 | unsigned int block_limit, abs_offset = consumed + st->lower_offset; |
4296 | skb_frag_t *frag; |
4297 | |
4298 | if (unlikely(abs_offset >= st->upper_offset)) { |
4299 | if (st->frag_data) { |
4300 | kunmap_atomic(st->frag_data); |
4301 | st->frag_data = NULL; |
4302 | } |
4303 | return 0; |
4304 | } |
4305 | |
4306 | next_skb: |
	block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;
4308 | |
4309 | if (abs_offset < block_limit && !st->frag_data) { |
4310 | *data = st->cur_skb->data + (abs_offset - st->stepped_offset); |
4311 | return block_limit - abs_offset; |
4312 | } |
4313 | |
4314 | if (st->frag_idx == 0 && !st->frag_data) |
		st->stepped_offset += skb_headlen(st->cur_skb);
4316 | |
4317 | while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { |
4318 | unsigned int pg_idx, pg_off, pg_sz; |
4319 | |
4320 | frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; |
4321 | |
4322 | pg_idx = 0; |
4323 | pg_off = skb_frag_off(frag); |
4324 | pg_sz = skb_frag_size(frag); |
4325 | |
		if (skb_frag_must_loop(skb_frag_page(frag))) {
4327 | pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; |
4328 | pg_off = offset_in_page(pg_off + st->frag_off); |
4329 | pg_sz = min_t(unsigned int, pg_sz - st->frag_off, |
4330 | PAGE_SIZE - pg_off); |
4331 | } |
4332 | |
4333 | block_limit = pg_sz + st->stepped_offset; |
4334 | if (abs_offset < block_limit) { |
4335 | if (!st->frag_data) |
				st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx);
4337 | |
4338 | *data = (u8 *)st->frag_data + pg_off + |
4339 | (abs_offset - st->stepped_offset); |
4340 | |
4341 | return block_limit - abs_offset; |
4342 | } |
4343 | |
4344 | if (st->frag_data) { |
4345 | kunmap_atomic(st->frag_data); |
4346 | st->frag_data = NULL; |
4347 | } |
4348 | |
4349 | st->stepped_offset += pg_sz; |
4350 | st->frag_off += pg_sz; |
4351 | if (st->frag_off == skb_frag_size(frag)) { |
4352 | st->frag_off = 0; |
4353 | st->frag_idx++; |
4354 | } |
4355 | } |
4356 | |
4357 | if (st->frag_data) { |
4358 | kunmap_atomic(st->frag_data); |
4359 | st->frag_data = NULL; |
4360 | } |
4361 | |
	if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
4363 | st->cur_skb = skb_shinfo(st->root_skb)->frag_list; |
4364 | st->frag_idx = 0; |
4365 | goto next_skb; |
4366 | } else if (st->cur_skb->next) { |
4367 | st->cur_skb = st->cur_skb->next; |
4368 | st->frag_idx = 0; |
4369 | goto next_skb; |
4370 | } |
4371 | |
4372 | return 0; |
4373 | } |
4374 | EXPORT_SYMBOL(skb_seq_read); |
4375 | |
4376 | /** |
4377 | * skb_abort_seq_read - Abort a sequential read of skb data |
4378 | * @st: state variable |
4379 | * |
4380 | * Must be called if skb_seq_read() was not called until it |
4381 | * returned 0. |
4382 | */ |
4383 | void skb_abort_seq_read(struct skb_seq_state *st) |
4384 | { |
4385 | if (st->frag_data) |
4386 | kunmap_atomic(st->frag_data); |
4387 | } |
4388 | EXPORT_SYMBOL(skb_abort_seq_read); |
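
/* Usage sketch (illustrative only): walk a (possibly non-linear) skb in
 * whatever block sizes skb_seq_read() hands back, without copying.  The
 * abort call is only needed when the walk stops before a 0 return.
 *
 *	struct skb_seq_state st;
 *	const u8 *data;
 *	unsigned int len, consumed = 0;
 *
 *	skb_prepare_seq_read(skb, 0, skb->len, &st);
 *	while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
 *		// process 'len' bytes at 'data'
 *		consumed += len;
 *	}
 */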
4389 | |
4390 | #define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) |
4391 | |
4392 | static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, |
4393 | struct ts_config *conf, |
4394 | struct ts_state *state) |
4395 | { |
4396 | return skb_seq_read(offset, text, TS_SKB_CB(state)); |
4397 | } |
4398 | |
4399 | static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) |
4400 | { |
4401 | skb_abort_seq_read(TS_SKB_CB(state)); |
4402 | } |
4403 | |
4404 | /** |
4405 | * skb_find_text - Find a text pattern in skb data |
4406 | * @skb: the buffer to look in |
4407 | * @from: search offset |
4408 | * @to: search limit |
4409 | * @config: textsearch configuration |
4410 | * |
4411 | * Finds a pattern in the skb data according to the specified |
4412 | * textsearch configuration. Use textsearch_next() to retrieve |
4413 | * subsequent occurrences of the pattern. Returns the offset |
4414 | * to the first occurrence or UINT_MAX if no match was found. |
4415 | */ |
4416 | unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, |
4417 | unsigned int to, struct ts_config *config) |
4418 | { |
4419 | unsigned int patlen = config->ops->get_pattern_len(config); |
4420 | struct ts_state state; |
4421 | unsigned int ret; |
4422 | |
4423 | BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb)); |
4424 | |
4425 | config->get_next_block = skb_ts_get_next_block; |
4426 | config->finish = skb_ts_finish; |
4427 | |
4428 | skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state)); |
4429 | |
	ret = textsearch_find(config, &state);
4431 | return (ret + patlen <= to - from ? ret : UINT_MAX); |
4432 | } |
4433 | EXPORT_SYMBOL(skb_find_text); |
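
/* Usage sketch (illustrative only): search the whole packet for a byte
 * pattern using the textsearch infrastructure.  skb_find_text() returns
 * UINT_MAX when the pattern is not found.
 *
 *	struct ts_config *conf;
 *	unsigned int pos;
 *
 *	conf = textsearch_prepare("kmp", "GET ", 4, GFP_KERNEL, TS_AUTOLOAD);
 *	if (!IS_ERR(conf)) {
 *		pos = skb_find_text(skb, 0, skb->len, conf);
 *		textsearch_destroy(conf);
 *	}
 */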
4434 | |
4435 | int skb_append_pagefrags(struct sk_buff *skb, struct page *page, |
4436 | int offset, size_t size, size_t max_frags) |
4437 | { |
4438 | int i = skb_shinfo(skb)->nr_frags; |
4439 | |
	if (skb_can_coalesce(skb, i, page, offset)) {
		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
	} else if (i < max_frags) {
		skb_zcopy_downgrade_managed(skb);
		get_page(page);
		skb_fill_page_desc_noacc(skb, i, page, offset, size);
4446 | } else { |
4447 | return -EMSGSIZE; |
4448 | } |
4449 | |
4450 | return 0; |
4451 | } |
4452 | EXPORT_SYMBOL_GPL(skb_append_pagefrags); |
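
/* Usage sketch (illustrative only): append a page fragment to an skb being
 * built for transmission, coalescing with the previous frag when possible.
 * The helper takes its own page reference; the caller still has to account
 * for the new payload (callers typically also adjust truesize by the page
 * contribution).
 *
 *	if (skb_append_pagefrags(skb, page, off, size, MAX_SKB_FRAGS) == 0)
 *		skb_len_add(skb, size);
 */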
4453 | |
4454 | /** |
4455 | * skb_pull_rcsum - pull skb and update receive checksum |
4456 | * @skb: buffer to update |
4457 | * @len: length of data pulled |
4458 | * |
4459 | * This function performs an skb_pull on the packet and updates |
4460 | * the CHECKSUM_COMPLETE checksum. It should be used on |
4461 | * receive path processing instead of skb_pull unless you know |
4462 | * that the checksum difference is zero (e.g., a valid IP header) |
4463 | * or you are setting ip_summed to CHECKSUM_NONE. |
4464 | */ |
4465 | void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) |
4466 | { |
4467 | unsigned char *data = skb->data; |
4468 | |
4469 | BUG_ON(len > skb->len); |
4470 | __skb_pull(skb, len); |
	skb_postpull_rcsum(skb, data, len);
4472 | return skb->data; |
4473 | } |
4474 | EXPORT_SYMBOL_GPL(skb_pull_rcsum); |
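
/* Usage sketch (illustrative only): strip an encapsulation header on the
 * receive path while keeping a CHECKSUM_COMPLETE value coherent, as tunnel
 * and VLAN code does.  'hdr_len' is whatever the caller just parsed.
 *
 *	if (pskb_may_pull(skb, hdr_len))
 *		skb_pull_rcsum(skb, hdr_len);
 */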
4475 | |
4476 | static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb) |
4477 | { |
4478 | skb_frag_t head_frag; |
4479 | struct page *page; |
4480 | |
	page = virt_to_head_page(frag_skb->head);
	skb_frag_fill_page_desc(&head_frag, page, frag_skb->data -
				(unsigned char *)page_address(page),
				skb_headlen(frag_skb));
4485 | return head_frag; |
4486 | } |
4487 | |
4488 | struct sk_buff *skb_segment_list(struct sk_buff *skb, |
4489 | netdev_features_t features, |
4490 | unsigned int offset) |
4491 | { |
4492 | struct sk_buff *list_skb = skb_shinfo(skb)->frag_list; |
	unsigned int tnl_hlen = skb_tnl_header_len(skb);
4494 | unsigned int delta_truesize = 0; |
4495 | unsigned int delta_len = 0; |
4496 | struct sk_buff *tail = NULL; |
4497 | struct sk_buff *nskb, *tmp; |
4498 | int len_diff, err; |
4499 | |
4500 | skb_push(skb, -skb_network_offset(skb) + offset); |
4501 | |
4502 | /* Ensure the head is writeable before touching the shared info */ |
4503 | err = skb_unclone(skb, GFP_ATOMIC); |
4504 | if (err) |
4505 | goto err_linearize; |
4506 | |
4507 | skb_shinfo(skb)->frag_list = NULL; |
4508 | |
4509 | while (list_skb) { |
4510 | nskb = list_skb; |
4511 | list_skb = list_skb->next; |
4512 | |
4513 | err = 0; |
4514 | delta_truesize += nskb->truesize; |
		if (skb_shared(nskb)) {
			tmp = skb_clone(nskb, GFP_ATOMIC);
			if (tmp) {
				consume_skb(nskb);
				nskb = tmp;
				err = skb_unclone(nskb, GFP_ATOMIC);
4521 | } else { |
4522 | err = -ENOMEM; |
4523 | } |
4524 | } |
4525 | |
4526 | if (!tail) |
4527 | skb->next = nskb; |
4528 | else |
4529 | tail->next = nskb; |
4530 | |
4531 | if (unlikely(err)) { |
4532 | nskb->next = list_skb; |
4533 | goto err_linearize; |
4534 | } |
4535 | |
4536 | tail = nskb; |
4537 | |
4538 | delta_len += nskb->len; |
4539 | |
		skb_push(nskb, -skb_network_offset(nskb) + offset);

		skb_release_head_state(nskb);
		len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb);
		__copy_skb_header(nskb, skb);

		skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
		nskb->transport_header += len_diff;
		skb_copy_from_linear_data_offset(skb, -tnl_hlen,
						 nskb->data - tnl_hlen,
						 offset + tnl_hlen);
4551 | |
		if (skb_needs_linearize(nskb, features) &&
		    __skb_linearize(nskb))
4554 | goto err_linearize; |
4555 | } |
4556 | |
4557 | skb->truesize = skb->truesize - delta_truesize; |
4558 | skb->data_len = skb->data_len - delta_len; |
4559 | skb->len = skb->len - delta_len; |
4560 | |
4561 | skb_gso_reset(skb); |
4562 | |
4563 | skb->prev = tail; |
4564 | |
4565 | if (skb_needs_linearize(skb, features) && |
4566 | __skb_linearize(skb)) |
4567 | goto err_linearize; |
4568 | |
4569 | skb_get(skb); |
4570 | |
4571 | return skb; |
4572 | |
4573 | err_linearize: |
	kfree_skb_list(skb->next);
	skb->next = NULL;
	return ERR_PTR(-ENOMEM);
4577 | } |
4578 | EXPORT_SYMBOL_GPL(skb_segment_list); |
4579 | |
4580 | /** |
4581 | * skb_segment - Perform protocol segmentation on skb. |
4582 | * @head_skb: buffer to segment |
4583 | * @features: features for the output path (see dev->features) |
4584 | * |
4585 | * This function performs segmentation on the given skb. It returns |
4586 | * a pointer to the first in a list of new skbs for the segments. |
4587 | * In case of error it returns ERR_PTR(err). |
4588 | */ |
4589 | struct sk_buff *skb_segment(struct sk_buff *head_skb, |
4590 | netdev_features_t features) |
4591 | { |
4592 | struct sk_buff *segs = NULL; |
4593 | struct sk_buff *tail = NULL; |
4594 | struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; |
4595 | unsigned int mss = skb_shinfo(head_skb)->gso_size; |
	unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
	unsigned int offset = doffset;
	unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
4599 | unsigned int partial_segs = 0; |
4600 | unsigned int headroom; |
4601 | unsigned int len = head_skb->len; |
4602 | struct sk_buff *frag_skb; |
4603 | skb_frag_t *frag; |
4604 | __be16 proto; |
4605 | bool csum, sg; |
4606 | int err = -ENOMEM; |
4607 | int i = 0; |
4608 | int nfrags, pos; |
4609 | |
4610 | if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && |
	    mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
4612 | struct sk_buff *check_skb; |
4613 | |
4614 | for (check_skb = list_skb; check_skb; check_skb = check_skb->next) { |
			if (skb_headlen(check_skb) && !check_skb->head_frag) {
4616 | /* gso_size is untrusted, and we have a frag_list with |
4617 | * a linear non head_frag item. |
4618 | * |
4619 | * If head_skb's headlen does not fit requested gso_size, |
4620 | * it means that the frag_list members do NOT terminate |
4621 | * on exact gso_size boundaries. Hence we cannot perform |
4622 | * skb_frag_t page sharing. Therefore we must fallback to |
4623 | * copying the frag_list skbs; we do so by disabling SG. |
4624 | */ |
4625 | features &= ~NETIF_F_SG; |
4626 | break; |
4627 | } |
4628 | } |
4629 | } |
4630 | |
	__skb_push(head_skb, doffset);
	proto = skb_network_protocol(head_skb, NULL);
	if (unlikely(!proto))
		return ERR_PTR(-EINVAL);
4635 | |
4636 | sg = !!(features & NETIF_F_SG); |
	csum = !!can_checksum_protocol(features, proto);
4638 | |
4639 | if (sg && csum && (mss != GSO_BY_FRAGS)) { |
4640 | if (!(features & NETIF_F_GSO_PARTIAL)) { |
4641 | struct sk_buff *iter; |
4642 | unsigned int frag_len; |
4643 | |
4644 | if (!list_skb || |
4645 | !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) |
4646 | goto normal; |
4647 | |
4648 | /* If we get here then all the required |
4649 | * GSO features except frag_list are supported. |
4650 | * Try to split the SKB to multiple GSO SKBs |
4651 | * with no frag_list. |
4652 | * Currently we can do that only when the buffers don't |
4653 | * have a linear part and all the buffers except |
4654 | * the last are of the same length. |
4655 | */ |
4656 | frag_len = list_skb->len; |
4657 | skb_walk_frags(head_skb, iter) { |
4658 | if (frag_len != iter->len && iter->next) |
4659 | goto normal; |
			if (skb_headlen(iter) && !iter->head_frag)
4661 | goto normal; |
4662 | |
4663 | len -= iter->len; |
4664 | } |
4665 | |
4666 | if (len != frag_len) |
4667 | goto normal; |
4668 | } |
4669 | |
4670 | /* GSO partial only requires that we trim off any excess that |
4671 | * doesn't fit into an MSS sized block, so take care of that |
4672 | * now. |
4673 | * Cap len to not accidentally hit GSO_BY_FRAGS. |
4674 | */ |
4675 | partial_segs = min(len, GSO_BY_FRAGS - 1) / mss; |
4676 | if (partial_segs > 1) |
4677 | mss *= partial_segs; |
4678 | else |
4679 | partial_segs = 0; |
4680 | } |
4681 | |
4682 | normal: |
	headroom = skb_headroom(head_skb);
	pos = skb_headlen(head_skb);

	if (skb_orphan_frags(head_skb, GFP_ATOMIC))
		return ERR_PTR(-ENOMEM);
4688 | |
4689 | nfrags = skb_shinfo(head_skb)->nr_frags; |
4690 | frag = skb_shinfo(head_skb)->frags; |
4691 | frag_skb = head_skb; |
4692 | |
4693 | do { |
4694 | struct sk_buff *nskb; |
4695 | skb_frag_t *nskb_frag; |
4696 | int hsize; |
4697 | int size; |
4698 | |
4699 | if (unlikely(mss == GSO_BY_FRAGS)) { |
4700 | len = list_skb->len; |
4701 | } else { |
4702 | len = head_skb->len - offset; |
4703 | if (len > mss) |
4704 | len = mss; |
4705 | } |
4706 | |
		hsize = skb_headlen(head_skb) - offset;

		if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
		    (skb_headlen(list_skb) == len || sg)) {
4711 | BUG_ON(skb_headlen(list_skb) > len); |
4712 | |
4713 | nskb = skb_clone(list_skb, GFP_ATOMIC); |
4714 | if (unlikely(!nskb)) |
4715 | goto err; |
4716 | |
4717 | i = 0; |
4718 | nfrags = skb_shinfo(list_skb)->nr_frags; |
4719 | frag = skb_shinfo(list_skb)->frags; |
4720 | frag_skb = list_skb; |
			pos += skb_headlen(list_skb);
4722 | |
4723 | while (pos < offset + len) { |
4724 | BUG_ON(i >= nfrags); |
4725 | |
4726 | size = skb_frag_size(frag); |
4727 | if (pos + size > offset + len) |
4728 | break; |
4729 | |
4730 | i++; |
4731 | pos += size; |
4732 | frag++; |
4733 | } |
4734 | |
4735 | list_skb = list_skb->next; |
4736 | |
4737 | if (unlikely(pskb_trim(nskb, len))) { |
				kfree_skb(nskb);
				goto err;
			}

			hsize = skb_end_offset(nskb);
			if (skb_cow_head(nskb, doffset + headroom)) {
				kfree_skb(nskb);
				goto err;
			}

			nskb->truesize += skb_end_offset(nskb) - hsize;
			skb_release_head_state(nskb);
			__skb_push(nskb, doffset);
4751 | } else { |
4752 | if (hsize < 0) |
4753 | hsize = 0; |
4754 | if (hsize > len || !sg) |
4755 | hsize = len; |
4756 | |
			nskb = __alloc_skb(hsize + doffset + headroom,
					   GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
					   NUMA_NO_NODE);

			if (unlikely(!nskb))
				goto err;

			skb_reserve(nskb, headroom);
			__skb_put(nskb, doffset);
4766 | } |
4767 | |
4768 | if (segs) |
4769 | tail->next = nskb; |
4770 | else |
4771 | segs = nskb; |
4772 | tail = nskb; |
4773 | |
		__copy_skb_header(nskb, head_skb);

		skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
		skb_reset_mac_len(nskb);

		skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
						 nskb->data - tnl_hlen,
						 doffset + tnl_hlen);
4782 | |
4783 | if (nskb->len == len + doffset) |
4784 | goto perform_csum_check; |
4785 | |
4786 | if (!sg) { |
4787 | if (!csum) { |
4788 | if (!nskb->remcsum_offload) |
4789 | nskb->ip_summed = CHECKSUM_NONE; |
4790 | SKB_GSO_CB(nskb)->csum = |
4791 | skb_copy_and_csum_bits(head_skb, offset, |
4792 | skb_put(nskb, |
4793 | len), |
4794 | len); |
4795 | SKB_GSO_CB(nskb)->csum_start = |
				skb_headroom(nskb) + doffset;
4797 | } else { |
4798 | if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len)) |
4799 | goto err; |
4800 | } |
4801 | continue; |
4802 | } |
4803 | |
4804 | nskb_frag = skb_shinfo(nskb)->frags; |
4805 | |
		skb_copy_from_linear_data_offset(head_skb, offset,
						 skb_put(nskb, hsize), hsize);
4808 | |
4809 | skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & |
4810 | SKBFL_SHARED_FRAG; |
4811 | |
		if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
4813 | goto err; |
4814 | |
4815 | while (pos < offset + len) { |
4816 | if (i >= nfrags) { |
				if (skb_orphan_frags(list_skb, GFP_ATOMIC) ||
				    skb_zerocopy_clone(nskb, list_skb,
4819 | GFP_ATOMIC)) |
4820 | goto err; |
4821 | |
4822 | i = 0; |
4823 | nfrags = skb_shinfo(list_skb)->nr_frags; |
4824 | frag = skb_shinfo(list_skb)->frags; |
4825 | frag_skb = list_skb; |
				if (!skb_headlen(list_skb)) {
4827 | BUG_ON(!nfrags); |
4828 | } else { |
4829 | BUG_ON(!list_skb->head_frag); |
4830 | |
4831 | /* to make room for head_frag. */ |
4832 | i--; |
4833 | frag--; |
4834 | } |
4835 | |
4836 | list_skb = list_skb->next; |
4837 | } |
4838 | |
4839 | if (unlikely(skb_shinfo(nskb)->nr_frags >= |
4840 | MAX_SKB_FRAGS)) { |
				net_warn_ratelimited(
					"skb_segment: too many frags: %u %u\n",
					pos, mss);
4844 | err = -EINVAL; |
4845 | goto err; |
4846 | } |
4847 | |
4848 | *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag; |
			__skb_frag_ref(nskb_frag);
			size = skb_frag_size(nskb_frag);

			if (pos < offset) {
				skb_frag_off_add(nskb_frag, offset - pos);
				skb_frag_size_sub(nskb_frag, offset - pos);
4855 | } |
4856 | |
4857 | skb_shinfo(nskb)->nr_frags++; |
4858 | |
4859 | if (pos + size <= offset + len) { |
4860 | i++; |
4861 | frag++; |
4862 | pos += size; |
4863 | } else { |
				skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
4865 | goto skip_fraglist; |
4866 | } |
4867 | |
4868 | nskb_frag++; |
4869 | } |
4870 | |
4871 | skip_fraglist: |
4872 | nskb->data_len = len - hsize; |
4873 | nskb->len += nskb->data_len; |
4874 | nskb->truesize += nskb->data_len; |
4875 | |
4876 | perform_csum_check: |
4877 | if (!csum) { |
			if (skb_has_shared_frag(nskb) &&
			    __skb_linearize(nskb))
4880 | goto err; |
4881 | |
4882 | if (!nskb->remcsum_offload) |
4883 | nskb->ip_summed = CHECKSUM_NONE; |
4884 | SKB_GSO_CB(nskb)->csum = |
4885 | skb_checksum(nskb, doffset, |
4886 | nskb->len - doffset, 0); |
4887 | SKB_GSO_CB(nskb)->csum_start = |
				skb_headroom(nskb) + doffset;
4889 | } |
4890 | } while ((offset += len) < head_skb->len); |
4891 | |
4892 | /* Some callers want to get the end of the list. |
4893 | * Put it in segs->prev to avoid walking the list. |
4894 | * (see validate_xmit_skb_list() for example) |
4895 | */ |
4896 | segs->prev = tail; |
4897 | |
4898 | if (partial_segs) { |
4899 | struct sk_buff *iter; |
4900 | int type = skb_shinfo(head_skb)->gso_type; |
4901 | unsigned short gso_size = skb_shinfo(head_skb)->gso_size; |
4902 | |
4903 | /* Update type to add partial and then remove dodgy if set */ |
4904 | type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL; |
4905 | type &= ~SKB_GSO_DODGY; |
4906 | |
4907 | /* Update GSO info and prepare to start updating headers on |
4908 | * our way back down the stack of protocols. |
4909 | */ |
4910 | for (iter = segs; iter; iter = iter->next) { |
4911 | skb_shinfo(iter)->gso_size = gso_size; |
4912 | skb_shinfo(iter)->gso_segs = partial_segs; |
4913 | skb_shinfo(iter)->gso_type = type; |
			SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
4915 | } |
4916 | |
4917 | if (tail->len - doffset <= gso_size) |
4918 | skb_shinfo(tail)->gso_size = 0; |
4919 | else if (tail != segs) |
4920 | skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size); |
4921 | } |
4922 | |
4923 | /* Following permits correct backpressure, for protocols |
4924 | * using skb_set_owner_w(). |
 * Idea is to transfer ownership from head_skb to last segment.
4926 | */ |
4927 | if (head_skb->destructor == sock_wfree) { |
4928 | swap(tail->truesize, head_skb->truesize); |
4929 | swap(tail->destructor, head_skb->destructor); |
4930 | swap(tail->sk, head_skb->sk); |
4931 | } |
4932 | return segs; |
4933 | |
4934 | err: |
4935 | kfree_skb_list(segs); |
	return ERR_PTR(err);
4937 | } |
4938 | EXPORT_SYMBOL_GPL(skb_segment); |
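
/* Usage sketch (illustrative only): callers normally reach skb_segment()
 * through skb_gso_segment() and the per-protocol gso_segment callbacks, and
 * then walk the returned list one segment at a time.
 *
 *	struct sk_buff *segs, *seg, *next;
 *
 *	segs = skb_segment(skb, features);
 *	if (IS_ERR(segs))
 *		return PTR_ERR(segs);
 *
 *	skb_list_walk_safe(segs, seg, next) {
 *		skb_mark_not_on_list(seg);
 *		// hand 'seg' to the transmit path
 *	}
 */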
4939 | |
4940 | #ifdef CONFIG_SKB_EXTENSIONS |
4941 | #define SKB_EXT_ALIGN_VALUE 8 |
4942 | #define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE) |
4943 | |
4944 | static const u8 skb_ext_type_len[] = { |
4945 | #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) |
4946 | [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info), |
4947 | #endif |
4948 | #ifdef CONFIG_XFRM |
4949 | [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path), |
4950 | #endif |
4951 | #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) |
4952 | [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext), |
4953 | #endif |
4954 | #if IS_ENABLED(CONFIG_MPTCP) |
4955 | [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), |
4956 | #endif |
4957 | #if IS_ENABLED(CONFIG_MCTP_FLOWS) |
4958 | [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), |
4959 | #endif |
4960 | }; |
4961 | |
4962 | static __always_inline unsigned int skb_ext_total_length(void) |
4963 | { |
4964 | unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext); |
4965 | int i; |
4966 | |
4967 | for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++) |
4968 | l += skb_ext_type_len[i]; |
4969 | |
4970 | return l; |
4971 | } |
4972 | |
4973 | static void skb_extensions_init(void) |
4974 | { |
4975 | BUILD_BUG_ON(SKB_EXT_NUM >= 8); |
4976 | #if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL) |
4977 | BUILD_BUG_ON(skb_ext_total_length() > 255); |
4978 | #endif |
4979 | |
	skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache",
					     SKB_EXT_ALIGN_VALUE * skb_ext_total_length(),
					     0,
					     SLAB_HWCACHE_ALIGN|SLAB_PANIC,
					     NULL);
4985 | } |
4986 | #else |
4987 | static void skb_extensions_init(void) {} |
4988 | #endif |
4989 | |
4990 | /* The SKB kmem_cache slab is critical for network performance. Never |
4991 | * merge/alias the slab with similar sized objects. This avoids fragmentation |
4992 | * that hurts performance of kmem_cache_{alloc,free}_bulk APIs. |
4993 | */ |
4994 | #ifndef CONFIG_SLUB_TINY |
4995 | #define FLAG_SKB_NO_MERGE SLAB_NO_MERGE |
4996 | #else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */ |
4997 | #define FLAG_SKB_NO_MERGE 0 |
4998 | #endif |
4999 | |
5000 | void __init skb_init(void) |
5001 | { |
	net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache",
					      sizeof(struct sk_buff),
					      0,
					      SLAB_HWCACHE_ALIGN|SLAB_PANIC|
						FLAG_SKB_NO_MERGE,
					      offsetof(struct sk_buff, cb),
					      sizeof_field(struct sk_buff, cb),
					      NULL);
	net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
						sizeof(struct sk_buff_fclones),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						NULL);
5015 | /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes. |
5016 | * struct skb_shared_info is located at the end of skb->head, |
5017 | * and should not be copied to/from user. |
5018 | */ |
	net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head",
						SKB_SMALL_HEAD_CACHE_SIZE,
						0,
						SLAB_HWCACHE_ALIGN | SLAB_PANIC,
						0,
						SKB_SMALL_HEAD_HEADROOM,
						NULL);
5026 | skb_extensions_init(); |
5027 | } |
5028 | |
5029 | static int |
5030 | __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len, |
5031 | unsigned int recursion_level) |
5032 | { |
5033 | int start = skb_headlen(skb); |
5034 | int i, copy = start - offset; |
5035 | struct sk_buff *frag_iter; |
5036 | int elt = 0; |
5037 | |
5038 | if (unlikely(recursion_level >= 24)) |
5039 | return -EMSGSIZE; |
5040 | |
5041 | if (copy > 0) { |
5042 | if (copy > len) |
5043 | copy = len; |
		sg_set_buf(sg, skb->data + offset, copy);
5045 | elt++; |
5046 | if ((len -= copy) == 0) |
5047 | return elt; |
5048 | offset += copy; |
5049 | } |
5050 | |
5051 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { |
5052 | int end; |
5053 | |
5054 | WARN_ON(start > offset + len); |
5055 | |
		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
5057 | if ((copy = end - offset) > 0) { |
5058 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; |
5059 | if (unlikely(elt && sg_is_last(&sg[elt - 1]))) |
5060 | return -EMSGSIZE; |
5061 | |
5062 | if (copy > len) |
5063 | copy = len; |
			sg_set_page(&sg[elt], skb_frag_page(frag), copy,
				    skb_frag_off(frag) + offset - start);
5066 | elt++; |
5067 | if (!(len -= copy)) |
5068 | return elt; |
5069 | offset += copy; |
5070 | } |
5071 | start = end; |
5072 | } |
5073 | |
5074 | skb_walk_frags(skb, frag_iter) { |
5075 | int end, ret; |
5076 | |
5077 | WARN_ON(start > offset + len); |
5078 | |
5079 | end = start + frag_iter->len; |
5080 | if ((copy = end - offset) > 0) { |
5081 | if (unlikely(elt && sg_is_last(&sg[elt - 1]))) |
5082 | return -EMSGSIZE; |
5083 | |
5084 | if (copy > len) |
5085 | copy = len; |
			ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
					     copy, recursion_level + 1);
5088 | if (unlikely(ret < 0)) |
5089 | return ret; |
5090 | elt += ret; |
5091 | if ((len -= copy) == 0) |
5092 | return elt; |
5093 | offset += copy; |
5094 | } |
5095 | start = end; |
5096 | } |
5097 | BUG_ON(len); |
5098 | return elt; |
5099 | } |
5100 | |
5101 | /** |
5102 | * skb_to_sgvec - Fill a scatter-gather list from a socket buffer |
5103 | * @skb: Socket buffer containing the buffers to be mapped |
5104 | * @sg: The scatter-gather list to map into |
5105 | * @offset: The offset into the buffer's contents to start mapping |
5106 | * @len: Length of buffer space to be mapped |
5107 | * |
5108 | * Fill the specified scatter-gather list with mappings/pointers into a |
5109 | * region of the buffer space attached to a socket buffer. Returns either |
5110 | * the number of scatterlist items used, or -EMSGSIZE if the contents |
5111 | * could not fit. |
5112 | */ |
5113 | int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) |
5114 | { |
	int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);
5116 | |
5117 | if (nsg <= 0) |
5118 | return nsg; |
5119 | |
	sg_mark_end(&sg[nsg - 1]);
5121 | |
5122 | return nsg; |
5123 | } |
5124 | EXPORT_SYMBOL_GPL(skb_to_sgvec); |
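
/* Illustrative usage sketch (editor addition, not part of the original file):
 * a typical caller sizes the scatterlist up front and treats -EMSGSIZE as
 * "skb too fragmented for this table". EXAMPLE_MAX_SG is a hypothetical bound.
 */
#if 0
static int example_map_skb_to_sg(struct sk_buff *skb)
{
	struct scatterlist sg[EXAMPLE_MAX_SG];
	int nsg;

	sg_init_table(sg, ARRAY_SIZE(sg));
	nsg = skb_to_sgvec(skb, sg, 0, skb->len);
	if (nsg < 0)
		return nsg;	/* usually -EMSGSIZE */
	/* hand sg[0..nsg-1] to DMA or crypto code here */
	return nsg;
}
#endif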
5125 | |
/* As compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the
 * given sglist without marking the sg which contains the last skb data as the
 * end. So the caller can manipulate the sg list at will when padding new data
 * after the first call, without calling sg_unmark_end to expand the sg list.
5130 | * |
5131 | * Scenario to use skb_to_sgvec_nomark: |
5132 | * 1. sg_init_table |
5133 | * 2. skb_to_sgvec_nomark(payload1) |
5134 | * 3. skb_to_sgvec_nomark(payload2) |
5135 | * |
5136 | * This is equivalent to: |
5137 | * 1. sg_init_table |
5138 | * 2. skb_to_sgvec(payload1) |
5139 | * 3. sg_unmark_end |
5140 | * 4. skb_to_sgvec(payload2) |
5141 | * |
 * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
 * is preferable.
5144 | */ |
5145 | int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg, |
5146 | int offset, int len) |
5147 | { |
	return __skb_to_sgvec(skb, sg, offset, len, 0);
5149 | } |
5150 | EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark); |
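
/* Illustrative sketch (editor addition): the two-call pattern described above,
 * terminating the list exactly once. 'sg', 'n', 'skb1', 'skb2' and
 * EXAMPLE_MAX_SG are hypothetical; error handling is elided for brevity.
 */
#if 0
	sg_init_table(sg, EXAMPLE_MAX_SG);
	n  = skb_to_sgvec_nomark(skb1, sg, 0, skb1->len);
	n += skb_to_sgvec_nomark(skb2, sg + n, 0, skb2->len);
	sg_mark_end(&sg[n - 1]);
#endif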
5151 | |
5152 | |
5153 | |
5154 | /** |
5155 | * skb_cow_data - Check that a socket buffer's data buffers are writable |
5156 | * @skb: The socket buffer to check. |
5157 | * @tailbits: Amount of trailing space to be added |
5158 | * @trailer: Returned pointer to the skb where the @tailbits space begins |
5159 | * |
5160 | * Make sure that the data buffers attached to a socket buffer are |
5161 | * writable. If they are not, private copies are made of the data buffers |
5162 | * and the socket buffer is set to use these instead. |
5163 | * |
5164 | * If @tailbits is given, make sure that there is space to write @tailbits |
5165 | * bytes of data beyond current end of socket buffer. @trailer will be |
5166 | * set to point to the skb in which this space begins. |
5167 | * |
5168 | * The number of scatterlist elements required to completely map the |
5169 | * COW'd and extended socket buffer will be returned. |
5170 | */ |
5171 | int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) |
5172 | { |
5173 | int copyflag; |
5174 | int elt; |
5175 | struct sk_buff *skb1, **skb_p; |
5176 | |
5177 | /* If skb is cloned or its head is paged, reallocate |
5178 | * head pulling out all the pages (pages are considered not writable |
5179 | * at the moment even if they are anonymous). |
5180 | */ |
5181 | if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && |
5182 | !__pskb_pull_tail(skb, __skb_pagelen(skb))) |
5183 | return -ENOMEM; |
5184 | |
	/* Easy case. Most packets will go this way. */
	if (!skb_has_frag_list(skb)) {
		/* A little trouble: not enough space for the trailer.
		 * This should not happen when the stack is tuned to generate
		 * good frames. On a miss we reallocate and reserve even more
		 * space; 128 bytes is fair. */
5191 | |
5192 | if (skb_tailroom(skb) < tailbits && |
5193 | pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) |
5194 | return -ENOMEM; |
5195 | |
5196 | /* Voila! */ |
5197 | *trailer = skb; |
5198 | return 1; |
5199 | } |
5200 | |
	/* Misery. We are in trouble and must mince the fragments... */
5202 | |
5203 | elt = 1; |
5204 | skb_p = &skb_shinfo(skb)->frag_list; |
5205 | copyflag = 0; |
5206 | |
5207 | while ((skb1 = *skb_p) != NULL) { |
5208 | int ntail = 0; |
5209 | |
5210 | /* The fragment is partially pulled by someone, |
5211 | * this can happen on input. Copy it and everything |
5212 | * after it. */ |
5213 | |
		if (skb_shared(skb1))
5215 | copyflag = 1; |
5216 | |
5217 | /* If the skb is the last, worry about trailer. */ |
5218 | |
5219 | if (skb1->next == NULL && tailbits) { |
			if (skb_shinfo(skb1)->nr_frags ||
			    skb_has_frag_list(skb1) ||
			    skb_tailroom(skb1) < tailbits)
5223 | ntail = tailbits + 128; |
5224 | } |
5225 | |
		if (copyflag ||
		    skb_cloned(skb1) ||
		    ntail ||
		    skb_shinfo(skb1)->nr_frags ||
		    skb_has_frag_list(skb1)) {
			struct sk_buff *skb2;

			/* No way around it - we have to copy the whole thing... */
			if (ntail == 0)
				skb2 = skb_copy(skb1, GFP_ATOMIC);
			else
				skb2 = skb_copy_expand(skb1,
						       skb_headroom(skb1),
						       ntail,
						       GFP_ATOMIC);
			if (unlikely(skb2 == NULL))
				return -ENOMEM;

			if (skb1->sk)
				skb_set_owner_w(skb2, skb1->sk);

			/* Looking around. Are we still alive?
			 * OK, link the new skb, drop the old one */

			skb2->next = skb1->next;
			*skb_p = skb2;
			kfree_skb(skb1);
5253 | skb1 = skb2; |
5254 | } |
5255 | elt++; |
5256 | *trailer = skb1; |
5257 | skb_p = &skb1->next; |
5258 | } |
5259 | |
5260 | return elt; |
5261 | } |
5262 | EXPORT_SYMBOL_GPL(skb_cow_data); |
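
/* Illustrative sketch (editor addition): the usual IPsec-style pairing of
 * skb_cow_data() with skb_to_sgvec(); the returned element count bounds the
 * scatterlist needed for the COW'd chain. 'sg', 'trailer' and 'taillen' are
 * hypothetical locals of the caller.
 */
#if 0
	nfrags = skb_cow_data(skb, taillen, &trailer);
	if (nfrags < 0)
		return nfrags;
	sg_init_table(sg, nfrags);
	err = skb_to_sgvec(skb, sg, 0, skb->len);
	if (err < 0)
		return err;
#endif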
5263 | |
5264 | static void sock_rmem_free(struct sk_buff *skb) |
5265 | { |
5266 | struct sock *sk = skb->sk; |
5267 | |
	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
5269 | } |
5270 | |
5271 | static void skb_set_err_queue(struct sk_buff *skb) |
5272 | { |
5273 | /* pkt_type of skbs received on local sockets is never PACKET_OUTGOING. |
5274 | * So, it is safe to (mis)use it to mark skbs on the error queue. |
5275 | */ |
5276 | skb->pkt_type = PACKET_OUTGOING; |
5277 | BUILD_BUG_ON(PACKET_OUTGOING == 0); |
5278 | } |
5279 | |
5280 | /* |
 * Note: We don't mem charge error packets (no sk_forward_alloc changes)
5282 | */ |
5283 | int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) |
5284 | { |
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
5286 | (unsigned int)READ_ONCE(sk->sk_rcvbuf)) |
5287 | return -ENOMEM; |
5288 | |
5289 | skb_orphan(skb); |
5290 | skb->sk = sk; |
5291 | skb->destructor = sock_rmem_free; |
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
5293 | skb_set_err_queue(skb); |
5294 | |
5295 | /* before exiting rcu section, make sure dst is refcounted */ |
5296 | skb_dst_force(skb); |
5297 | |
5298 | skb_queue_tail(&sk->sk_error_queue, skb); |
	if (!sock_flag(sk, SOCK_DEAD))
5300 | sk_error_report(sk); |
5301 | return 0; |
5302 | } |
5303 | EXPORT_SYMBOL(sock_queue_err_skb); |
5304 | |
5305 | static bool is_icmp_err_skb(const struct sk_buff *skb) |
5306 | { |
5307 | return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP || |
5308 | SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6); |
5309 | } |
5310 | |
5311 | struct sk_buff *sock_dequeue_err_skb(struct sock *sk) |
5312 | { |
5313 | struct sk_buff_head *q = &sk->sk_error_queue; |
5314 | struct sk_buff *skb, *skb_next = NULL; |
5315 | bool icmp_next = false; |
5316 | unsigned long flags; |
5317 | |
	if (skb_queue_empty_lockless(q))
		return NULL;

	spin_lock_irqsave(&q->lock, flags);
	skb = __skb_dequeue(q);
	if (skb && (skb_next = skb_peek(q))) {
		icmp_next = is_icmp_err_skb(skb_next);
		if (icmp_next)
			sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
	}
	spin_unlock_irqrestore(&q->lock, flags);
5329 | |
5330 | if (is_icmp_err_skb(skb) && !icmp_next) |
5331 | sk->sk_err = 0; |
5332 | |
5333 | if (skb_next) |
5334 | sk_error_report(sk); |
5335 | |
5336 | return skb; |
5337 | } |
5338 | EXPORT_SYMBOL(sock_dequeue_err_skb); |
5339 | |
5340 | /** |
5341 | * skb_clone_sk - create clone of skb, and take reference to socket |
5342 | * @skb: the skb to clone |
5343 | * |
5344 | * This function creates a clone of a buffer that holds a reference on |
5345 | * sk_refcnt. Buffers created via this function are meant to be |
 * returned using sock_queue_err_skb, or freed via kfree_skb.
5347 | * |
5348 | * When passing buffers allocated with this function to sock_queue_err_skb |
5349 | * it is necessary to wrap the call with sock_hold/sock_put in order to |
5350 | * prevent the socket from being released prior to being enqueued on |
5351 | * the sk_error_queue. |
5352 | */ |
5353 | struct sk_buff *skb_clone_sk(struct sk_buff *skb) |
5354 | { |
5355 | struct sock *sk = skb->sk; |
5356 | struct sk_buff *clone; |
5357 | |
	if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
5359 | return NULL; |
5360 | |
5361 | clone = skb_clone(skb, GFP_ATOMIC); |
5362 | if (!clone) { |
5363 | sock_put(sk); |
5364 | return NULL; |
5365 | } |
5366 | |
5367 | clone->sk = sk; |
5368 | clone->destructor = sock_efree; |
5369 | |
5370 | return clone; |
5371 | } |
5372 | EXPORT_SYMBOL(skb_clone_sk); |
5373 | |
5374 | static void __skb_complete_tx_timestamp(struct sk_buff *skb, |
5375 | struct sock *sk, |
5376 | int tstype, |
5377 | bool opt_stats) |
5378 | { |
5379 | struct sock_exterr_skb *serr; |
5380 | int err; |
5381 | |
5382 | BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); |
5383 | |
5384 | serr = SKB_EXT_ERR(skb); |
5385 | memset(serr, 0, sizeof(*serr)); |
5386 | serr->ee.ee_errno = ENOMSG; |
5387 | serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; |
5388 | serr->ee.ee_info = tstype; |
5389 | serr->opt_stats = opt_stats; |
5390 | serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; |
5391 | if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { |
5392 | serr->ee.ee_data = skb_shinfo(skb)->tskey; |
5393 | if (sk_is_tcp(sk)) |
			serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
5395 | } |
5396 | |
5397 | err = sock_queue_err_skb(sk, skb); |
5398 | |
5399 | if (err) |
5400 | kfree_skb(skb); |
5401 | } |
5402 | |
5403 | static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly) |
5404 | { |
5405 | bool ret; |
5406 | |
5407 | if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly)) |
5408 | return true; |
5409 | |
5410 | read_lock_bh(&sk->sk_callback_lock); |
5411 | ret = sk->sk_socket && sk->sk_socket->file && |
	      file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW);
5413 | read_unlock_bh(&sk->sk_callback_lock); |
5414 | return ret; |
5415 | } |
5416 | |
5417 | void skb_complete_tx_timestamp(struct sk_buff *skb, |
5418 | struct skb_shared_hwtstamps *hwtstamps) |
5419 | { |
5420 | struct sock *sk = skb->sk; |
5421 | |
	if (!skb_may_tx_timestamp(sk, false))
5423 | goto err; |
5424 | |
5425 | /* Take a reference to prevent skb_orphan() from freeing the socket, |
5426 | * but only if the socket refcount is not zero. |
5427 | */ |
5428 | if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { |
5429 | *skb_hwtstamps(skb) = *hwtstamps; |
		__skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
5431 | sock_put(sk); |
5432 | return; |
5433 | } |
5434 | |
5435 | err: |
5436 | kfree_skb(skb); |
5437 | } |
5438 | EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); |
5439 | |
5440 | void __skb_tstamp_tx(struct sk_buff *orig_skb, |
5441 | const struct sk_buff *ack_skb, |
5442 | struct skb_shared_hwtstamps *hwtstamps, |
5443 | struct sock *sk, int tstype) |
5444 | { |
5445 | struct sk_buff *skb; |
5446 | bool tsonly, opt_stats = false; |
5447 | u32 tsflags; |
5448 | |
5449 | if (!sk) |
5450 | return; |
5451 | |
5452 | tsflags = READ_ONCE(sk->sk_tsflags); |
5453 | if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && |
5454 | skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) |
5455 | return; |
5456 | |
5457 | tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; |
5458 | if (!skb_may_tx_timestamp(sk, tsonly)) |
5459 | return; |
5460 | |
5461 | if (tsonly) { |
5462 | #ifdef CONFIG_INET |
5463 | if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && |
5464 | sk_is_tcp(sk)) { |
5465 | skb = tcp_get_timestamping_opt_stats(sk, orig_skb, |
5466 | ack_skb); |
5467 | opt_stats = true; |
5468 | } else |
5469 | #endif |
		skb = alloc_skb(0, GFP_ATOMIC);
5471 | } else { |
5472 | skb = skb_clone(orig_skb, GFP_ATOMIC); |
5473 | |
5474 | if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) { |
5475 | kfree_skb(skb); |
5476 | return; |
5477 | } |
5478 | } |
5479 | if (!skb) |
5480 | return; |
5481 | |
5482 | if (tsonly) { |
5483 | skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags & |
5484 | SKBTX_ANY_TSTAMP; |
5485 | skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey; |
5486 | } |
5487 | |
5488 | if (hwtstamps) |
5489 | *skb_hwtstamps(skb) = *hwtstamps; |
5490 | else |
5491 | __net_timestamp(skb); |
5492 | |
5493 | __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); |
5494 | } |
5495 | EXPORT_SYMBOL_GPL(__skb_tstamp_tx); |
5496 | |
5497 | void skb_tstamp_tx(struct sk_buff *orig_skb, |
5498 | struct skb_shared_hwtstamps *hwtstamps) |
5499 | { |
5500 | return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk, |
5501 | SCM_TSTAMP_SND); |
5502 | } |
5503 | EXPORT_SYMBOL_GPL(skb_tstamp_tx); |
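
/* Illustrative sketch (editor addition): the common driver-side TX timestamp
 * flow. Only the SKBTX_* handshake and the skb_tstamp_tx() call are the point;
 * example_read_tx_stamp_ns() is a hypothetical hardware read.
 */
#if 0
	if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) {
		struct skb_shared_hwtstamps hwts = { };

		skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
		/* ... transmit; then, on TX completion: ... */
		hwts.hwtstamp = ns_to_ktime(example_read_tx_stamp_ns());
		skb_tstamp_tx(skb, &hwts);
	}
#endif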
5504 | |
5505 | #ifdef CONFIG_WIRELESS |
5506 | void skb_complete_wifi_ack(struct sk_buff *skb, bool acked) |
5507 | { |
5508 | struct sock *sk = skb->sk; |
5509 | struct sock_exterr_skb *serr; |
5510 | int err = 1; |
5511 | |
5512 | skb->wifi_acked_valid = 1; |
5513 | skb->wifi_acked = acked; |
5514 | |
5515 | serr = SKB_EXT_ERR(skb); |
5516 | memset(serr, 0, sizeof(*serr)); |
5517 | serr->ee.ee_errno = ENOMSG; |
5518 | serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; |
5519 | |
5520 | /* Take a reference to prevent skb_orphan() from freeing the socket, |
5521 | * but only if the socket refcount is not zero. |
5522 | */ |
5523 | if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) { |
5524 | err = sock_queue_err_skb(sk, skb); |
5525 | sock_put(sk); |
5526 | } |
5527 | if (err) |
5528 | kfree_skb(skb); |
5529 | } |
5530 | EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); |
5531 | #endif /* CONFIG_WIRELESS */ |
5532 | |
5533 | /** |
5534 | * skb_partial_csum_set - set up and verify partial csum values for packet |
5535 | * @skb: the skb to set |
5536 | * @start: the number of bytes after skb->data to start checksumming. |
5537 | * @off: the offset from start to place the checksum. |
5538 | * |
5539 | * For untrusted partially-checksummed packets, we need to make sure the values |
5540 | * for skb->csum_start and skb->csum_offset are valid so we don't oops. |
5541 | * |
5542 | * This function checks and sets those values and skb->ip_summed: if this |
5543 | * returns false you should drop the packet. |
5544 | */ |
5545 | bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) |
5546 | { |
5547 | u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); |
5548 | u32 csum_start = skb_headroom(skb) + (u32)start; |
5549 | |
5550 | if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) { |
5551 | net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n" , |
5552 | start, off, skb_headroom(skb), skb_headlen(skb)); |
5553 | return false; |
5554 | } |
5555 | skb->ip_summed = CHECKSUM_PARTIAL; |
5556 | skb->csum_start = csum_start; |
5557 | skb->csum_offset = off; |
5558 | skb->transport_header = csum_start; |
5559 | return true; |
5560 | } |
5561 | EXPORT_SYMBOL_GPL(skb_partial_csum_set); |
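
/* Illustrative sketch (editor addition): how a caller handling untrusted
 * metadata (virtio/xen style) would use the helper. 'start' and 'off' are
 * hypothetical values taken from that metadata.
 */
#if 0
	if (!skb_partial_csum_set(skb, start, off))
		goto drop;	/* bogus offsets would oops us later */
#endif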
5562 | |
5563 | static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len, |
5564 | unsigned int max) |
5565 | { |
5566 | if (skb_headlen(skb) >= len) |
5567 | return 0; |
5568 | |
5569 | /* If we need to pullup then pullup to the max, so we |
5570 | * won't need to do it again. |
5571 | */ |
5572 | if (max > skb->len) |
5573 | max = skb->len; |
5574 | |
5575 | if (__pskb_pull_tail(skb, max - skb_headlen(skb)) == NULL) |
5576 | return -ENOMEM; |
5577 | |
5578 | if (skb_headlen(skb) < len) |
5579 | return -EPROTO; |
5580 | |
5581 | return 0; |
5582 | } |
5583 | |
5584 | #define MAX_TCP_HDR_LEN (15 * 4) |
5585 | |
5586 | static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb, |
5587 | typeof(IPPROTO_IP) proto, |
5588 | unsigned int off) |
5589 | { |
5590 | int err; |
5591 | |
5592 | switch (proto) { |
5593 | case IPPROTO_TCP: |
		err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
					  off + MAX_TCP_HDR_LEN);
		if (!err && !skb_partial_csum_set(skb, off,
						  offsetof(struct tcphdr,
							   check)))
			err = -EPROTO;
		return err ? ERR_PTR(err) : &tcp_hdr(skb)->check;

	case IPPROTO_UDP:
		err = skb_maybe_pull_tail(skb, off + sizeof(struct udphdr),
					  off + sizeof(struct udphdr));
		if (!err && !skb_partial_csum_set(skb, off,
						  offsetof(struct udphdr,
							   check)))
			err = -EPROTO;
		return err ? ERR_PTR(err) : &udp_hdr(skb)->check;
5610 | } |
5611 | |
	return ERR_PTR(-EPROTO);
5613 | } |
5614 | |
5615 | /* This value should be large enough to cover a tagged ethernet header plus |
5616 | * maximally sized IP and TCP or UDP headers. |
5617 | */ |
5618 | #define MAX_IP_HDR_LEN 128 |
5619 | |
5620 | static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate) |
5621 | { |
5622 | unsigned int off; |
5623 | bool fragment; |
5624 | __sum16 *csum; |
5625 | int err; |
5626 | |
5627 | fragment = false; |
5628 | |
	err = skb_maybe_pull_tail(skb,
				  sizeof(struct iphdr),
				  MAX_IP_HDR_LEN);
5632 | if (err < 0) |
5633 | goto out; |
5634 | |
	if (ip_is_fragment(ip_hdr(skb)))
5636 | fragment = true; |
5637 | |
5638 | off = ip_hdrlen(skb); |
5639 | |
5640 | err = -EPROTO; |
5641 | |
5642 | if (fragment) |
5643 | goto out; |
5644 | |
	csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
	if (IS_ERR(csum))
		return PTR_ERR(csum);

	if (recalculate)
		*csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
					   ip_hdr(skb)->daddr,
					   skb->len - off,
					   ip_hdr(skb)->protocol, 0);
5654 | err = 0; |
5655 | |
5656 | out: |
5657 | return err; |
5658 | } |
5659 | |
5660 | /* This value should be large enough to cover a tagged ethernet header plus |
5661 | * an IPv6 header, all options, and a maximal TCP or UDP header. |
5662 | */ |
5663 | #define MAX_IPV6_HDR_LEN 256 |
5664 | |
5665 | #define OPT_HDR(type, skb, off) \ |
5666 | (type *)(skb_network_header(skb) + (off)) |
5667 | |
5668 | static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate) |
5669 | { |
5670 | int err; |
5671 | u8 nexthdr; |
5672 | unsigned int off; |
5673 | unsigned int len; |
5674 | bool fragment; |
5675 | bool done; |
5676 | __sum16 *csum; |
5677 | |
5678 | fragment = false; |
5679 | done = false; |
5680 | |
5681 | off = sizeof(struct ipv6hdr); |
5682 | |
	err = skb_maybe_pull_tail(skb, off, MAX_IPV6_HDR_LEN);
5684 | if (err < 0) |
5685 | goto out; |
5686 | |
5687 | nexthdr = ipv6_hdr(skb)->nexthdr; |
5688 | |
5689 | len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); |
5690 | while (off <= len && !done) { |
5691 | switch (nexthdr) { |
5692 | case IPPROTO_DSTOPTS: |
5693 | case IPPROTO_HOPOPTS: |
5694 | case IPPROTO_ROUTING: { |
5695 | struct ipv6_opt_hdr *hp; |
5696 | |
5697 | err = skb_maybe_pull_tail(skb, |
						  off +
5699 | sizeof(struct ipv6_opt_hdr), |
5700 | MAX_IPV6_HDR_LEN); |
5701 | if (err < 0) |
5702 | goto out; |
5703 | |
5704 | hp = OPT_HDR(struct ipv6_opt_hdr, skb, off); |
5705 | nexthdr = hp->nexthdr; |
5706 | off += ipv6_optlen(hp); |
5707 | break; |
5708 | } |
5709 | case IPPROTO_AH: { |
5710 | struct ip_auth_hdr *hp; |
5711 | |
5712 | err = skb_maybe_pull_tail(skb, |
						  off +
5714 | sizeof(struct ip_auth_hdr), |
5715 | MAX_IPV6_HDR_LEN); |
5716 | if (err < 0) |
5717 | goto out; |
5718 | |
5719 | hp = OPT_HDR(struct ip_auth_hdr, skb, off); |
5720 | nexthdr = hp->nexthdr; |
5721 | off += ipv6_authlen(hp); |
5722 | break; |
5723 | } |
5724 | case IPPROTO_FRAGMENT: { |
5725 | struct frag_hdr *hp; |
5726 | |
5727 | err = skb_maybe_pull_tail(skb, |
						  off +
5729 | sizeof(struct frag_hdr), |
5730 | MAX_IPV6_HDR_LEN); |
5731 | if (err < 0) |
5732 | goto out; |
5733 | |
5734 | hp = OPT_HDR(struct frag_hdr, skb, off); |
5735 | |
5736 | if (hp->frag_off & htons(IP6_OFFSET | IP6_MF)) |
5737 | fragment = true; |
5738 | |
5739 | nexthdr = hp->nexthdr; |
5740 | off += sizeof(struct frag_hdr); |
5741 | break; |
5742 | } |
5743 | default: |
5744 | done = true; |
5745 | break; |
5746 | } |
5747 | } |
5748 | |
5749 | err = -EPROTO; |
5750 | |
5751 | if (!done || fragment) |
5752 | goto out; |
5753 | |
	csum = skb_checksum_setup_ip(skb, nexthdr, off);
	if (IS_ERR(csum))
		return PTR_ERR(csum);
5757 | |
5758 | if (recalculate) |
		*csum = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
					 &ipv6_hdr(skb)->daddr,
					 skb->len - off, nexthdr, 0);
5762 | err = 0; |
5763 | |
5764 | out: |
5765 | return err; |
5766 | } |
5767 | |
5768 | /** |
5769 | * skb_checksum_setup - set up partial checksum offset |
5770 | * @skb: the skb to set up |
5771 | * @recalculate: if true the pseudo-header checksum will be recalculated |
5772 | */ |
5773 | int skb_checksum_setup(struct sk_buff *skb, bool recalculate) |
5774 | { |
5775 | int err; |
5776 | |
5777 | switch (skb->protocol) { |
5778 | case htons(ETH_P_IP): |
5779 | err = skb_checksum_setup_ipv4(skb, recalculate); |
5780 | break; |
5781 | |
5782 | case htons(ETH_P_IPV6): |
5783 | err = skb_checksum_setup_ipv6(skb, recalculate); |
5784 | break; |
5785 | |
5786 | default: |
5787 | err = -EPROTO; |
5788 | break; |
5789 | } |
5790 | |
5791 | return err; |
5792 | } |
5793 | EXPORT_SYMBOL(skb_checksum_setup); |
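
/* Illustrative sketch (editor addition): a guest-facing backend receiving
 * CHECKSUM_PARTIAL frames typically validates or repairs the offsets like
 * this before handing the skb to the stack.
 */
#if 0
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    skb_checksum_setup(skb, true)) {
		kfree_skb(skb);
		return;
	}
#endif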
5794 | |
5795 | /** |
5796 | * skb_checksum_maybe_trim - maybe trims the given skb |
5797 | * @skb: the skb to check |
5798 | * @transport_len: the data length beyond the network header |
5799 | * |
5800 | * Checks whether the given skb has data beyond the given transport length. |
5801 | * If so, returns a cloned skb trimmed to this transport length. |
5802 | * Otherwise returns the provided skb. Returns NULL in error cases |
5803 | * (e.g. transport_len exceeds skb length or out-of-memory). |
5804 | * |
5805 | * Caller needs to set the skb transport header and free any returned skb if it |
5806 | * differs from the provided skb. |
5807 | */ |
5808 | static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, |
5809 | unsigned int transport_len) |
5810 | { |
5811 | struct sk_buff *skb_chk; |
5812 | unsigned int len = skb_transport_offset(skb) + transport_len; |
5813 | int ret; |
5814 | |
5815 | if (skb->len < len) |
5816 | return NULL; |
5817 | else if (skb->len == len) |
5818 | return skb; |
5819 | |
5820 | skb_chk = skb_clone(skb, GFP_ATOMIC); |
5821 | if (!skb_chk) |
5822 | return NULL; |
5823 | |
	ret = pskb_trim_rcsum(skb_chk, len);
	if (ret) {
		kfree_skb(skb_chk);
5827 | return NULL; |
5828 | } |
5829 | |
5830 | return skb_chk; |
5831 | } |
5832 | |
5833 | /** |
5834 | * skb_checksum_trimmed - validate checksum of an skb |
5835 | * @skb: the skb to check |
5836 | * @transport_len: the data length beyond the network header |
5837 | * @skb_chkf: checksum function to use |
5838 | * |
5839 | * Applies the given checksum function skb_chkf to the provided skb. |
5840 | * Returns a checked and maybe trimmed skb. Returns NULL on error. |
5841 | * |
5842 | * If the skb has data beyond the given transport length, then a |
5843 | * trimmed & cloned skb is checked and returned. |
5844 | * |
5845 | * Caller needs to set the skb transport header and free any returned skb if it |
5846 | * differs from the provided skb. |
5847 | */ |
5848 | struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, |
5849 | unsigned int transport_len, |
5850 | __sum16(*skb_chkf)(struct sk_buff *skb)) |
5851 | { |
5852 | struct sk_buff *skb_chk; |
5853 | unsigned int offset = skb_transport_offset(skb); |
5854 | __sum16 ret; |
5855 | |
5856 | skb_chk = skb_checksum_maybe_trim(skb, transport_len); |
5857 | if (!skb_chk) |
5858 | goto err; |
5859 | |
	if (!pskb_may_pull(skb_chk, offset))
5861 | goto err; |
5862 | |
5863 | skb_pull_rcsum(skb_chk, offset); |
5864 | ret = skb_chkf(skb_chk); |
	skb_push_rcsum(skb_chk, offset);
5866 | |
5867 | if (ret) |
5868 | goto err; |
5869 | |
5870 | return skb_chk; |
5871 | |
5872 | err: |
5873 | if (skb_chk && skb_chk != skb) |
		kfree_skb(skb_chk);
5875 | |
5876 | return NULL; |
5877 | |
5878 | } |
5879 | EXPORT_SYMBOL(skb_checksum_trimmed); |
5880 | |
5881 | void __skb_warn_lro_forwarding(const struct sk_buff *skb) |
5882 | { |
5883 | net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n" , |
5884 | skb->dev->name); |
5885 | } |
5886 | EXPORT_SYMBOL(__skb_warn_lro_forwarding); |
5887 | |
5888 | void kfree_skb_partial(struct sk_buff *skb, bool head_stolen) |
5889 | { |
5890 | if (head_stolen) { |
5891 | skb_release_head_state(skb); |
		kmem_cache_free(net_hotdata.skbuff_cache, skb);
5893 | } else { |
5894 | __kfree_skb(skb); |
5895 | } |
5896 | } |
5897 | EXPORT_SYMBOL(kfree_skb_partial); |
5898 | |
5899 | /** |
5900 | * skb_try_coalesce - try to merge skb to prior one |
5901 | * @to: prior buffer |
5902 | * @from: buffer to add |
5903 | * @fragstolen: pointer to boolean |
5904 | * @delta_truesize: how much more was allocated than was requested |
5905 | */ |
5906 | bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, |
5907 | bool *fragstolen, int *delta_truesize) |
5908 | { |
5909 | struct skb_shared_info *to_shinfo, *from_shinfo; |
5910 | int i, delta, len = from->len; |
5911 | |
5912 | *fragstolen = false; |
5913 | |
	if (skb_cloned(to))
5915 | return false; |
5916 | |
	/* In general, avoid mixing page_pool and non-page_pool allocated
	 * pages within the same SKB. In theory we could take full
	 * references if @from is cloned and !@to->pp_recycle, but it's
	 * tricky (due to a potential race with the clone disappearing) and
	 * rare, so it is not worth dealing with.
5922 | */ |
5923 | if (to->pp_recycle != from->pp_recycle) |
5924 | return false; |
5925 | |
	if (len <= skb_tailroom(to)) {
5927 | if (len) |
5928 | BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len)); |
5929 | *delta_truesize = 0; |
5930 | return true; |
5931 | } |
5932 | |
5933 | to_shinfo = skb_shinfo(to); |
5934 | from_shinfo = skb_shinfo(from); |
5935 | if (to_shinfo->frag_list || from_shinfo->frag_list) |
5936 | return false; |
	if (skb_zcopy(to) || skb_zcopy(from))
5938 | return false; |
5939 | |
	if (skb_headlen(from) != 0) {
5941 | struct page *page; |
5942 | unsigned int offset; |
5943 | |
5944 | if (to_shinfo->nr_frags + |
5945 | from_shinfo->nr_frags >= MAX_SKB_FRAGS) |
5946 | return false; |
5947 | |
		if (skb_head_is_locked(from))
5949 | return false; |
5950 | |
5951 | delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); |
5952 | |
		page = virt_to_head_page(from->head);
5954 | offset = from->data - (unsigned char *)page_address(page); |
5955 | |
		skb_fill_page_desc(to, to_shinfo->nr_frags,
				   page, offset, skb_headlen(from));
5958 | *fragstolen = true; |
5959 | } else { |
5960 | if (to_shinfo->nr_frags + |
5961 | from_shinfo->nr_frags > MAX_SKB_FRAGS) |
5962 | return false; |
5963 | |
5964 | delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from)); |
5965 | } |
5966 | |
5967 | WARN_ON_ONCE(delta < len); |
5968 | |
5969 | memcpy(to_shinfo->frags + to_shinfo->nr_frags, |
5970 | from_shinfo->frags, |
5971 | from_shinfo->nr_frags * sizeof(skb_frag_t)); |
5972 | to_shinfo->nr_frags += from_shinfo->nr_frags; |
5973 | |
	if (!skb_cloned(from))
5975 | from_shinfo->nr_frags = 0; |
5976 | |
5977 | /* if the skb is not cloned this does nothing |
5978 | * since we set nr_frags to 0. |
5979 | */ |
	if (skb_pp_frag_ref(from)) {
		for (i = 0; i < from_shinfo->nr_frags; i++)
			__skb_frag_ref(&from_shinfo->frags[i]);
5983 | } |
5984 | |
5985 | to->truesize += delta; |
5986 | to->len += len; |
5987 | to->data_len += len; |
5988 | |
5989 | *delta_truesize = delta; |
5990 | return true; |
5991 | } |
5992 | EXPORT_SYMBOL(skb_try_coalesce); |
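
/* Illustrative sketch (editor addition): the usual receive-queue pattern, as
 * in TCP coalescing - on success the donor skb is released with
 * kfree_skb_partial() so a stolen head is not freed twice, and the caller
 * absorbs delta_truesize into its memory accounting. 'tail', 'queue',
 * 'fragstolen' and 'delta' are hypothetical caller state.
 */
#if 0
	if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
		/* charge 'delta' to the queue owner here */
		kfree_skb_partial(skb, fragstolen);
	} else {
		__skb_queue_tail(&queue, skb);
	}
#endif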
5993 | |
5994 | /** |
5995 | * skb_scrub_packet - scrub an skb |
5996 | * |
5997 | * @skb: buffer to clean |
5998 | * @xnet: packet is crossing netns |
5999 | * |
 * skb_scrub_packet can be used after encapsulating or decapsulating a packet
 * into/from a tunnel. Some information has to be cleared during these
 * operations.
 * skb_scrub_packet can also be used to clean an skb before injecting it into
 * another namespace (@xnet == true). We have to clear all information in the
 * skb that could impact namespace isolation.
6006 | */ |
6007 | void skb_scrub_packet(struct sk_buff *skb, bool xnet) |
6008 | { |
6009 | skb->pkt_type = PACKET_HOST; |
6010 | skb->skb_iif = 0; |
6011 | skb->ignore_df = 0; |
6012 | skb_dst_drop(skb); |
6013 | skb_ext_reset(skb); |
6014 | nf_reset_ct(skb); |
6015 | nf_reset_trace(skb); |
6016 | |
6017 | #ifdef CONFIG_NET_SWITCHDEV |
6018 | skb->offload_fwd_mark = 0; |
6019 | skb->offload_l3_fwd_mark = 0; |
6020 | #endif |
6021 | |
6022 | if (!xnet) |
6023 | return; |
6024 | |
6025 | ipvs_reset(skb); |
6026 | skb->mark = 0; |
6027 | skb_clear_tstamp(skb); |
6028 | } |
6029 | EXPORT_SYMBOL_GPL(skb_scrub_packet); |
6030 | |
static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
{
	int mac_len, meta_len;
	void *meta;

	if (skb_cow(skb, skb_headroom(skb)) < 0) {
6037 | kfree_skb(skb); |
6038 | return NULL; |
6039 | } |
6040 | |
6041 | mac_len = skb->data - skb_mac_header(skb); |
6042 | if (likely(mac_len > VLAN_HLEN + ETH_TLEN)) { |
6043 | memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb), |
6044 | mac_len - VLAN_HLEN - ETH_TLEN); |
6045 | } |
6046 | |
6047 | meta_len = skb_metadata_len(skb); |
6048 | if (meta_len) { |
6049 | meta = skb_metadata_end(skb) - meta_len; |
6050 | memmove(meta + VLAN_HLEN, meta, meta_len); |
6051 | } |
6052 | |
6053 | skb->mac_header += VLAN_HLEN; |
6054 | return skb; |
6055 | } |
6056 | |
6057 | struct sk_buff *skb_vlan_untag(struct sk_buff *skb) |
6058 | { |
6059 | struct vlan_hdr *vhdr; |
6060 | u16 vlan_tci; |
6061 | |
6062 | if (unlikely(skb_vlan_tag_present(skb))) { |
6063 | /* vlan_tci is already set-up so leave this for another time */ |
6064 | return skb; |
6065 | } |
6066 | |
6067 | skb = skb_share_check(skb, GFP_ATOMIC); |
6068 | if (unlikely(!skb)) |
6069 | goto err_free; |
6070 | /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */ |
6071 | if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short)))) |
6072 | goto err_free; |
6073 | |
6074 | vhdr = (struct vlan_hdr *)skb->data; |
6075 | vlan_tci = ntohs(vhdr->h_vlan_TCI); |
	__vlan_hwaccel_put_tag(skb, skb->protocol, vlan_tci);
6077 | |
6078 | skb_pull_rcsum(skb, VLAN_HLEN); |
6079 | vlan_set_encap_proto(skb, vhdr); |
6080 | |
6081 | skb = skb_reorder_vlan_header(skb); |
6082 | if (unlikely(!skb)) |
6083 | goto err_free; |
6084 | |
6085 | skb_reset_network_header(skb); |
6086 | if (!skb_transport_header_was_set(skb)) |
6087 | skb_reset_transport_header(skb); |
6088 | skb_reset_mac_len(skb); |
6089 | |
6090 | return skb; |
6091 | |
6092 | err_free: |
6093 | kfree_skb(skb); |
6094 | return NULL; |
6095 | } |
6096 | EXPORT_SYMBOL(skb_vlan_untag); |
6097 | |
6098 | int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len) |
6099 | { |
	if (!pskb_may_pull(skb, write_len))
		return -ENOMEM;

	if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
6104 | return 0; |
6105 | |
6106 | return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); |
6107 | } |
6108 | EXPORT_SYMBOL(skb_ensure_writable); |
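
/* Illustrative sketch (editor addition): make the IPv4 header private before
 * rewriting it, as header-mangling code paths do; ip_decrease_ttl() also
 * fixes up the header checksum.
 */
#if 0
	err = skb_ensure_writable(skb, skb_network_offset(skb) +
				       sizeof(struct iphdr));
	if (err)
		return err;
	ip_decrease_ttl(ip_hdr(skb));	/* now safe to scribble on the header */
#endif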
6109 | |
6110 | int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev) |
6111 | { |
6112 | int needed_headroom = dev->needed_headroom; |
6113 | int needed_tailroom = dev->needed_tailroom; |
6114 | |
6115 | /* For tail taggers, we need to pad short frames ourselves, to ensure |
6116 | * that the tail tag does not fail at its role of being at the end of |
6117 | * the packet, once the conduit interface pads the frame. Account for |
6118 | * that pad length here, and pad later. |
6119 | */ |
6120 | if (unlikely(needed_tailroom && skb->len < ETH_ZLEN)) |
6121 | needed_tailroom += ETH_ZLEN - skb->len; |
6122 | /* skb_headroom() returns unsigned int... */ |
6123 | needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0); |
6124 | needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0); |
6125 | |
6126 | if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb))) |
6127 | /* No reallocation needed, yay! */ |
6128 | return 0; |
6129 | |
6130 | return pskb_expand_head(skb, needed_headroom, needed_tailroom, |
6131 | GFP_ATOMIC); |
6132 | } |
6133 | EXPORT_SYMBOL(skb_ensure_writable_head_tail); |
6134 | |
6135 | /* remove VLAN header from packet and update csum accordingly. |
6136 | * expects a non skb_vlan_tag_present skb with a vlan tag payload |
6137 | */ |
6138 | int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) |
6139 | { |
6140 | int offset = skb->data - skb_mac_header(skb); |
6141 | int err; |
6142 | |
6143 | if (WARN_ONCE(offset, |
6144 | "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n" , |
6145 | offset)) { |
6146 | return -EINVAL; |
6147 | } |
6148 | |
6149 | err = skb_ensure_writable(skb, VLAN_ETH_HLEN); |
6150 | if (unlikely(err)) |
6151 | return err; |
6152 | |
	skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
6154 | |
6155 | vlan_remove_tag(skb, vlan_tci); |
6156 | |
6157 | skb->mac_header += VLAN_HLEN; |
6158 | |
6159 | if (skb_network_offset(skb) < ETH_HLEN) |
6160 | skb_set_network_header(skb, ETH_HLEN); |
6161 | |
6162 | skb_reset_mac_len(skb); |
6163 | |
6164 | return err; |
6165 | } |
6166 | EXPORT_SYMBOL(__skb_vlan_pop); |
6167 | |
6168 | /* Pop a vlan tag either from hwaccel or from payload. |
6169 | * Expects skb->data at mac header. |
6170 | */ |
6171 | int skb_vlan_pop(struct sk_buff *skb) |
6172 | { |
6173 | u16 vlan_tci; |
6174 | __be16 vlan_proto; |
6175 | int err; |
6176 | |
6177 | if (likely(skb_vlan_tag_present(skb))) { |
6178 | __vlan_hwaccel_clear_tag(skb); |
6179 | } else { |
6180 | if (unlikely(!eth_type_vlan(skb->protocol))) |
6181 | return 0; |
6182 | |
6183 | err = __skb_vlan_pop(skb, &vlan_tci); |
6184 | if (err) |
6185 | return err; |
6186 | } |
6187 | /* move next vlan tag to hw accel tag */ |
6188 | if (likely(!eth_type_vlan(skb->protocol))) |
6189 | return 0; |
6190 | |
6191 | vlan_proto = skb->protocol; |
6192 | err = __skb_vlan_pop(skb, &vlan_tci); |
6193 | if (unlikely(err)) |
6194 | return err; |
6195 | |
6196 | __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); |
6197 | return 0; |
6198 | } |
6199 | EXPORT_SYMBOL(skb_vlan_pop); |
6200 | |
6201 | /* Push a vlan tag either into hwaccel or into payload (if hwaccel tag present). |
6202 | * Expects skb->data at mac header. |
6203 | */ |
6204 | int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) |
6205 | { |
6206 | if (skb_vlan_tag_present(skb)) { |
6207 | int offset = skb->data - skb_mac_header(skb); |
6208 | int err; |
6209 | |
6210 | if (WARN_ONCE(offset, |
6211 | "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n" , |
6212 | offset)) { |
6213 | return -EINVAL; |
6214 | } |
6215 | |
		err = __vlan_insert_tag(skb, skb->vlan_proto,
					skb_vlan_tag_get(skb));
6218 | if (err) |
6219 | return err; |
6220 | |
6221 | skb->protocol = skb->vlan_proto; |
6222 | skb->mac_len += VLAN_HLEN; |
6223 | |
		skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
6225 | } |
6226 | __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); |
6227 | return 0; |
6228 | } |
6229 | EXPORT_SYMBOL(skb_vlan_push); |
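
/* Illustrative sketch (editor addition): how an action such as OVS or act_vlan
 * rewrites the outer tag - pop whatever is there, then push the replacement.
 * Both helpers expect skb->data at the mac header; 'new_tci' is hypothetical.
 */
#if 0
	err = skb_vlan_pop(skb);
	if (!err)
		err = skb_vlan_push(skb, htons(ETH_P_8021Q), new_tci);
#endif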
6230 | |
6231 | /** |
6232 | * skb_eth_pop() - Drop the Ethernet header at the head of a packet |
6233 | * |
6234 | * @skb: Socket buffer to modify |
6235 | * |
6236 | * Drop the Ethernet header of @skb. |
6237 | * |
6238 | * Expects that skb->data points to the mac header and that no VLAN tags are |
6239 | * present. |
6240 | * |
6241 | * Returns 0 on success, -errno otherwise. |
6242 | */ |
6243 | int skb_eth_pop(struct sk_buff *skb) |
6244 | { |
6245 | if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) || |
6246 | skb_network_offset(skb) < ETH_HLEN) |
6247 | return -EPROTO; |
6248 | |
6249 | skb_pull_rcsum(skb, ETH_HLEN); |
6250 | skb_reset_mac_header(skb); |
6251 | skb_reset_mac_len(skb); |
6252 | |
6253 | return 0; |
6254 | } |
6255 | EXPORT_SYMBOL(skb_eth_pop); |
6256 | |
6257 | /** |
6258 | * skb_eth_push() - Add a new Ethernet header at the head of a packet |
6259 | * |
6260 | * @skb: Socket buffer to modify |
6261 | * @dst: Destination MAC address of the new header |
6262 | * @src: Source MAC address of the new header |
6263 | * |
6264 | * Prepend @skb with a new Ethernet header. |
6265 | * |
6266 | * Expects that skb->data points to the mac header, which must be empty. |
6267 | * |
6268 | * Returns 0 on success, -errno otherwise. |
6269 | */ |
6270 | int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, |
6271 | const unsigned char *src) |
6272 | { |
6273 | struct ethhdr *eth; |
6274 | int err; |
6275 | |
6276 | if (skb_network_offset(skb) || skb_vlan_tag_present(skb)) |
6277 | return -EPROTO; |
6278 | |
	err = skb_cow_head(skb, sizeof(*eth));
6280 | if (err < 0) |
6281 | return err; |
6282 | |
6283 | skb_push(skb, sizeof(*eth)); |
6284 | skb_reset_mac_header(skb); |
6285 | skb_reset_mac_len(skb); |
6286 | |
6287 | eth = eth_hdr(skb); |
	ether_addr_copy(eth->h_dest, dst);
	ether_addr_copy(eth->h_source, src);
6290 | eth->h_proto = skb->protocol; |
6291 | |
	skb_postpush_rcsum(skb, eth, sizeof(*eth));
6293 | |
6294 | return 0; |
6295 | } |
6296 | EXPORT_SYMBOL(skb_eth_push); |
6297 | |
6298 | /* Update the ethertype of hdr and the skb csum value if required. */ |
6299 | static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr, |
6300 | __be16 ethertype) |
6301 | { |
6302 | if (skb->ip_summed == CHECKSUM_COMPLETE) { |
6303 | __be16 diff[] = { ~hdr->h_proto, ethertype }; |
6304 | |
		skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
6306 | } |
6307 | |
6308 | hdr->h_proto = ethertype; |
6309 | } |
6310 | |
6311 | /** |
6312 | * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of |
6313 | * the packet |
6314 | * |
6315 | * @skb: buffer |
6316 | * @mpls_lse: MPLS label stack entry to push |
6317 | * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848) |
6318 | * @mac_len: length of the MAC header |
6319 | * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is |
6320 | * ethernet |
6321 | * |
6322 | * Expects skb->data at mac header. |
6323 | * |
6324 | * Returns 0 on success, -errno otherwise. |
6325 | */ |
6326 | int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, |
6327 | int mac_len, bool ethernet) |
6328 | { |
6329 | struct mpls_shim_hdr *lse; |
6330 | int err; |
6331 | |
6332 | if (unlikely(!eth_p_mpls(mpls_proto))) |
6333 | return -EINVAL; |
6334 | |
6335 | /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */ |
6336 | if (skb->encapsulation) |
6337 | return -EINVAL; |
6338 | |
6339 | err = skb_cow_head(skb, MPLS_HLEN); |
6340 | if (unlikely(err)) |
6341 | return err; |
6342 | |
6343 | if (!skb->inner_protocol) { |
		skb_set_inner_network_header(skb, skb_network_offset(skb));
		skb_set_inner_protocol(skb, skb->protocol);
6346 | } |
6347 | |
6348 | skb_push(skb, MPLS_HLEN); |
6349 | memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), |
6350 | mac_len); |
6351 | skb_reset_mac_header(skb); |
	skb_set_network_header(skb, mac_len);
6353 | skb_reset_mac_len(skb); |
6354 | |
6355 | lse = mpls_hdr(skb); |
6356 | lse->label_stack_entry = mpls_lse; |
	skb_postpush_rcsum(skb, lse, MPLS_HLEN);
6358 | |
6359 | if (ethernet && mac_len >= ETH_HLEN) |
		skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
6361 | skb->protocol = mpls_proto; |
6362 | |
6363 | return 0; |
6364 | } |
6365 | EXPORT_SYMBOL_GPL(skb_mpls_push); |
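
/* Illustrative sketch (editor addition): pushing a label the way a datapath
 * action would, on an ethernet frame whose mac header length is already known.
 * 'lse' is a hypothetical __be32 carrying the packed label/TC/S/TTL fields.
 */
#if 0
	err = skb_mpls_push(skb, lse, htons(ETH_P_MPLS_UC), skb->mac_len, true);
	if (err)
		return err;
#endif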
6366 | |
6367 | /** |
6368 | * skb_mpls_pop() - pop the outermost MPLS header |
6369 | * |
6370 | * @skb: buffer |
6371 | * @next_proto: ethertype of header after popped MPLS header |
6372 | * @mac_len: length of the MAC header |
6373 | * @ethernet: flag to indicate if the packet is ethernet |
6374 | * |
6375 | * Expects skb->data at mac header. |
6376 | * |
6377 | * Returns 0 on success, -errno otherwise. |
6378 | */ |
6379 | int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, |
6380 | bool ethernet) |
6381 | { |
6382 | int err; |
6383 | |
6384 | if (unlikely(!eth_p_mpls(skb->protocol))) |
6385 | return 0; |
6386 | |
6387 | err = skb_ensure_writable(skb, mac_len + MPLS_HLEN); |
6388 | if (unlikely(err)) |
6389 | return err; |
6390 | |
	skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
6392 | memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), |
6393 | mac_len); |
6394 | |
6395 | __skb_pull(skb, MPLS_HLEN); |
6396 | skb_reset_mac_header(skb); |
	skb_set_network_header(skb, mac_len);
6398 | |
6399 | if (ethernet && mac_len >= ETH_HLEN) { |
6400 | struct ethhdr *hdr; |
6401 | |
6402 | /* use mpls_hdr() to get ethertype to account for VLANs. */ |
6403 | hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN); |
		skb_mod_eth_type(skb, hdr, next_proto);
6405 | } |
6406 | skb->protocol = next_proto; |
6407 | |
6408 | return 0; |
6409 | } |
6410 | EXPORT_SYMBOL_GPL(skb_mpls_pop); |
6411 | |
6412 | /** |
6413 | * skb_mpls_update_lse() - modify outermost MPLS header and update csum |
6414 | * |
6415 | * @skb: buffer |
6416 | * @mpls_lse: new MPLS label stack entry to update to |
6417 | * |
6418 | * Expects skb->data at mac header. |
6419 | * |
6420 | * Returns 0 on success, -errno otherwise. |
6421 | */ |
6422 | int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse) |
6423 | { |
6424 | int err; |
6425 | |
6426 | if (unlikely(!eth_p_mpls(skb->protocol))) |
6427 | return -EINVAL; |
6428 | |
6429 | err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); |
6430 | if (unlikely(err)) |
6431 | return err; |
6432 | |
6433 | if (skb->ip_summed == CHECKSUM_COMPLETE) { |
6434 | __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse }; |
6435 | |
		skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
6437 | } |
6438 | |
6439 | mpls_hdr(skb)->label_stack_entry = mpls_lse; |
6440 | |
6441 | return 0; |
6442 | } |
6443 | EXPORT_SYMBOL_GPL(skb_mpls_update_lse); |
6444 | |
6445 | /** |
6446 | * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header |
6447 | * |
6448 | * @skb: buffer |
6449 | * |
6450 | * Expects skb->data at mac header. |
6451 | * |
6452 | * Returns 0 on success, -errno otherwise. |
6453 | */ |
6454 | int skb_mpls_dec_ttl(struct sk_buff *skb) |
6455 | { |
6456 | u32 lse; |
6457 | u8 ttl; |
6458 | |
6459 | if (unlikely(!eth_p_mpls(skb->protocol))) |
6460 | return -EINVAL; |
6461 | |
	if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
6463 | return -ENOMEM; |
6464 | |
6465 | lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry); |
6466 | ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; |
6467 | if (!--ttl) |
6468 | return -EINVAL; |
6469 | |
6470 | lse &= ~MPLS_LS_TTL_MASK; |
6471 | lse |= ttl << MPLS_LS_TTL_SHIFT; |
6472 | |
6473 | return skb_mpls_update_lse(skb, cpu_to_be32(lse)); |
6474 | } |
6475 | EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); |
6476 | |
6477 | /** |
6478 | * alloc_skb_with_frags - allocate skb with page frags |
6479 | * |
6480 | * @header_len: size of linear part |
6481 | * @data_len: needed length in frags |
6482 | * @order: max page order desired. |
6483 | * @errcode: pointer to error code if any |
6484 | * @gfp_mask: allocation mask |
6485 | * |
6486 | * This can be used to allocate a paged skb, given a maximal order for frags. |
6487 | */ |
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
6489 | unsigned long data_len, |
6490 | int order, |
6491 | int *errcode, |
6492 | gfp_t gfp_mask) |
6493 | { |
6494 | unsigned long chunk; |
6495 | struct sk_buff *skb; |
6496 | struct page *page; |
6497 | int nr_frags = 0; |
6498 | |
6499 | *errcode = -EMSGSIZE; |
6500 | if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) |
6501 | return NULL; |
6502 | |
6503 | *errcode = -ENOBUFS; |
	skb = alloc_skb(header_len, gfp_mask);
6505 | if (!skb) |
6506 | return NULL; |
6507 | |
6508 | while (data_len) { |
6509 | if (nr_frags == MAX_SKB_FRAGS - 1) |
6510 | goto failure; |
6511 | while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) |
6512 | order--; |
6513 | |
6514 | if (order) { |
			page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
6516 | __GFP_COMP | |
6517 | __GFP_NOWARN, |
6518 | order); |
6519 | if (!page) { |
6520 | order--; |
6521 | continue; |
6522 | } |
6523 | } else { |
6524 | page = alloc_page(gfp_mask); |
6525 | if (!page) |
6526 | goto failure; |
6527 | } |
6528 | chunk = min_t(unsigned long, data_len, |
6529 | PAGE_SIZE << order); |
		skb_fill_page_desc(skb, nr_frags, page, 0, chunk);
6531 | nr_frags++; |
6532 | skb->truesize += (PAGE_SIZE << order); |
6533 | data_len -= chunk; |
6534 | } |
6535 | return skb; |
6536 | |
6537 | failure: |
6538 | kfree_skb(skb); |
6539 | return NULL; |
6540 | } |
6541 | EXPORT_SYMBOL(alloc_skb_with_frags); |
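
/* Illustrative sketch (editor addition): the sock_alloc_send_pskb()-style
 * split of a message into a small linear part plus order-capped page frags.
 * 'hlen' and 'dlen' are hypothetical lengths derived from the message; note
 * the helper fills the frag descriptors but not skb->len/data_len.
 */
#if 0
	skb = alloc_skb_with_frags(hlen, dlen, PAGE_ALLOC_COSTLY_ORDER,
				   &errcode, GFP_KERNEL);
	if (!skb)
		return NULL;	/* 'errcode' says why */
	skb->data_len = dlen;
	skb->len += dlen;
#endif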
6542 | |
6543 | /* carve out the first off bytes from skb when off < headlen */ |
static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
6545 | const int headlen, gfp_t gfp_mask) |
6546 | { |
6547 | int i; |
6548 | unsigned int size = skb_end_offset(skb); |
6549 | int new_hlen = headlen - off; |
6550 | u8 *data; |
6551 | |
6552 | if (skb_pfmemalloc(skb)) |
6553 | gfp_mask |= __GFP_MEMALLOC; |
6554 | |
	data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
6556 | if (!data) |
6557 | return -ENOMEM; |
6558 | size = SKB_WITH_OVERHEAD(size); |
6559 | |
6560 | /* Copy real data, and all frags */ |
	skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
6562 | skb->len -= off; |
6563 | |
6564 | memcpy((struct skb_shared_info *)(data + size), |
6565 | skb_shinfo(skb), |
6566 | offsetof(struct skb_shared_info, |
6567 | frags[skb_shinfo(skb)->nr_frags])); |
	if (skb_cloned(skb)) {
		/* drop the old head gracefully */
		if (skb_orphan_frags(skb, gfp_mask)) {
			skb_kfree_head(data, size);
			return -ENOMEM;
		}
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			skb_frag_ref(skb, i);
		if (skb_has_frag_list(skb))
			skb_clone_fraglist(skb);
		skb_release_data(skb, SKB_CONSUMED, false);
	} else {
		/* we can reuse the existing refcount - all we did was
		 * relocate values
		 */
		skb_free_head(skb, false);
	}

	skb->head = data;
	skb->data = data;
	skb->head_frag = 0;
	skb_set_end_offset(skb, size);
	skb_set_tail_pointer(skb, skb_headlen(skb));
	skb_headers_offset_update(skb, 0);
	skb->cloned = 0;
	skb->hdr_len = 0;
	skb->nohdr = 0;
	atomic_set(&skb_shinfo(skb)->dataref, 1);
6596 | |
6597 | return 0; |
6598 | } |
6599 | |
6600 | static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); |
6601 | |
6602 | /* carve out the first eat bytes from skb's frag_list. May recurse into |
6603 | * pskb_carve() |
6604 | */ |
6605 | static int pskb_carve_frag_list(struct sk_buff *skb, |
6606 | struct skb_shared_info *shinfo, int eat, |
6607 | gfp_t gfp_mask) |
6608 | { |
6609 | struct sk_buff *list = shinfo->frag_list; |
6610 | struct sk_buff *clone = NULL; |
6611 | struct sk_buff *insp = NULL; |
6612 | |
6613 | do { |
6614 | if (!list) { |
6615 | pr_err("Not enough bytes to eat. Want %d\n" , eat); |
6616 | return -EFAULT; |
6617 | } |
6618 | if (list->len <= eat) { |
6619 | /* Eaten as whole. */ |
6620 | eat -= list->len; |
6621 | list = list->next; |
6622 | insp = list; |
6623 | } else { |
6624 | /* Eaten partially. */ |
			if (skb_shared(list)) {
6626 | clone = skb_clone(list, gfp_mask); |
6627 | if (!clone) |
6628 | return -ENOMEM; |
6629 | insp = list->next; |
6630 | list = clone; |
6631 | } else { |
6632 | /* This may be pulled without problems. */ |
6633 | insp = list; |
6634 | } |
			if (pskb_carve(list, eat, gfp_mask) < 0) {
				kfree_skb(clone);
6637 | return -ENOMEM; |
6638 | } |
6639 | break; |
6640 | } |
6641 | } while (eat); |
6642 | |
6643 | /* Free pulled out fragments. */ |
6644 | while ((list = shinfo->frag_list) != insp) { |
6645 | shinfo->frag_list = list->next; |
6646 | consume_skb(list); |
6647 | } |
6648 | /* And insert new clone at head. */ |
6649 | if (clone) { |
6650 | clone->next = list; |
6651 | shinfo->frag_list = clone; |
6652 | } |
6653 | return 0; |
6654 | } |
6655 | |
/* carve off the first off bytes from skb. The split line (off) is in the
 * non-linear part of the skb
6658 | */ |
6659 | static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, |
6660 | int pos, gfp_t gfp_mask) |
6661 | { |
6662 | int i, k = 0; |
6663 | unsigned int size = skb_end_offset(skb); |
6664 | u8 *data; |
6665 | const int nfrags = skb_shinfo(skb)->nr_frags; |
6666 | struct skb_shared_info *shinfo; |
6667 | |
6668 | if (skb_pfmemalloc(skb)) |
6669 | gfp_mask |= __GFP_MEMALLOC; |
6670 | |
	data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
6672 | if (!data) |
6673 | return -ENOMEM; |
6674 | size = SKB_WITH_OVERHEAD(size); |
6675 | |
6676 | memcpy((struct skb_shared_info *)(data + size), |
6677 | skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); |
6678 | if (skb_orphan_frags(skb, gfp_mask)) { |
		skb_kfree_head(data, size);
6680 | return -ENOMEM; |
6681 | } |
6682 | shinfo = (struct skb_shared_info *)(data + size); |
6683 | for (i = 0; i < nfrags; i++) { |
		int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);
6685 | |
6686 | if (pos + fsize > off) { |
6687 | shinfo->frags[k] = skb_shinfo(skb)->frags[i]; |
6688 | |
			if (pos < off) {
				/* Split frag.
				 * We have two variants in this case:
				 * 1. Move the whole frag to the second
				 *    part, if that is possible. E.g. this
				 *    approach is mandatory for TUX,
				 *    where splitting is expensive.
				 * 2. Split the frag exactly at off. This is
				 *    what we do here.
				 */
				skb_frag_off_add(&shinfo->frags[0], off - pos);
				skb_frag_size_sub(&shinfo->frags[0], off - pos);
			}
			skb_frag_ref(skb, i);
6702 | k++; |
6703 | } |
6704 | pos += fsize; |
6705 | } |
6706 | shinfo->nr_frags = k; |
6707 | if (skb_has_frag_list(skb)) |
6708 | skb_clone_fraglist(skb); |
6709 | |
6710 | /* split line is in frag list */ |
	if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
6712 | /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */ |
6713 | if (skb_has_frag_list(skb)) |
6714 | kfree_skb_list(skb_shinfo(skb)->frag_list); |
		skb_kfree_head(data, size);
6716 | return -ENOMEM; |
6717 | } |
	skb_release_data(skb, SKB_CONSUMED, false);
6719 | |
6720 | skb->head = data; |
6721 | skb->head_frag = 0; |
6722 | skb->data = data; |
	skb_set_end_offset(skb, size);
6724 | skb_reset_tail_pointer(skb); |
6725 | skb_headers_offset_update(skb, 0); |
6726 | skb->cloned = 0; |
6727 | skb->hdr_len = 0; |
6728 | skb->nohdr = 0; |
6729 | skb->len -= off; |
6730 | skb->data_len = skb->len; |
	atomic_set(&skb_shinfo(skb)->dataref, 1);
6732 | return 0; |
6733 | } |
6734 | |
6735 | /* remove len bytes from the beginning of the skb */ |
6736 | static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) |
6737 | { |
6738 | int headlen = skb_headlen(skb); |
6739 | |
6740 | if (len < headlen) |
		return pskb_carve_inside_header(skb, len, headlen, gfp);
	else
		return pskb_carve_inside_nonlinear(skb, len, headlen, gfp);
6744 | } |
6745 | |
6746 | /* Extract to_copy bytes starting at off from skb, and return this in |
6747 | * a new skb |
6748 | */ |
struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
6750 | int to_copy, gfp_t gfp) |
6751 | { |
6752 | struct sk_buff *clone = skb_clone(skb, gfp); |
6753 | |
6754 | if (!clone) |
6755 | return NULL; |
6756 | |
	if (pskb_carve(clone, off, gfp) < 0 ||
	    pskb_trim(clone, to_copy)) {
		kfree_skb(clone);
6760 | return NULL; |
6761 | } |
6762 | return clone; |
6763 | } |
6764 | EXPORT_SYMBOL(pskb_extract); |
6765 | |
6766 | /** |
6767 | * skb_condense - try to get rid of fragments/frag_list if possible |
6768 | * @skb: buffer |
6769 | * |
6770 | * Can be used to save memory before skb is added to a busy queue. |
6771 | * If packet has bytes in frags and enough tail room in skb->head, |
6772 | * pull all of them, so that we can free the frags right now and adjust |
6773 | * truesize. |
6774 | * Notes: |
6775 | * We do not reallocate skb->head thus can not fail. |
6776 | * Caller must re-evaluate skb->truesize if needed. |
6777 | */ |
6778 | void skb_condense(struct sk_buff *skb) |
6779 | { |
6780 | if (skb->data_len) { |
6781 | if (skb->data_len > skb->end - skb->tail || |
6782 | skb_cloned(skb)) |
6783 | return; |
6784 | |
6785 | /* Nice, we can free page frag(s) right now */ |
6786 | __pskb_pull_tail(skb, skb->data_len); |
6787 | } |
6788 | /* At this point, skb->truesize might be over estimated, |
6789 | * because skb had a fragment, and fragments do not tell |
6790 | * their truesize. |
6791 | * When we pulled its content into skb->head, fragment |
6792 | * was freed, but __pskb_pull_tail() could not possibly |
6793 | * adjust skb->truesize, not knowing the frag truesize. |
6794 | */ |
6795 | skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); |
6796 | } |
6797 | EXPORT_SYMBOL(skb_condense); |
6798 | |
6799 | #ifdef CONFIG_SKB_EXTENSIONS |
6800 | static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) |
6801 | { |
6802 | return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE); |
6803 | } |
6804 | |
6805 | /** |
6806 | * __skb_ext_alloc - allocate a new skb extensions storage |
6807 | * |
6808 | * @flags: See kmalloc(). |
6809 | * |
6810 | * Returns the newly allocated pointer. The pointer can later be attached to a
6811 | * skb via __skb_ext_set().
6812 | * Note: caller must handle the skb_ext as opaque data.
6813 | */ |
6814 | struct skb_ext *__skb_ext_alloc(gfp_t flags) |
6815 | { |
6816 | struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);
6817 | |
6818 | if (new) { |
6819 | memset(new->offset, 0, sizeof(new->offset)); |
6820 | refcount_set(&new->refcnt, 1);
6821 | } |
6822 | |
6823 | return new; |
6824 | } |
6825 | |
6826 | static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old, |
6827 | unsigned int old_active) |
6828 | { |
6829 | struct skb_ext *new; |
6830 | |
6831 | if (refcount_read(&old->refcnt) == 1)
6832 | return old; |
6833 | |
6834 | new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
6835 | if (!new) |
6836 | return NULL; |
6837 | |
6838 | memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE); |
6839 | refcount_set(&new->refcnt, 1);
6840 | |
6841 | #ifdef CONFIG_XFRM |
6842 | if (old_active & (1 << SKB_EXT_SEC_PATH)) { |
6843 | struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
6844 | unsigned int i; |
6845 | |
6846 | for (i = 0; i < sp->len; i++) |
6847 | xfrm_state_hold(sp->xvec[i]);
6848 | } |
6849 | #endif |
6850 | #ifdef CONFIG_MCTP_FLOWS |
6851 | if (old_active & (1 << SKB_EXT_MCTP)) { |
6852 | struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP);
6853 | |
6854 | if (flow->key) |
6855 | refcount_inc(&flow->key->refs);
6856 | } |
6857 | #endif |
6858 | __skb_ext_put(old);
6859 | return new; |
6860 | } |
6861 | |
6862 | /** |
6863 | * __skb_ext_set - attach the specified extension storage to this skb |
6864 | * @skb: buffer |
6865 | * @id: extension id |
6866 | * @ext: extension storage previously allocated via __skb_ext_alloc() |
6867 | * |
6868 | * Existing extensions, if any, are cleared. |
6869 | * |
6870 | * Returns the pointer to the extension. |
6871 | */ |
6872 | void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, |
6873 | struct skb_ext *ext) |
6874 | { |
6875 | unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext); |
6876 | |
6877 | skb_ext_put(skb); |
6878 | newlen = newoff + skb_ext_type_len[id]; |
6879 | ext->chunks = newlen; |
6880 | ext->offset[id] = newoff; |
6881 | skb->extensions = ext; |
6882 | skb->active_extensions = 1 << id; |
6883 | return skb_ext_get_ptr(ext, id); |
6884 | } |
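/*
 * Illustrative sketch (hypothetical, not part of the kernel build) of the
 * two-step pattern the helpers above enable: allocate the opaque storage
 * with __skb_ext_alloc(), then attach it with __skb_ext_set() once the skb
 * is known.  The wrapper name is invented for the example; note that the
 * returned area is not zeroed, so the caller must initialise it.
 */
#if 0
static void *example_attach_prealloc_ext(struct sk_buff *skb,
					 enum skb_ext_id id)
{
	struct skb_ext *ext = __skb_ext_alloc(GFP_ATOMIC);

	if (!ext)
		return NULL;

	/* Any extensions already attached to @skb are dropped here. */
	return __skb_ext_set(skb, id, ext);
}
#endif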
6885 | |
6886 | /** |
6887 | * skb_ext_add - allocate space for given extension, COW if needed |
6888 | * @skb: buffer |
6889 | * @id: extension to allocate space for |
6890 | * |
6891 | * Allocates enough space for the given extension. |
6892 | * If the extension is already present, a pointer to that extension |
6893 | * is returned. |
6894 | * |
6895 | * If the skb was cloned, COW applies and the returned memory can be |
6896 | * modified without changing the extension space of cloned buffers.
6897 | * |
6898 | * Returns pointer to the extension or NULL on allocation failure. |
6899 | */ |
6900 | void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) |
6901 | { |
6902 | struct skb_ext *new, *old = NULL; |
6903 | unsigned int newlen, newoff; |
6904 | |
6905 | if (skb->active_extensions) { |
6906 | old = skb->extensions; |
6907 | |
6908 | new = skb_ext_maybe_cow(old, skb->active_extensions);
6909 | if (!new) |
6910 | return NULL; |
6911 | |
6912 | if (__skb_ext_exist(new, id))
6913 | goto set_active; |
6914 | |
6915 | newoff = new->chunks; |
6916 | } else { |
6917 | newoff = SKB_EXT_CHUNKSIZEOF(*new); |
6918 | |
6919 | new = __skb_ext_alloc(GFP_ATOMIC); |
6920 | if (!new) |
6921 | return NULL; |
6922 | } |
6923 | |
6924 | newlen = newoff + skb_ext_type_len[id]; |
6925 | new->chunks = newlen; |
6926 | new->offset[id] = newoff; |
6927 | set_active: |
6928 | skb->slow_gro = 1; |
6929 | skb->extensions = new; |
6930 | skb->active_extensions |= 1 << id; |
6931 | return skb_ext_get_ptr(new, id);
6932 | } |
6933 | EXPORT_SYMBOL(skb_ext_add); |
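/*
 * Illustrative sketch (hypothetical, not part of the kernel build): the
 * usual shape of an skb_ext_add() caller, here requesting the MCTP flow
 * extension.  skb_ext_add() does not zero the returned area, so the caller
 * initialises it; the wrapper below is invented for the example and would
 * normally sit behind CONFIG_MCTP_FLOWS.
 */
#if 0
static struct mctp_flow *example_mctp_flow_add(struct sk_buff *skb)
{
	struct mctp_flow *flow = skb_ext_add(skb, SKB_EXT_MCTP);

	if (!flow)
		return NULL;	/* allocation failure, skb left unchanged */

	flow->key = NULL;	/* extension storage is not zeroed for us */
	return flow;
}
#endif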
6934 | |
6935 | #ifdef CONFIG_XFRM |
6936 | static void skb_ext_put_sp(struct sec_path *sp) |
6937 | { |
6938 | unsigned int i; |
6939 | |
6940 | for (i = 0; i < sp->len; i++) |
6941 | xfrm_state_put(sp->xvec[i]);
6942 | } |
6943 | #endif |
6944 | |
6945 | #ifdef CONFIG_MCTP_FLOWS |
6946 | static void skb_ext_put_mctp(struct mctp_flow *flow) |
6947 | { |
6948 | if (flow->key) |
6949 | mctp_key_unref(flow->key);
6950 | } |
6951 | #endif |
6952 | |
6953 | void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) |
6954 | { |
6955 | struct skb_ext *ext = skb->extensions; |
6956 | |
6957 | skb->active_extensions &= ~(1 << id); |
6958 | if (skb->active_extensions == 0) { |
6959 | skb->extensions = NULL; |
6960 | __skb_ext_put(ext); |
6961 | #ifdef CONFIG_XFRM |
6962 | } else if (id == SKB_EXT_SEC_PATH && |
6963 | refcount_read(&ext->refcnt) == 1) {
6964 | struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);
6965 | |
6966 | skb_ext_put_sp(sp); |
6967 | sp->len = 0; |
6968 | #endif |
6969 | } |
6970 | } |
6971 | EXPORT_SYMBOL(__skb_ext_del); |
6972 | |
6973 | void __skb_ext_put(struct skb_ext *ext) |
6974 | { |
6975 | /* If this is last clone, nothing can increment |
6976 | * it after check passes. Avoids one atomic op. |
6977 | */ |
6978 | if (refcount_read(&ext->refcnt) == 1)
6979 | goto free_now; |
6980 | |
6981 | if (!refcount_dec_and_test(&ext->refcnt))
6982 | return; |
6983 | free_now: |
6984 | #ifdef CONFIG_XFRM |
6985 | if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
6986 | skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
6987 | #endif |
6988 | #ifdef CONFIG_MCTP_FLOWS |
6989 | if (__skb_ext_exist(ext, SKB_EXT_MCTP))
6990 | skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP));
6991 | #endif |
6992 | |
6993 | kmem_cache_free(skbuff_ext_cache, ext);
6994 | } |
6995 | EXPORT_SYMBOL(__skb_ext_put); |
6996 | #endif /* CONFIG_SKB_EXTENSIONS */ |
6997 | |
6998 | /** |
6999 | * skb_attempt_defer_free - queue skb for remote freeing |
7000 | * @skb: buffer |
7001 | * |
7002 | * Put @skb in a per-cpu list, using the cpu which |
7003 | * allocated the skb/pages to reduce false sharing |
7004 | * and memory zone spinlock contention. |
7005 | */ |
7006 | void skb_attempt_defer_free(struct sk_buff *skb) |
7007 | { |
7008 | int cpu = skb->alloc_cpu; |
7009 | struct softnet_data *sd; |
7010 | unsigned int defer_max; |
7011 | bool kick; |
7012 | |
7013 | if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || |
7014 | !cpu_online(cpu) || |
7015 | cpu == raw_smp_processor_id()) { |
7016 | nodefer: __kfree_skb(skb); |
7017 | return; |
7018 | } |
7019 | |
7020 | DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); |
7021 | DEBUG_NET_WARN_ON_ONCE(skb->destructor); |
7022 | |
7023 | sd = &per_cpu(softnet_data, cpu); |
7024 | defer_max = READ_ONCE(sysctl_skb_defer_max); |
7025 | if (READ_ONCE(sd->defer_count) >= defer_max) |
7026 | goto nodefer; |
7027 | |
7028 | spin_lock_bh(&sd->defer_lock);
7029 | /* Send an IPI every time queue reaches half capacity. */ |
7030 | kick = sd->defer_count == (defer_max >> 1); |
7031 | /* Paired with the READ_ONCE() a few lines above */
7032 | WRITE_ONCE(sd->defer_count, sd->defer_count + 1); |
7033 | |
7034 | skb->next = sd->defer_list; |
7035 | /* Paired with READ_ONCE() in skb_defer_free_flush() */ |
7036 | WRITE_ONCE(sd->defer_list, skb); |
7037 | spin_unlock_bh(&sd->defer_lock);
7038 | |
7039 | /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU |
7040 | * if we are unlucky enough (this seems very unlikely). |
7041 | */ |
7042 | if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) |
7043 | smp_call_function_single_async(cpu, &sd->defer_csd);
7044 | } |
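/*
 * Illustrative sketch (hypothetical, simplified): the consumer side that
 * pairs with skb_attempt_defer_free().  The real drain routine,
 * skb_defer_free_flush() in net/core/dev.c, runs from softirq on the
 * remote CPU and may differ in detail from this outline.
 */
#if 0
static void example_defer_free_flush(struct softnet_data *sd)
{
	struct sk_buff *skb, *next;

	/* Paired with the WRITE_ONCE() in skb_attempt_defer_free() */
	if (!READ_ONCE(sd->defer_list))
		return;

	spin_lock(&sd->defer_lock);
	skb = sd->defer_list;
	sd->defer_list = NULL;
	sd->defer_count = 0;
	spin_unlock(&sd->defer_lock);

	while (skb) {
		next = skb->next;
		napi_consume_skb(skb, 1);
		skb = next;
	}
}
#endif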
7045 | |
7046 | static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, |
7047 | size_t offset, size_t len) |
7048 | { |
7049 | const char *kaddr; |
7050 | __wsum csum; |
7051 | |
7052 | kaddr = kmap_local_page(page); |
7053 | csum = csum_partial(kaddr + offset, len, 0);
7054 | kunmap_local(kaddr); |
7055 | skb->csum = csum_block_add(skb->csum, csum, skb->len);
7056 | } |
7057 | |
7058 | /** |
7059 | * skb_splice_from_iter - Splice (or copy) pages to skbuff |
7060 | * @skb: The buffer to add pages to |
7061 | * @iter: Iterator representing the pages to be added |
7062 | * @maxsize: Maximum amount of data to be added
7063 | * @gfp: Allocation flags |
7064 | * |
7065 | * This is a common helper function for supporting MSG_SPLICE_PAGES. It |
7066 | * extracts pages from an iterator and adds them to the socket buffer if |
7067 | * possible, copying them to fragments if not possible (such as if they're slab |
7068 | * pages). |
7069 | * |
7070 | * Returns the amount of data spliced/copied or -EMSGSIZE if there's |
7071 | * insufficient space in the buffer to transfer anything. |
7072 | */ |
7073 | ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, |
7074 | ssize_t maxsize, gfp_t gfp) |
7075 | { |
7076 | size_t frag_limit = READ_ONCE(sysctl_max_skb_frags); |
7077 | struct page *pages[8], **ppages = pages; |
7078 | ssize_t spliced = 0, ret = 0; |
7079 | unsigned int i; |
7080 | |
7081 | while (iter->count > 0) { |
7082 | ssize_t space, nr, len; |
7083 | size_t off; |
7084 | |
7085 | ret = -EMSGSIZE; |
7086 | space = frag_limit - skb_shinfo(skb)->nr_frags; |
7087 | if (space < 0) |
7088 | break; |
7089 | |
7090 | /* We might be able to coalesce without increasing nr_frags */ |
7091 | nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages)); |
7092 | |
7093 | len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);
7094 | if (len <= 0) { |
7095 | ret = len ?: -EIO; |
7096 | break; |
7097 | } |
7098 | |
7099 | i = 0; |
7100 | do { |
7101 | struct page *page = pages[i++]; |
7102 | size_t part = min_t(size_t, PAGE_SIZE - off, len); |
7103 | |
7104 | ret = -EIO; |
7105 | if (WARN_ON_ONCE(!sendpage_ok(page))) |
7106 | goto out; |
7107 | |
7108 | ret = skb_append_pagefrags(skb, page, off, part, |
7109 | frag_limit); |
7110 | if (ret < 0) { |
7111 | iov_iter_revert(iter, len);
7112 | goto out; |
7113 | } |
7114 | |
7115 | if (skb->ip_summed == CHECKSUM_NONE) |
7116 | skb_splice_csum_page(skb, page, off, part);
7117 | |
7118 | off = 0; |
7119 | spliced += part; |
7120 | maxsize -= part; |
7121 | len -= part; |
7122 | } while (len > 0); |
7123 | |
7124 | if (maxsize <= 0) |
7125 | break; |
7126 | } |
7127 | |
7128 | out: |
7129 | skb_len_add(skb, spliced);
7130 | return spliced ?: ret; |
7131 | } |
7132 | EXPORT_SYMBOL(skb_splice_from_iter); |
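/*
 * Illustrative sketch (hypothetical, not part of the kernel build): the
 * shape of an MSG_SPLICE_PAGES branch in a protocol sendmsg()
 * implementation.  Error handling and queue accounting are reduced to
 * comments; names are examples only.
 */
#if 0
static ssize_t example_sendmsg_splice(struct sock *sk, struct sk_buff *skb,
				      struct msghdr *msg, size_t space)
{
	ssize_t n;

	if (!(msg->msg_flags & MSG_SPLICE_PAGES))
		return -EINVAL;

	n = skb_splice_from_iter(skb, &msg->msg_iter, space,
				 sk->sk_allocation);
	if (n < 0)
		return n;	/* e.g. -EMSGSIZE: no room for a single byte */

	/* A real caller would also update its send-queue accounting here. */
	return n;
}
#endif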
7133 | |
7134 | static __always_inline |
7135 | size_t memcpy_from_iter_csum(void *iter_from, size_t progress, |
7136 | size_t len, void *to, void *priv2) |
7137 | { |
7138 | __wsum *csum = priv2; |
7139 | __wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len);
7140 | |
7141 | *csum = csum_block_add(*csum, next, progress);
7142 | return 0; |
7143 | } |
7144 | |
7145 | static __always_inline |
7146 | size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress, |
7147 | size_t len, void *to, void *priv2) |
7148 | { |
7149 | __wsum next, *csum = priv2; |
7150 | |
7151 | next = csum_and_copy_from_user(iter_from, to + progress, len);
7152 | *csum = csum_block_add(*csum, next, progress);
7153 | return next ? 0 : len; |
7154 | } |
7155 | |
7156 | bool csum_and_copy_from_iter_full(void *addr, size_t bytes, |
7157 | __wsum *csum, struct iov_iter *i) |
7158 | { |
7159 | size_t copied; |
7160 | |
7161 | if (WARN_ON_ONCE(!i->data_source)) |
7162 | return false; |
7163 | copied = iterate_and_advance2(i, bytes, addr, csum,
7164 | copy_from_user_iter_csum,
7165 | memcpy_from_iter_csum);
7166 | if (likely(copied == bytes)) |
7167 | return true; |
7168 | iov_iter_revert(i, copied);
7169 | return false; |
7170 | } |
7171 | EXPORT_SYMBOL(csum_and_copy_from_iter_full); |
7172 | |