1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * NET3 Protocol independent device support routines.
4 *
5 * Derived from the non IP parts of dev.c 1.0.19
6 * Authors: Ross Biro
7 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
8 * Mark Evans, <evansmp@uhura.aston.ac.uk>
9 *
10 * Additional Authors:
11 * Florian la Roche <rzsfl@rz.uni-sb.de>
12 * Alan Cox <gw4pts@gw4pts.ampr.org>
13 * David Hinds <dahinds@users.sourceforge.net>
14 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
15 * Adam Sulmicki <adam@cfar.umd.edu>
16 * Pekka Riikonen <priikone@poesidon.pspt.fi>
17 *
18 * Changes:
19 * D.J. Barrow : Fixed bug where dev->refcnt gets set
20 * to 2 if register_netdev gets called
21 * before net_dev_init & also removed a
22 * few lines of code in the process.
23 * Alan Cox : device private ioctl copies fields back.
24 * Alan Cox : Transmit queue code does relevant
25 * stunts to keep the queue safe.
26 * Alan Cox : Fixed double lock.
27 * Alan Cox : Fixed promisc NULL pointer trap
28 * ???????? : Support the full private ioctl range
29 * Alan Cox : Moved ioctl permission check into
30 * drivers
31 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
32 * Alan Cox : 100 backlog just doesn't cut it when
33 * you start doing multicast video 8)
34 * Alan Cox : Rewrote net_bh and list manager.
35 * Alan Cox : Fix ETH_P_ALL echoback lengths.
36 * Alan Cox : Took out transmit every packet pass
37 * Saved a few bytes in the ioctl handler
38 * Alan Cox : Network driver sets packet type before
39 * calling netif_rx. Saves a function
40 * call a packet.
41 * Alan Cox : Hashed net_bh()
42 * Richard Kooijman: Timestamp fixes.
43 * Alan Cox : Wrong field in SIOCGIFDSTADDR
44 * Alan Cox : Device lock protection.
45 * Alan Cox : Fixed nasty side effect of device close
46 * changes.
47 * Rudi Cilibrasi : Pass the right thing to
48 * set_mac_address()
49 * Dave Miller : 32bit quantity for the device lock to
50 * make it work out on a Sparc.
51 * Bjorn Ekwall : Added KERNELD hack.
52 * Alan Cox : Cleaned up the backlog initialise.
53 * Craig Metz : SIOCGIFCONF fix if space for under
54 * 1 device.
55 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
56 * is no device open function.
57 * Andi Kleen : Fix error reporting for SIOCGIFCONF
58 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
59 * Cyrus Durgin : Cleaned for KMOD
60 * Adam Sulmicki : Bug Fix : Network Device Unload
61 * A network device unload needs to purge
62 * the backlog queue.
63 * Paul Rusty Russell : SIOCSIFNAME
64 * Pekka Riikonen : Netdev boot-time settings code
65 * Andrew Morton : Make unregister_netdevice wait
66 * indefinitely on dev->refcnt
67 * J Hadi Salim : - Backlog queue sampling
68 * - netif_rx() feedback
69 */
70
71#include <linux/uaccess.h>
72#include <linux/bitmap.h>
73#include <linux/capability.h>
74#include <linux/cpu.h>
75#include <linux/types.h>
76#include <linux/kernel.h>
77#include <linux/hash.h>
78#include <linux/slab.h>
79#include <linux/sched.h>
80#include <linux/sched/mm.h>
81#include <linux/mutex.h>
82#include <linux/rwsem.h>
83#include <linux/string.h>
84#include <linux/mm.h>
85#include <linux/socket.h>
86#include <linux/sockios.h>
87#include <linux/errno.h>
88#include <linux/interrupt.h>
89#include <linux/if_ether.h>
90#include <linux/netdevice.h>
91#include <linux/etherdevice.h>
92#include <linux/ethtool.h>
93#include <linux/skbuff.h>
94#include <linux/kthread.h>
95#include <linux/bpf.h>
96#include <linux/bpf_trace.h>
97#include <net/net_namespace.h>
98#include <net/sock.h>
99#include <net/busy_poll.h>
100#include <linux/rtnetlink.h>
101#include <linux/stat.h>
102#include <net/dsa.h>
103#include <net/dst.h>
104#include <net/dst_metadata.h>
105#include <net/gro.h>
106#include <net/pkt_sched.h>
107#include <net/pkt_cls.h>
108#include <net/checksum.h>
109#include <net/xfrm.h>
110#include <net/tcx.h>
111#include <linux/highmem.h>
112#include <linux/init.h>
113#include <linux/module.h>
114#include <linux/netpoll.h>
115#include <linux/rcupdate.h>
116#include <linux/delay.h>
117#include <net/iw_handler.h>
118#include <asm/current.h>
119#include <linux/audit.h>
120#include <linux/dmaengine.h>
121#include <linux/err.h>
122#include <linux/ctype.h>
123#include <linux/if_arp.h>
124#include <linux/if_vlan.h>
125#include <linux/ip.h>
126#include <net/ip.h>
127#include <net/mpls.h>
128#include <linux/ipv6.h>
129#include <linux/in.h>
130#include <linux/jhash.h>
131#include <linux/random.h>
132#include <trace/events/napi.h>
133#include <trace/events/net.h>
134#include <trace/events/skb.h>
135#include <trace/events/qdisc.h>
136#include <trace/events/xdp.h>
137#include <linux/inetdevice.h>
138#include <linux/cpu_rmap.h>
139#include <linux/static_key.h>
140#include <linux/hashtable.h>
141#include <linux/vmalloc.h>
142#include <linux/if_macvlan.h>
143#include <linux/errqueue.h>
144#include <linux/hrtimer.h>
145#include <linux/netfilter_netdev.h>
146#include <linux/crash_dump.h>
147#include <linux/sctp.h>
148#include <net/udp_tunnel.h>
149#include <linux/net_namespace.h>
150#include <linux/indirect_call_wrapper.h>
151#include <net/devlink.h>
152#include <linux/pm_runtime.h>
153#include <linux/prandom.h>
154#include <linux/once_lite.h>
155#include <net/netdev_rx_queue.h>
156#include <net/page_pool/types.h>
157#include <net/page_pool/helpers.h>
158#include <net/rps.h>
159
160#include "dev.h"
161#include "net-sysfs.h"
162
163static DEFINE_SPINLOCK(ptype_lock);
164struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
165
166static int netif_rx_internal(struct sk_buff *skb);
167static int call_netdevice_notifiers_extack(unsigned long val,
168 struct net_device *dev,
169 struct netlink_ext_ack *extack);
170
171static DEFINE_MUTEX(ifalias_mutex);
172
173/* protects napi_hash addition/deletion and napi_gen_id */
174static DEFINE_SPINLOCK(napi_hash_lock);
175
176static unsigned int napi_gen_id = NR_CPUS;
177static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
178
179static DECLARE_RWSEM(devnet_rename_sem);
180
181static inline void dev_base_seq_inc(struct net *net)
182{
183 unsigned int val = net->dev_base_seq + 1;
184
185 WRITE_ONCE(net->dev_base_seq, val ?: 1);
186}
187
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}
199
200static inline void rps_lock_irqsave(struct softnet_data *sd,
201 unsigned long *flags)
202{
203 if (IS_ENABLED(CONFIG_RPS))
204 spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
205 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
206 local_irq_save(*flags);
207}
208
209static inline void rps_lock_irq_disable(struct softnet_data *sd)
210{
211 if (IS_ENABLED(CONFIG_RPS))
212 spin_lock_irq(lock: &sd->input_pkt_queue.lock);
213 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
214 local_irq_disable();
215}
216
217static inline void rps_unlock_irq_restore(struct softnet_data *sd,
218 unsigned long *flags)
219{
220 if (IS_ENABLED(CONFIG_RPS))
221 spin_unlock_irqrestore(lock: &sd->input_pkt_queue.lock, flags: *flags);
222 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
223 local_irq_restore(*flags);
224}
225
226static inline void rps_unlock_irq_enable(struct softnet_data *sd)
227{
228 if (IS_ENABLED(CONFIG_RPS))
229 spin_unlock_irq(lock: &sd->input_pkt_queue.lock);
230 else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
231 local_irq_enable();
232}
233
234static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
235 const char *name)
236{
237 struct netdev_name_node *name_node;
238
239 name_node = kmalloc(size: sizeof(*name_node), GFP_KERNEL);
240 if (!name_node)
241 return NULL;
242 INIT_HLIST_NODE(h: &name_node->hlist);
243 name_node->dev = dev;
244 name_node->name = name;
245 return name_node;
246}
247
248static struct netdev_name_node *
249netdev_name_node_head_alloc(struct net_device *dev)
250{
251 struct netdev_name_node *name_node;
252
253 name_node = netdev_name_node_alloc(dev, name: dev->name);
254 if (!name_node)
255 return NULL;
256 INIT_LIST_HEAD(list: &name_node->list);
257 return name_node;
258}
259
260static void netdev_name_node_free(struct netdev_name_node *name_node)
261{
262 kfree(objp: name_node);
263}
264
265static void netdev_name_node_add(struct net *net,
266 struct netdev_name_node *name_node)
267{
268 hlist_add_head_rcu(n: &name_node->hlist,
269 h: dev_name_hash(net, name: name_node->name));
270}
271
272static void netdev_name_node_del(struct netdev_name_node *name_node)
273{
274 hlist_del_rcu(n: &name_node->hlist);
275}
276
277static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
278 const char *name)
279{
280 struct hlist_head *head = dev_name_hash(net, name);
281 struct netdev_name_node *name_node;
282
283 hlist_for_each_entry(name_node, head, hlist)
284 if (!strcmp(name_node->name, name))
285 return name_node;
286 return NULL;
287}
288
289static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
290 const char *name)
291{
292 struct hlist_head *head = dev_name_hash(net, name);
293 struct netdev_name_node *name_node;
294
295 hlist_for_each_entry_rcu(name_node, head, hlist)
296 if (!strcmp(name_node->name, name))
297 return name_node;
298 return NULL;
299}
300
301bool netdev_name_in_use(struct net *net, const char *name)
302{
303 return netdev_name_node_lookup(net, name);
304}
305EXPORT_SYMBOL(netdev_name_in_use);
306
307int netdev_name_node_alt_create(struct net_device *dev, const char *name)
308{
309 struct netdev_name_node *name_node;
310 struct net *net = dev_net(dev);
311
312 name_node = netdev_name_node_lookup(net, name);
313 if (name_node)
314 return -EEXIST;
315 name_node = netdev_name_node_alloc(dev, name);
316 if (!name_node)
317 return -ENOMEM;
318 netdev_name_node_add(net, name_node);
319 /* The node that holds dev->name acts as a head of per-device list. */
320 list_add_tail_rcu(new: &name_node->list, head: &dev->name_node->list);
321
322 return 0;
323}
324
325static void netdev_name_node_alt_free(struct rcu_head *head)
326{
327 struct netdev_name_node *name_node =
328 container_of(head, struct netdev_name_node, rcu);
329
330 kfree(objp: name_node->name);
331 netdev_name_node_free(name_node);
332}
333
334static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
335{
336 netdev_name_node_del(name_node);
337 list_del(entry: &name_node->list);
338 call_rcu(head: &name_node->rcu, func: netdev_name_node_alt_free);
339}
340
341int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
342{
343 struct netdev_name_node *name_node;
344 struct net *net = dev_net(dev);
345
346 name_node = netdev_name_node_lookup(net, name);
347 if (!name_node)
348 return -ENOENT;
349 /* lookup might have found our primary name or a name belonging
350 * to another device.
351 */
352 if (name_node == dev->name_node || name_node->dev != dev)
353 return -EINVAL;
354
355 __netdev_name_node_alt_destroy(name_node);
356 return 0;
357}
358
359static void netdev_name_node_alt_flush(struct net_device *dev)
360{
361 struct netdev_name_node *name_node, *tmp;
362
363 list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
364 list_del(entry: &name_node->list);
365 netdev_name_node_alt_free(head: &name_node->rcu);
366 }
367}
368
369/* Device list insertion */
370static void list_netdevice(struct net_device *dev)
371{
372 struct netdev_name_node *name_node;
373 struct net *net = dev_net(dev);
374
375 ASSERT_RTNL();
376
377 list_add_tail_rcu(new: &dev->dev_list, head: &net->dev_base_head);
378 netdev_name_node_add(net, name_node: dev->name_node);
379 hlist_add_head_rcu(n: &dev->index_hlist,
380 h: dev_index_hash(net, ifindex: dev->ifindex));
381
382 netdev_for_each_altname(dev, name_node)
383 netdev_name_node_add(net, name_node);
384
385 /* We reserved the ifindex, this can't fail */
386 WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
387
388 dev_base_seq_inc(net);
389}
390
391/* Device list removal
392 * caller must respect a RCU grace period before freeing/reusing dev
393 */
394static void unlist_netdevice(struct net_device *dev)
395{
396 struct netdev_name_node *name_node;
397 struct net *net = dev_net(dev);
398
399 ASSERT_RTNL();
400
401 xa_erase(&net->dev_by_index, index: dev->ifindex);
402
403 netdev_for_each_altname(dev, name_node)
404 netdev_name_node_del(name_node);
405
406 /* Unlink dev from the device chain */
407 list_del_rcu(entry: &dev->dev_list);
408 netdev_name_node_del(name_node: dev->name_node);
409 hlist_del_rcu(n: &dev->index_hlist);
410
411 dev_base_seq_inc(net: dev_net(dev));
412}
413
414/*
415 * Our notifier list
416 */
417
418static RAW_NOTIFIER_HEAD(netdev_chain);
419
420/*
421 * Device drivers call our routines to queue packets here. We empty the
422 * queue in the local softnet handler.
423 */
424
425DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
426EXPORT_PER_CPU_SYMBOL(softnet_data);
427
428/* Page_pool has a lockless array/stack to alloc/recycle pages.
429 * PP consumers must pay attention to run APIs in the appropriate context
430 * (e.g. NAPI context).
431 */
432static DEFINE_PER_CPU(struct page_pool *, system_page_pool);
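
/* Illustrative sketch, not part of the original file: typical page_pool
 * usage from a driver's NAPI poll path, matching the context rule noted
 * above. The function name is made up and "pool" would normally live in
 * the driver's RX queue state.
 */
static void __maybe_unused example_page_pool_cycle(struct page_pool *pool)
{
	struct page *page;

	page = page_pool_dev_alloc_pages(pool);
	if (!page)
		return;

	/* allow_direct=true is only safe from the pool's NAPI context. */
	page_pool_put_full_page(pool, page, true);
}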
433
434#ifdef CONFIG_LOCKDEP
435/*
436 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
437 * according to dev->type
438 */
439static const unsigned short netdev_lock_type[] = {
440 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
441 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
442 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
443 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
444 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
445 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
446 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
447 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
448 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
449 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
450 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
451 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
452 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
453 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
454 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
455
456static const char *const netdev_lock_name[] = {
457 "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
458 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
459 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
460 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
461 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
462 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
463 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
464 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
465 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
466 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
467 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
468 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
469 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
470 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
471 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
472
473static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
474static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
475
476static inline unsigned short netdev_lock_pos(unsigned short dev_type)
477{
478 int i;
479
480 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
481 if (netdev_lock_type[i] == dev_type)
482 return i;
483 /* the last key is used by default */
484 return ARRAY_SIZE(netdev_lock_type) - 1;
485}
486
487static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
488 unsigned short dev_type)
489{
490 int i;
491
492 i = netdev_lock_pos(dev_type);
493 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
494 netdev_lock_name[i]);
495}
496
497static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
498{
499 int i;
500
501 i = netdev_lock_pos(dev_type: dev->type);
502 lockdep_set_class_and_name(&dev->addr_list_lock,
503 &netdev_addr_lock_key[i],
504 netdev_lock_name[i]);
505}
506#else
507static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
508 unsigned short dev_type)
509{
510}
511
512static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
513{
514}
515#endif
516
517/*******************************************************************************
518 *
519 * Protocol management and registration routines
520 *
521 *******************************************************************************/
522
523
524/*
525 * Add a protocol ID to the list. Now that the input handler is
526 * smarter we can dispense with all the messy stuff that used to be
527 * here.
528 *
529 * BEWARE!!! Protocol handlers, mangling input packets,
530 * MUST BE last in hash buckets and checking protocol handlers
531 * MUST start from promiscuous ptype_all chain in net_bh.
532 * It is true now, do not change it.
533 * Explanation follows: if protocol handler, mangling packet, will
534 * be the first on list, it is not able to sense, that packet
535 * is cloned and should be copied-on-write, so that it will
536 * change it and subsequent readers will get broken packet.
537 * --ANK (980803)
538 */
539
540static inline struct list_head *ptype_head(const struct packet_type *pt)
541{
542 if (pt->type == htons(ETH_P_ALL))
543 return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
544 else
545 return pt->dev ? &pt->dev->ptype_specific :
546 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
547}
548
/**
 * dev_add_pack - add packet handler
 * @pt: packet type declaration
 *
 * Add a protocol handler to the networking stack. The passed &packet_type
 * is linked into kernel lists and may not be freed until it has been
 * removed from the kernel lists.
 *
 * This call does not sleep, therefore it cannot guarantee that all
 * CPUs that are in the middle of receiving packets will see the new
 * packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
571
/**
 * __dev_remove_pack - remove packet handler
 * @pt: packet type declaration
 *
 * Remove a protocol handler that was previously added to the kernel
 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
 * from the kernel lists and can be freed or reused once this function
 * returns.
 *
 * The packet type might still be in use by receivers
 * and must not be freed until after all the CPUs have gone
 * through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);
604
/**
 * dev_remove_pack - remove packet handler
 * @pt: packet type declaration
 *
 * Remove a protocol handler that was previously added to the kernel
 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
 * from the kernel lists and can be freed or reused once this function
 * returns.
 *
 * This call sleeps to guarantee that no CPU is looking at the packet
 * type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
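
/* Illustrative sketch, not part of the original file: a minimal ETH_P_ALL
 * tap built on dev_add_pack()/dev_remove_pack(). example_tap_rcv() and
 * example_tap are made-up names; a real handler would normally inspect the
 * skb before dropping it.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* The handler must release the skb reference it was given. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap __maybe_unused = {
	.type = cpu_to_be16(ETH_P_ALL),
	.func = example_tap_rcv,
};

/* Pairing, e.g. from module init/exit:
 *
 *	dev_add_pack(&example_tap);
 *	...
 *	dev_remove_pack(&example_tap);
 */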
624
625
626/*******************************************************************************
627 *
628 * Device Interface Subroutines
629 *
630 *******************************************************************************/
631
632/**
 * dev_get_iflink - get 'iflink' value of an interface
634 * @dev: targeted interface
635 *
636 * Indicates the ifindex the interface is linked to.
637 * Physical interfaces have the same 'ifindex' and 'iflink' values.
638 */
639
640int dev_get_iflink(const struct net_device *dev)
641{
642 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
643 return dev->netdev_ops->ndo_get_iflink(dev);
644
645 return READ_ONCE(dev->ifindex);
646}
647EXPORT_SYMBOL(dev_get_iflink);
648
649/**
650 * dev_fill_metadata_dst - Retrieve tunnel egress information.
651 * @dev: targeted interface
652 * @skb: The packet.
653 *
 * For better visibility of tunnel traffic, OVS needs to retrieve
 * egress tunnel information for a packet. The following API allows
 * the caller to get this info.
657 */
658int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
659{
660 struct ip_tunnel_info *info;
661
662 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
663 return -EINVAL;
664
665 info = skb_tunnel_info_unclone(skb);
666 if (!info)
667 return -ENOMEM;
668 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
669 return -EINVAL;
670
671 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
672}
673EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
674
675static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
676{
677 int k = stack->num_paths++;
678
679 if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
680 return NULL;
681
682 return &stack->path[k];
683}
684
685int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
686 struct net_device_path_stack *stack)
687{
688 const struct net_device *last_dev;
689 struct net_device_path_ctx ctx = {
690 .dev = dev,
691 };
692 struct net_device_path *path;
693 int ret = 0;
694
695 memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
696 stack->num_paths = 0;
697 while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
698 last_dev = ctx.dev;
699 path = dev_fwd_path(stack);
700 if (!path)
701 return -1;
702
703 memset(path, 0, sizeof(struct net_device_path));
704 ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
705 if (ret < 0)
706 return -1;
707
708 if (WARN_ON_ONCE(last_dev == ctx.dev))
709 return -1;
710 }
711
712 if (!ctx.dev)
713 return ret;
714
715 path = dev_fwd_path(stack);
716 if (!path)
717 return -1;
718 path->type = DEV_PATH_ETHERNET;
719 path->dev = ctx.dev;
720
721 return ret;
722}
723EXPORT_SYMBOL_GPL(dev_fill_forward_path);
724
725/**
726 * __dev_get_by_name - find a device by its name
727 * @net: the applicable net namespace
728 * @name: name to find
729 *
 * Find an interface by name. Must be called under the RTNL semaphore.
731 * If the name is found a pointer to the device is returned.
732 * If the name is not found then %NULL is returned. The
733 * reference counters are not incremented so the caller must be
734 * careful with locks.
735 */
736
737struct net_device *__dev_get_by_name(struct net *net, const char *name)
738{
739 struct netdev_name_node *node_name;
740
741 node_name = netdev_name_node_lookup(net, name);
742 return node_name ? node_name->dev : NULL;
743}
744EXPORT_SYMBOL(__dev_get_by_name);
745
746/**
747 * dev_get_by_name_rcu - find a device by its name
748 * @net: the applicable net namespace
749 * @name: name to find
750 *
751 * Find an interface by name.
752 * If the name is found a pointer to the device is returned.
753 * If the name is not found then %NULL is returned.
754 * The reference counters are not incremented so the caller must be
755 * careful with locks. The caller must hold RCU lock.
756 */
757
758struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
759{
760 struct netdev_name_node *node_name;
761
762 node_name = netdev_name_node_lookup_rcu(net, name);
763 return node_name ? node_name->dev : NULL;
764}
765EXPORT_SYMBOL(dev_get_by_name_rcu);
766
767/* Deprecated for new users, call netdev_get_by_name() instead */
768struct net_device *dev_get_by_name(struct net *net, const char *name)
769{
770 struct net_device *dev;
771
772 rcu_read_lock();
773 dev = dev_get_by_name_rcu(net, name);
774 dev_hold(dev);
775 rcu_read_unlock();
776 return dev;
777}
778EXPORT_SYMBOL(dev_get_by_name);
779
780/**
781 * netdev_get_by_name() - find a device by its name
782 * @net: the applicable net namespace
783 * @name: name to find
784 * @tracker: tracking object for the acquired reference
785 * @gfp: allocation flags for the tracker
786 *
787 * Find an interface by name. This can be called from any
788 * context and does its own locking. The returned handle has
789 * the usage count incremented and the caller must use netdev_put() to
790 * release it when it is no longer needed. %NULL is returned if no
791 * matching device is found.
792 */
793struct net_device *netdev_get_by_name(struct net *net, const char *name,
794 netdevice_tracker *tracker, gfp_t gfp)
795{
796 struct net_device *dev;
797
798 dev = dev_get_by_name(net, name);
799 if (dev)
800 netdev_tracker_alloc(dev, tracker, gfp);
801 return dev;
802}
803EXPORT_SYMBOL(netdev_get_by_name);
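
/* Illustrative sketch, not part of the original file: name lookup with a
 * reference tracker, released with netdev_put(). "eth0" and the function
 * name are assumptions for demonstration only.
 */
static void __maybe_unused example_query_by_name(struct net *net)
{
	netdevice_tracker tracker;
	struct net_device *dev;

	dev = netdev_get_by_name(net, "eth0", &tracker, GFP_KERNEL);
	if (!dev)
		return;

	netdev_info(dev, "mtu %u\n", READ_ONCE(dev->mtu));

	/* Drop the reference taken by netdev_get_by_name(). */
	netdev_put(dev, &tracker);
}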
804
805/**
806 * __dev_get_by_index - find a device by its ifindex
807 * @net: the applicable net namespace
808 * @ifindex: index of device
809 *
810 * Search for an interface by index. Returns %NULL if the device
811 * is not found or a pointer to the device. The device has not
812 * had its reference counter increased so the caller must be careful
813 * about locking. The caller must hold the RTNL semaphore.
814 */
815
816struct net_device *__dev_get_by_index(struct net *net, int ifindex)
817{
818 struct net_device *dev;
819 struct hlist_head *head = dev_index_hash(net, ifindex);
820
821 hlist_for_each_entry(dev, head, index_hlist)
822 if (dev->ifindex == ifindex)
823 return dev;
824
825 return NULL;
826}
827EXPORT_SYMBOL(__dev_get_by_index);
828
829/**
830 * dev_get_by_index_rcu - find a device by its ifindex
831 * @net: the applicable net namespace
832 * @ifindex: index of device
833 *
834 * Search for an interface by index. Returns %NULL if the device
835 * is not found or a pointer to the device. The device has not
836 * had its reference counter increased so the caller must be careful
837 * about locking. The caller must hold RCU lock.
838 */
839
840struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
841{
842 struct net_device *dev;
843 struct hlist_head *head = dev_index_hash(net, ifindex);
844
845 hlist_for_each_entry_rcu(dev, head, index_hlist)
846 if (dev->ifindex == ifindex)
847 return dev;
848
849 return NULL;
850}
851EXPORT_SYMBOL(dev_get_by_index_rcu);
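
/* Illustrative sketch, not part of the original file: resolving an ifindex
 * to a name under the RCU read lock without taking a reference. The
 * function name is made up; @buf is assumed to be IFNAMSIZ bytes.
 */
static void __maybe_unused example_index_to_name(struct net *net, int ifindex,
						 char *buf)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		strscpy(buf, dev->name, IFNAMSIZ);
	else
		buf[0] = '\0';
	rcu_read_unlock();
}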
852
853/* Deprecated for new users, call netdev_get_by_index() instead */
854struct net_device *dev_get_by_index(struct net *net, int ifindex)
855{
856 struct net_device *dev;
857
858 rcu_read_lock();
859 dev = dev_get_by_index_rcu(net, ifindex);
860 dev_hold(dev);
861 rcu_read_unlock();
862 return dev;
863}
864EXPORT_SYMBOL(dev_get_by_index);
865
866/**
867 * netdev_get_by_index() - find a device by its ifindex
868 * @net: the applicable net namespace
869 * @ifindex: index of device
870 * @tracker: tracking object for the acquired reference
871 * @gfp: allocation flags for the tracker
872 *
873 * Search for an interface by index. Returns NULL if the device
874 * is not found or a pointer to the device. The device returned has
875 * had a reference added and the pointer is safe until the user calls
876 * netdev_put() to indicate they have finished with it.
877 */
878struct net_device *netdev_get_by_index(struct net *net, int ifindex,
879 netdevice_tracker *tracker, gfp_t gfp)
880{
881 struct net_device *dev;
882
883 dev = dev_get_by_index(net, ifindex);
884 if (dev)
885 netdev_tracker_alloc(dev, tracker, gfp);
886 return dev;
887}
888EXPORT_SYMBOL(netdev_get_by_index);
889
890/**
891 * dev_get_by_napi_id - find a device by napi_id
892 * @napi_id: ID of the NAPI struct
893 *
894 * Search for an interface by NAPI ID. Returns %NULL if the device
895 * is not found or a pointer to the device. The device has not had
896 * its reference counter increased so the caller must be careful
897 * about locking. The caller must hold RCU lock.
898 */
899
900struct net_device *dev_get_by_napi_id(unsigned int napi_id)
901{
902 struct napi_struct *napi;
903
904 WARN_ON_ONCE(!rcu_read_lock_held());
905
906 if (napi_id < MIN_NAPI_ID)
907 return NULL;
908
909 napi = napi_by_id(napi_id);
910
911 return napi ? napi->dev : NULL;
912}
913EXPORT_SYMBOL(dev_get_by_napi_id);
914
915/**
916 * netdev_get_name - get a netdevice name, knowing its ifindex.
917 * @net: network namespace
918 * @name: a pointer to the buffer where the name will be stored.
919 * @ifindex: the ifindex of the interface to get the name from.
920 */
921int netdev_get_name(struct net *net, char *name, int ifindex)
922{
923 struct net_device *dev;
924 int ret;
925
926 down_read(sem: &devnet_rename_sem);
927 rcu_read_lock();
928
929 dev = dev_get_by_index_rcu(net, ifindex);
930 if (!dev) {
931 ret = -ENODEV;
932 goto out;
933 }
934
935 strcpy(p: name, q: dev->name);
936
937 ret = 0;
938out:
939 rcu_read_unlock();
940 up_read(sem: &devnet_rename_sem);
941 return ret;
942}
943
944/**
945 * dev_getbyhwaddr_rcu - find a device by its hardware address
946 * @net: the applicable net namespace
947 * @type: media type of device
948 * @ha: hardware address
949 *
950 * Search for an interface by MAC address. Returns NULL if the device
951 * is not found or a pointer to the device.
952 * The caller must hold RCU or RTNL.
953 * The returned device has not had its ref count increased
954 * and the caller must therefore be careful about locking
955 *
956 */
957
958struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
959 const char *ha)
960{
961 struct net_device *dev;
962
963 for_each_netdev_rcu(net, dev)
964 if (dev->type == type &&
965 !memcmp(p: dev->dev_addr, q: ha, size: dev->addr_len))
966 return dev;
967
968 return NULL;
969}
970EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
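
/* Illustrative sketch, not part of the original file: MAC address lookup
 * under RCU, taking a reference before leaving the read-side section. The
 * function name is made up; @ha must be dev->addr_len (ETH_ALEN) bytes.
 */
static struct net_device * __maybe_unused
example_hold_by_mac(struct net *net, const char *ha)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, ha);
	dev_hold(dev);	/* NULL-safe; take our own reference */
	rcu_read_unlock();

	return dev;	/* caller must dev_put() when done */
}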
971
972struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
973{
974 struct net_device *dev, *ret = NULL;
975
976 rcu_read_lock();
977 for_each_netdev_rcu(net, dev)
978 if (dev->type == type) {
979 dev_hold(dev);
980 ret = dev;
981 break;
982 }
983 rcu_read_unlock();
984 return ret;
985}
986EXPORT_SYMBOL(dev_getfirstbyhwtype);
987
988/**
989 * __dev_get_by_flags - find any device with given flags
990 * @net: the applicable net namespace
991 * @if_flags: IFF_* values
992 * @mask: bitmask of bits in if_flags to check
993 *
994 * Search for any interface with the given flags. Returns NULL if a device
995 * is not found or a pointer to the device. Must be called inside
996 * rtnl_lock(), and result refcount is unchanged.
997 */
998
999struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1000 unsigned short mask)
1001{
1002 struct net_device *dev, *ret;
1003
1004 ASSERT_RTNL();
1005
1006 ret = NULL;
1007 for_each_netdev(net, dev) {
1008 if (((dev->flags ^ if_flags) & mask) == 0) {
1009 ret = dev;
1010 break;
1011 }
1012 }
1013 return ret;
1014}
1015EXPORT_SYMBOL(__dev_get_by_flags);
1016
1017/**
1018 * dev_valid_name - check if name is okay for network device
1019 * @name: name string
1020 *
1021 * Network device names need to be valid file names to
1022 * allow sysfs to work. We also disallow any kind of
1023 * whitespace.
1024 */
1025bool dev_valid_name(const char *name)
1026{
1027 if (*name == '\0')
1028 return false;
1029 if (strnlen(p: name, IFNAMSIZ) == IFNAMSIZ)
1030 return false;
1031 if (!strcmp(name, ".") || !strcmp(name, ".."))
1032 return false;
1033
1034 while (*name) {
1035 if (*name == '/' || *name == ':' || isspace(*name))
1036 return false;
1037 name++;
1038 }
1039 return true;
1040}
1041EXPORT_SYMBOL(dev_valid_name);
1042
1043/**
1044 * __dev_alloc_name - allocate a name for a device
1045 * @net: network namespace to allocate the device name in
1046 * @name: name format string
1047 * @res: result name string
1048 *
 * Passed a format string - e.g. "lt%d" - it will try and find a suitable
 * id. It scans the list of devices to build up a free map, then chooses
 * the first empty slot. The caller must hold the dev_base or rtnl lock
 * while allocating the name and adding the device in order to avoid
 * duplicates.
 * Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
1055 * Returns the number of the unit assigned or a negative errno code.
1056 */
1057
1058static int __dev_alloc_name(struct net *net, const char *name, char *res)
1059{
1060 int i = 0;
1061 const char *p;
1062 const int max_netdevices = 8*PAGE_SIZE;
1063 unsigned long *inuse;
1064 struct net_device *d;
1065 char buf[IFNAMSIZ];
1066
1067 /* Verify the string as this thing may have come from the user.
1068 * There must be one "%d" and no other "%" characters.
1069 */
1070 p = strchr(name, '%');
1071 if (!p || p[1] != 'd' || strchr(p + 2, '%'))
1072 return -EINVAL;
1073
1074 /* Use one page as a bit array of possible slots */
1075 inuse = bitmap_zalloc(nbits: max_netdevices, GFP_ATOMIC);
1076 if (!inuse)
1077 return -ENOMEM;
1078
1079 for_each_netdev(net, d) {
1080 struct netdev_name_node *name_node;
1081
1082 netdev_for_each_altname(d, name_node) {
1083 if (!sscanf(name_node->name, name, &i))
1084 continue;
1085 if (i < 0 || i >= max_netdevices)
1086 continue;
1087
1088 /* avoid cases where sscanf is not exact inverse of printf */
1089 snprintf(buf, IFNAMSIZ, fmt: name, i);
1090 if (!strncmp(buf, name_node->name, IFNAMSIZ))
1091 __set_bit(i, inuse);
1092 }
1093 if (!sscanf(d->name, name, &i))
1094 continue;
1095 if (i < 0 || i >= max_netdevices)
1096 continue;
1097
1098 /* avoid cases where sscanf is not exact inverse of printf */
1099 snprintf(buf, IFNAMSIZ, fmt: name, i);
1100 if (!strncmp(buf, d->name, IFNAMSIZ))
1101 __set_bit(i, inuse);
1102 }
1103
1104 i = find_first_zero_bit(addr: inuse, size: max_netdevices);
1105 bitmap_free(bitmap: inuse);
1106 if (i == max_netdevices)
1107 return -ENFILE;
1108
1109 /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
1110 strscpy(buf, name, IFNAMSIZ);
1111 snprintf(buf: res, IFNAMSIZ, fmt: buf, i);
1112 return i;
1113}
1114
1115/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
1116static int dev_prep_valid_name(struct net *net, struct net_device *dev,
1117 const char *want_name, char *out_name,
1118 int dup_errno)
1119{
1120 if (!dev_valid_name(want_name))
1121 return -EINVAL;
1122
1123 if (strchr(want_name, '%'))
1124 return __dev_alloc_name(net, name: want_name, res: out_name);
1125
1126 if (netdev_name_in_use(net, want_name))
1127 return -dup_errno;
1128 if (out_name != want_name)
1129 strscpy(out_name, want_name, IFNAMSIZ);
1130 return 0;
1131}
1132
1133/**
1134 * dev_alloc_name - allocate a name for a device
1135 * @dev: device
1136 * @name: name format string
1137 *
 * Passed a format string - e.g. "lt%d" - it will try and find a suitable
 * id. It scans the list of devices to build up a free map, then chooses
 * the first empty slot. The caller must hold the dev_base or rtnl lock
 * while allocating the name and adding the device in order to avoid
 * duplicates.
 * Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
1144 * Returns the number of the unit assigned or a negative errno code.
1145 */
1146
1147int dev_alloc_name(struct net_device *dev, const char *name)
1148{
1149 return dev_prep_valid_name(net: dev_net(dev), dev, want_name: name, out_name: dev->name, ENFILE);
1150}
1151EXPORT_SYMBOL(dev_alloc_name);
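
/* Illustrative sketch, not part of the original file: letting the core pick
 * a unit number from a format string before registration. "example%d" and
 * the function name are made up.
 */
static int __maybe_unused example_pick_unit(struct net_device *dev)
{
	int unit;

	ASSERT_RTNL();

	unit = dev_alloc_name(dev, "example%d");
	if (unit < 0)
		return unit;	/* -EINVAL, -ENFILE, -ENOMEM, ... */

	/* dev->name now holds e.g. "example0". */
	return 0;
}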
1152
1153static int dev_get_valid_name(struct net *net, struct net_device *dev,
1154 const char *name)
1155{
1156 int ret;
1157
1158 ret = dev_prep_valid_name(net, dev, want_name: name, out_name: dev->name, EEXIST);
1159 return ret < 0 ? ret : 0;
1160}
1161
1162/**
1163 * dev_change_name - change name of a device
1164 * @dev: device
1165 * @newname: name (or format string) must be at least IFNAMSIZ
1166 *
 * Change the name of a device. A format string such as "eth%d" can be
 * passed for wildcarding.
1169 */
1170int dev_change_name(struct net_device *dev, const char *newname)
1171{
1172 unsigned char old_assign_type;
1173 char oldname[IFNAMSIZ];
1174 int err = 0;
1175 int ret;
1176 struct net *net;
1177
1178 ASSERT_RTNL();
1179 BUG_ON(!dev_net(dev));
1180
1181 net = dev_net(dev);
1182
1183 down_write(sem: &devnet_rename_sem);
1184
1185 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1186 up_write(sem: &devnet_rename_sem);
1187 return 0;
1188 }
1189
1190 memcpy(oldname, dev->name, IFNAMSIZ);
1191
1192 err = dev_get_valid_name(net, dev, name: newname);
1193 if (err < 0) {
1194 up_write(sem: &devnet_rename_sem);
1195 return err;
1196 }
1197
1198 if (oldname[0] && !strchr(oldname, '%'))
1199 netdev_info(dev, format: "renamed from %s%s\n", oldname,
1200 dev->flags & IFF_UP ? " (while UP)" : "");
1201
1202 old_assign_type = dev->name_assign_type;
1203 WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);
1204
1205rollback:
1206 ret = device_rename(dev: &dev->dev, new_name: dev->name);
1207 if (ret) {
1208 memcpy(dev->name, oldname, IFNAMSIZ);
1209 WRITE_ONCE(dev->name_assign_type, old_assign_type);
1210 up_write(sem: &devnet_rename_sem);
1211 return ret;
1212 }
1213
1214 up_write(sem: &devnet_rename_sem);
1215
1216 netdev_adjacent_rename_links(dev, oldname);
1217
1218 netdev_name_node_del(name_node: dev->name_node);
1219
1220 synchronize_net();
1221
1222 netdev_name_node_add(net, name_node: dev->name_node);
1223
1224 ret = call_netdevice_notifiers(val: NETDEV_CHANGENAME, dev);
1225 ret = notifier_to_errno(ret);
1226
1227 if (ret) {
1228 /* err >= 0 after dev_alloc_name() or stores the first errno */
1229 if (err >= 0) {
1230 err = ret;
1231 down_write(sem: &devnet_rename_sem);
1232 memcpy(dev->name, oldname, IFNAMSIZ);
1233 memcpy(oldname, newname, IFNAMSIZ);
1234 WRITE_ONCE(dev->name_assign_type, old_assign_type);
1235 old_assign_type = NET_NAME_RENAMED;
1236 goto rollback;
1237 } else {
1238 netdev_err(dev, format: "name change rollback failed: %d\n",
1239 ret);
1240 }
1241 }
1242
1243 return err;
1244}
1245
1246/**
1247 * dev_set_alias - change ifalias of a device
1248 * @dev: device
1249 * @alias: name up to IFALIASZ
1250 * @len: limit of bytes to copy from info
1251 *
 * Set the ifalias for a device.
1253 */
1254int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1255{
1256 struct dev_ifalias *new_alias = NULL;
1257
1258 if (len >= IFALIASZ)
1259 return -EINVAL;
1260
1261 if (len) {
1262 new_alias = kmalloc(size: sizeof(*new_alias) + len + 1, GFP_KERNEL);
1263 if (!new_alias)
1264 return -ENOMEM;
1265
1266 memcpy(new_alias->ifalias, alias, len);
1267 new_alias->ifalias[len] = 0;
1268 }
1269
1270 mutex_lock(&ifalias_mutex);
1271 new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1272 mutex_is_locked(&ifalias_mutex));
1273 mutex_unlock(lock: &ifalias_mutex);
1274
1275 if (new_alias)
1276 kfree_rcu(new_alias, rcuhead);
1277
1278 return len;
1279}
1280EXPORT_SYMBOL(dev_set_alias);
1281
1282/**
1283 * dev_get_alias - get ifalias of a device
1284 * @dev: device
1285 * @name: buffer to store name of ifalias
1286 * @len: size of buffer
1287 *
 * Get the ifalias for a device. The caller must make sure dev cannot go
 * away, e.g. by holding the RCU read lock or a reference count on the device.
1290 */
1291int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1292{
1293 const struct dev_ifalias *alias;
1294 int ret = 0;
1295
1296 rcu_read_lock();
1297 alias = rcu_dereference(dev->ifalias);
1298 if (alias)
1299 ret = snprintf(buf: name, size: len, fmt: "%s", alias->ifalias);
1300 rcu_read_unlock();
1301
1302 return ret;
1303}
1304
1305/**
1306 * netdev_features_change - device changes features
1307 * @dev: device to cause notification
1308 *
1309 * Called to indicate a device has changed features.
1310 */
1311void netdev_features_change(struct net_device *dev)
1312{
1313 call_netdevice_notifiers(val: NETDEV_FEAT_CHANGE, dev);
1314}
1315EXPORT_SYMBOL(netdev_features_change);
1316
1317/**
1318 * netdev_state_change - device changes state
1319 * @dev: device to cause notification
1320 *
1321 * Called to indicate a device has changed state. This function calls
1322 * the notifier chains for netdev_chain and sends a NEWLINK message
1323 * to the routing socket.
1324 */
1325void netdev_state_change(struct net_device *dev)
1326{
1327 if (dev->flags & IFF_UP) {
1328 struct netdev_notifier_change_info change_info = {
1329 .info.dev = dev,
1330 };
1331
1332 call_netdevice_notifiers_info(val: NETDEV_CHANGE,
1333 info: &change_info.info);
1334 rtmsg_ifinfo(RTM_NEWLINK, dev, change: 0, GFP_KERNEL, portid: 0, NULL);
1335 }
1336}
1337EXPORT_SYMBOL(netdev_state_change);
1338
1339/**
1340 * __netdev_notify_peers - notify network peers about existence of @dev,
1341 * to be called when rtnl lock is already held.
1342 * @dev: network device
1343 *
1344 * Generate traffic such that interested network peers are aware of
1345 * @dev, such as by generating a gratuitous ARP. This may be used when
1346 * a device wants to inform the rest of the network about some sort of
1347 * reconfiguration such as a failover event or virtual machine
1348 * migration.
1349 */
1350void __netdev_notify_peers(struct net_device *dev)
1351{
1352 ASSERT_RTNL();
1353 call_netdevice_notifiers(val: NETDEV_NOTIFY_PEERS, dev);
1354 call_netdevice_notifiers(val: NETDEV_RESEND_IGMP, dev);
1355}
1356EXPORT_SYMBOL(__netdev_notify_peers);
1357
1358/**
1359 * netdev_notify_peers - notify network peers about existence of @dev
1360 * @dev: network device
1361 *
1362 * Generate traffic such that interested network peers are aware of
1363 * @dev, such as by generating a gratuitous ARP. This may be used when
1364 * a device wants to inform the rest of the network about some sort of
1365 * reconfiguration such as a failover event or virtual machine
1366 * migration.
1367 */
1368void netdev_notify_peers(struct net_device *dev)
1369{
1370 rtnl_lock();
1371 __netdev_notify_peers(dev);
1372 rtnl_unlock();
1373}
1374EXPORT_SYMBOL(netdev_notify_peers);
1375
1376static int napi_threaded_poll(void *data);
1377
1378static int napi_kthread_create(struct napi_struct *n)
1379{
1380 int err = 0;
1381
1382 /* Create and wake up the kthread once to put it in
1383 * TASK_INTERRUPTIBLE mode to avoid the blocked task
1384 * warning and work with loadavg.
1385 */
1386 n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
1387 n->dev->name, n->napi_id);
1388 if (IS_ERR(ptr: n->thread)) {
1389 err = PTR_ERR(ptr: n->thread);
1390 pr_err("kthread_run failed with err %d\n", err);
1391 n->thread = NULL;
1392 }
1393
1394 return err;
1395}
1396
1397static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1398{
1399 const struct net_device_ops *ops = dev->netdev_ops;
1400 int ret;
1401
1402 ASSERT_RTNL();
1403 dev_addr_check(dev);
1404
1405 if (!netif_device_present(dev)) {
1406 /* may be detached because parent is runtime-suspended */
1407 if (dev->dev.parent)
1408 pm_runtime_resume(dev: dev->dev.parent);
1409 if (!netif_device_present(dev))
1410 return -ENODEV;
1411 }
1412
	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this, there is a chance that ndo_poll_controller
	 * or ndo_poll may be running while we open the device.
	 */
1417 netpoll_poll_disable(dev);
1418
1419 ret = call_netdevice_notifiers_extack(val: NETDEV_PRE_UP, dev, extack);
1420 ret = notifier_to_errno(ret);
1421 if (ret)
1422 return ret;
1423
1424 set_bit(nr: __LINK_STATE_START, addr: &dev->state);
1425
1426 if (ops->ndo_validate_addr)
1427 ret = ops->ndo_validate_addr(dev);
1428
1429 if (!ret && ops->ndo_open)
1430 ret = ops->ndo_open(dev);
1431
1432 netpoll_poll_enable(dev);
1433
1434 if (ret)
1435 clear_bit(nr: __LINK_STATE_START, addr: &dev->state);
1436 else {
1437 dev->flags |= IFF_UP;
1438 dev_set_rx_mode(dev);
1439 dev_activate(dev);
1440 add_device_randomness(buf: dev->dev_addr, len: dev->addr_len);
1441 }
1442
1443 return ret;
1444}
1445
1446/**
1447 * dev_open - prepare an interface for use.
1448 * @dev: device to open
1449 * @extack: netlink extended ack
1450 *
1451 * Takes a device from down to up state. The device's private open
1452 * function is invoked and then the multicast lists are loaded. Finally
1453 * the device is moved into the up state and a %NETDEV_UP message is
1454 * sent to the netdev notifier chain.
1455 *
1456 * Calling this function on an active interface is a nop. On a failure
1457 * a negative errno code is returned.
1458 */
1459int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1460{
1461 int ret;
1462
1463 if (dev->flags & IFF_UP)
1464 return 0;
1465
1466 ret = __dev_open(dev, extack);
1467 if (ret < 0)
1468 return ret;
1469
1470 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, portid: 0, NULL);
1471 call_netdevice_notifiers(val: NETDEV_UP, dev);
1472
1473 return ret;
1474}
1475EXPORT_SYMBOL(dev_open);
1476
1477static void __dev_close_many(struct list_head *head)
1478{
1479 struct net_device *dev;
1480
1481 ASSERT_RTNL();
1482 might_sleep();
1483
1484 list_for_each_entry(dev, head, close_list) {
1485 /* Temporarily disable netpoll until the interface is down */
1486 netpoll_poll_disable(dev);
1487
1488 call_netdevice_notifiers(val: NETDEV_GOING_DOWN, dev);
1489
1490 clear_bit(nr: __LINK_STATE_START, addr: &dev->state);
1491
		/* Synchronize to scheduled poll. We cannot touch the poll
		 * list, it can even be on a different CPU. So just clear
		 * netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
1498 smp_mb__after_atomic(); /* Commit netif_running(). */
1499 }
1500
1501 dev_deactivate_many(head);
1502
1503 list_for_each_entry(dev, head, close_list) {
1504 const struct net_device_ops *ops = dev->netdev_ops;
1505
1506 /*
1507 * Call the device specific close. This cannot fail.
1508 * Only if device is UP
1509 *
1510 * We allow it to be called even after a DETACH hot-plug
1511 * event.
1512 */
1513 if (ops->ndo_stop)
1514 ops->ndo_stop(dev);
1515
1516 dev->flags &= ~IFF_UP;
1517 netpoll_poll_enable(dev);
1518 }
1519}
1520
1521static void __dev_close(struct net_device *dev)
1522{
1523 LIST_HEAD(single);
1524
1525 list_add(new: &dev->close_list, head: &single);
1526 __dev_close_many(head: &single);
1527 list_del(entry: &single);
1528}
1529
1530void dev_close_many(struct list_head *head, bool unlink)
1531{
1532 struct net_device *dev, *tmp;
1533
1534 /* Remove the devices that don't need to be closed */
1535 list_for_each_entry_safe(dev, tmp, head, close_list)
1536 if (!(dev->flags & IFF_UP))
1537 list_del_init(entry: &dev->close_list);
1538
1539 __dev_close_many(head);
1540
1541 list_for_each_entry_safe(dev, tmp, head, close_list) {
1542 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, portid: 0, NULL);
1543 call_netdevice_notifiers(val: NETDEV_DOWN, dev);
1544 if (unlink)
1545 list_del_init(entry: &dev->close_list);
1546 }
1547}
1548EXPORT_SYMBOL(dev_close_many);
1549
1550/**
1551 * dev_close - shutdown an interface.
1552 * @dev: device to shutdown
1553 *
1554 * This function moves an active device into down state. A
1555 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1556 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1557 * chain.
1558 */
1559void dev_close(struct net_device *dev)
1560{
1561 if (dev->flags & IFF_UP) {
1562 LIST_HEAD(single);
1563
1564 list_add(new: &dev->close_list, head: &single);
1565 dev_close_many(&single, true);
1566 list_del(entry: &single);
1567 }
1568}
1569EXPORT_SYMBOL(dev_close);
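
/* Illustrative sketch, not part of the original file: bouncing an interface
 * from kernel code. dev_open()/dev_close() require RTNL; the function name
 * is made up.
 */
static int __maybe_unused example_bounce(struct net_device *dev)
{
	int err;

	rtnl_lock();
	if (dev->flags & IFF_UP)
		dev_close(dev);
	err = dev_open(dev, NULL);	/* NULL: no extack for error reporting */
	rtnl_unlock();

	return err;
}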
1570
1571
1572/**
1573 * dev_disable_lro - disable Large Receive Offload on a device
1574 * @dev: device
1575 *
1576 * Disable Large Receive Offload (LRO) on a net device. Must be
1577 * called under RTNL. This is needed if received packets may be
1578 * forwarded to another interface.
1579 */
1580void dev_disable_lro(struct net_device *dev)
1581{
1582 struct net_device *lower_dev;
1583 struct list_head *iter;
1584
1585 dev->wanted_features &= ~NETIF_F_LRO;
1586 netdev_update_features(dev);
1587
1588 if (unlikely(dev->features & NETIF_F_LRO))
1589 netdev_WARN(dev, "failed to disable LRO!\n");
1590
1591 netdev_for_each_lower_dev(dev, lower_dev, iter)
1592 dev_disable_lro(dev: lower_dev);
1593}
1594EXPORT_SYMBOL(dev_disable_lro);
1595
1596/**
1597 * dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1598 * @dev: device
1599 *
1600 * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be
1601 * called under RTNL. This is needed if Generic XDP is installed on
1602 * the device.
1603 */
1604static void dev_disable_gro_hw(struct net_device *dev)
1605{
1606 dev->wanted_features &= ~NETIF_F_GRO_HW;
1607 netdev_update_features(dev);
1608
1609 if (unlikely(dev->features & NETIF_F_GRO_HW))
1610 netdev_WARN(dev, "failed to disable GRO_HW!\n");
1611}
1612
1613const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1614{
1615#define N(val) \
1616 case NETDEV_##val: \
1617 return "NETDEV_" __stringify(val);
1618 switch (cmd) {
1619 N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1620 N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1621 N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1622 N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
1623 N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
1624 N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
1625 N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1626 N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1627 N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1628 N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
1629 N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
1630 N(XDP_FEAT_CHANGE)
1631 }
1632#undef N
1633 return "UNKNOWN_NETDEV_EVENT";
1634}
1635EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1636
1637static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1638 struct net_device *dev)
1639{
1640 struct netdev_notifier_info info = {
1641 .dev = dev,
1642 };
1643
1644 return nb->notifier_call(nb, val, &info);
1645}
1646
1647static int call_netdevice_register_notifiers(struct notifier_block *nb,
1648 struct net_device *dev)
1649{
1650 int err;
1651
1652 err = call_netdevice_notifier(nb, val: NETDEV_REGISTER, dev);
1653 err = notifier_to_errno(ret: err);
1654 if (err)
1655 return err;
1656
1657 if (!(dev->flags & IFF_UP))
1658 return 0;
1659
1660 call_netdevice_notifier(nb, val: NETDEV_UP, dev);
1661 return 0;
1662}
1663
1664static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1665 struct net_device *dev)
1666{
1667 if (dev->flags & IFF_UP) {
1668 call_netdevice_notifier(nb, val: NETDEV_GOING_DOWN,
1669 dev);
1670 call_netdevice_notifier(nb, val: NETDEV_DOWN, dev);
1671 }
1672 call_netdevice_notifier(nb, val: NETDEV_UNREGISTER, dev);
1673}
1674
1675static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1676 struct net *net)
1677{
1678 struct net_device *dev;
1679 int err;
1680
1681 for_each_netdev(net, dev) {
1682 err = call_netdevice_register_notifiers(nb, dev);
1683 if (err)
1684 goto rollback;
1685 }
1686 return 0;
1687
1688rollback:
1689 for_each_netdev_continue_reverse(net, dev)
1690 call_netdevice_unregister_notifiers(nb, dev);
1691 return err;
1692}
1693
1694static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1695 struct net *net)
1696{
1697 struct net_device *dev;
1698
1699 for_each_netdev(net, dev)
1700 call_netdevice_unregister_notifiers(nb, dev);
1701}
1702
1703static int dev_boot_phase = 1;
1704
1705/**
1706 * register_netdevice_notifier - register a network notifier block
1707 * @nb: notifier
1708 *
1709 * Register a notifier to be called when network device events occur.
1710 * The notifier passed is linked into the kernel structures and must
1711 * not be reused until it has been unregistered. A negative errno code
1712 * is returned on a failure.
1713 *
 * When registered, all registration and up events are replayed
 * to the new notifier to allow it to have a race-free view of the
 * network device list.
1717 */
1718
1719int register_netdevice_notifier(struct notifier_block *nb)
1720{
1721 struct net *net;
1722 int err;
1723
1724 /* Close race with setup_net() and cleanup_net() */
1725 down_write(sem: &pernet_ops_rwsem);
1726 rtnl_lock();
1727 err = raw_notifier_chain_register(nh: &netdev_chain, nb);
1728 if (err)
1729 goto unlock;
1730 if (dev_boot_phase)
1731 goto unlock;
1732 for_each_net(net) {
1733 err = call_netdevice_register_net_notifiers(nb, net);
1734 if (err)
1735 goto rollback;
1736 }
1737
1738unlock:
1739 rtnl_unlock();
1740 up_write(sem: &pernet_ops_rwsem);
1741 return err;
1742
1743rollback:
1744 for_each_net_continue_reverse(net)
1745 call_netdevice_unregister_net_notifiers(nb, net);
1746
1747 raw_notifier_chain_unregister(nh: &netdev_chain, nb);
1748 goto unlock;
1749}
1750EXPORT_SYMBOL(register_netdevice_notifier);
1751
1752/**
1753 * unregister_netdevice_notifier - unregister a network notifier block
1754 * @nb: notifier
1755 *
1756 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked from the
1758 * kernel structures and may then be reused. A negative errno code
1759 * is returned on a failure.
1760 *
 * After unregistering, unregister and down device events are synthesized
1762 * for all devices on the device list to the removed notifier to remove
1763 * the need for special case cleanup code.
1764 */
1765
1766int unregister_netdevice_notifier(struct notifier_block *nb)
1767{
1768 struct net *net;
1769 int err;
1770
1771 /* Close race with setup_net() and cleanup_net() */
1772 down_write(sem: &pernet_ops_rwsem);
1773 rtnl_lock();
1774 err = raw_notifier_chain_unregister(nh: &netdev_chain, nb);
1775 if (err)
1776 goto unlock;
1777
1778 for_each_net(net)
1779 call_netdevice_unregister_net_notifiers(nb, net);
1780
1781unlock:
1782 rtnl_unlock();
1783 up_write(sem: &pernet_ops_rwsem);
1784 return err;
1785}
1786EXPORT_SYMBOL(unregister_netdevice_notifier);
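
/* Illustrative sketch, not part of the original file: a global netdevice
 * notifier that logs events by name via netdev_cmd_to_name().
 * example_netdev_event() and example_netdev_nb are made-up identifiers.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	netdev_dbg(dev, "event %s\n", netdev_cmd_to_name(event));

	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb __maybe_unused = {
	.notifier_call = example_netdev_event,
};

/* Pairing, e.g. from module init/exit:
 *
 *	register_netdevice_notifier(&example_netdev_nb);
 *	...
 *	unregister_netdevice_notifier(&example_netdev_nb);
 */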
1787
1788static int __register_netdevice_notifier_net(struct net *net,
1789 struct notifier_block *nb,
1790 bool ignore_call_fail)
1791{
1792 int err;
1793
1794 err = raw_notifier_chain_register(nh: &net->netdev_chain, nb);
1795 if (err)
1796 return err;
1797 if (dev_boot_phase)
1798 return 0;
1799
1800 err = call_netdevice_register_net_notifiers(nb, net);
1801 if (err && !ignore_call_fail)
1802 goto chain_unregister;
1803
1804 return 0;
1805
1806chain_unregister:
1807 raw_notifier_chain_unregister(nh: &net->netdev_chain, nb);
1808 return err;
1809}
1810
1811static int __unregister_netdevice_notifier_net(struct net *net,
1812 struct notifier_block *nb)
1813{
1814 int err;
1815
1816 err = raw_notifier_chain_unregister(nh: &net->netdev_chain, nb);
1817 if (err)
1818 return err;
1819
1820 call_netdevice_unregister_net_notifiers(nb, net);
1821 return 0;
1822}
1823
1824/**
1825 * register_netdevice_notifier_net - register a per-netns network notifier block
1826 * @net: network namespace
1827 * @nb: notifier
1828 *
1829 * Register a notifier to be called when network device events occur.
1830 * The notifier passed is linked into the kernel structures and must
1831 * not be reused until it has been unregistered. A negative errno code
1832 * is returned on a failure.
1833 *
 * When registered, all registration and up events are replayed
 * to the new notifier to allow it to have a race-free view of the
 * network device list.
1837 */
1838
1839int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1840{
1841 int err;
1842
1843 rtnl_lock();
1844 err = __register_netdevice_notifier_net(net, nb, ignore_call_fail: false);
1845 rtnl_unlock();
1846 return err;
1847}
1848EXPORT_SYMBOL(register_netdevice_notifier_net);
1849
1850/**
1851 * unregister_netdevice_notifier_net - unregister a per-netns
1852 * network notifier block
1853 * @net: network namespace
1854 * @nb: notifier
1855 *
1856 * Unregister a notifier previously registered by
1857 * register_netdevice_notifier_net(). The notifier is unlinked from the
1858 * kernel structures and may then be reused. A negative errno code
1859 * is returned on a failure.
1860 *
 * After unregistering, unregister and down device events are synthesized
1862 * for all devices on the device list to the removed notifier to remove
1863 * the need for special case cleanup code.
1864 */
1865
1866int unregister_netdevice_notifier_net(struct net *net,
1867 struct notifier_block *nb)
1868{
1869 int err;
1870
1871 rtnl_lock();
1872 err = __unregister_netdevice_notifier_net(net, nb);
1873 rtnl_unlock();
1874 return err;
1875}
1876EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1877
1878static void __move_netdevice_notifier_net(struct net *src_net,
1879 struct net *dst_net,
1880 struct notifier_block *nb)
1881{
1882 __unregister_netdevice_notifier_net(net: src_net, nb);
1883 __register_netdevice_notifier_net(net: dst_net, nb, ignore_call_fail: true);
1884}
1885
1886int register_netdevice_notifier_dev_net(struct net_device *dev,
1887 struct notifier_block *nb,
1888 struct netdev_net_notifier *nn)
1889{
1890 int err;
1891
1892 rtnl_lock();
1893 err = __register_netdevice_notifier_net(net: dev_net(dev), nb, ignore_call_fail: false);
1894 if (!err) {
1895 nn->nb = nb;
1896 list_add(new: &nn->list, head: &dev->net_notifier_list);
1897 }
1898 rtnl_unlock();
1899 return err;
1900}
1901EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
1902
1903int unregister_netdevice_notifier_dev_net(struct net_device *dev,
1904 struct notifier_block *nb,
1905 struct netdev_net_notifier *nn)
1906{
1907 int err;
1908
1909 rtnl_lock();
1910 list_del(entry: &nn->list);
1911 err = __unregister_netdevice_notifier_net(net: dev_net(dev), nb);
1912 rtnl_unlock();
1913 return err;
1914}
1915EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
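
/*
 * Illustrative sketch (hypothetical driver code): a driver that wants its
 * notifier to follow one of its devices across network namespace moves
 * embeds a struct netdev_net_notifier next to the notifier_block, so the
 * move_netdevice_notifiers_dev_net() helper below can migrate it.
 *
 *	struct example_priv {
 *		struct notifier_block nb;
 *		struct netdev_net_notifier nn;
 *	};
 *
 *	err = register_netdevice_notifier_dev_net(dev, &priv->nb, &priv->nn);
 *	...
 *	unregister_netdevice_notifier_dev_net(dev, &priv->nb, &priv->nn);
 */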
1916
1917static void move_netdevice_notifiers_dev_net(struct net_device *dev,
1918 struct net *net)
1919{
1920 struct netdev_net_notifier *nn;
1921
1922 list_for_each_entry(nn, &dev->net_notifier_list, list)
1923 __move_netdevice_notifier_net(src_net: dev_net(dev), dst_net: net, nb: nn->nb);
1924}
1925
1926/**
1927 * call_netdevice_notifiers_info - call all network notifier blocks
1928 * @val: value passed unmodified to notifier function
1929 * @info: notifier information data
1930 *
1931 * Call all network notifier blocks. Parameters and return value
1932 * are as for raw_notifier_call_chain().
1933 */
1934
1935int call_netdevice_notifiers_info(unsigned long val,
1936 struct netdev_notifier_info *info)
1937{
1938 struct net *net = dev_net(dev: info->dev);
1939 int ret;
1940
1941 ASSERT_RTNL();
1942
1943 /* Run per-netns notifier block chain first, then run the global one.
1944 * Hopefully, one day, the global one is going to be removed after
1945 * all notifier block registrators get converted to be per-netns.
1946 */
1947 ret = raw_notifier_call_chain(nh: &net->netdev_chain, val, v: info);
1948 if (ret & NOTIFY_STOP_MASK)
1949 return ret;
1950 return raw_notifier_call_chain(nh: &netdev_chain, val, v: info);
1951}
1952
1953/**
1954 * call_netdevice_notifiers_info_robust - call per-netns notifier blocks
1955 * and roll back on error
1956 * @val_up: value passed unmodified to notifier function
1957 * @val_down: value passed unmodified to the notifier function when
1958 * recovering from an error on @val_up
1959 * @info: notifier information data
1960 *
1961 * Call all per-netns network notifier blocks, but not notifier blocks on
1962 * the global notifier chain. Parameters and return value are as for
1963 * raw_notifier_call_chain_robust().
1964 */
1965
1966static int
1967call_netdevice_notifiers_info_robust(unsigned long val_up,
1968 unsigned long val_down,
1969 struct netdev_notifier_info *info)
1970{
1971 struct net *net = dev_net(dev: info->dev);
1972
1973 ASSERT_RTNL();
1974
1975 return raw_notifier_call_chain_robust(nh: &net->netdev_chain,
1976 val_up, val_down, v: info);
1977}
1978
1979static int call_netdevice_notifiers_extack(unsigned long val,
1980 struct net_device *dev,
1981 struct netlink_ext_ack *extack)
1982{
1983 struct netdev_notifier_info info = {
1984 .dev = dev,
1985 .extack = extack,
1986 };
1987
1988 return call_netdevice_notifiers_info(val, info: &info);
1989}
1990
1991/**
1992 * call_netdevice_notifiers - call all network notifier blocks
1993 * @val: value passed unmodified to notifier function
1994 * @dev: net_device pointer passed unmodified to notifier function
1995 *
1996 * Call all network notifier blocks. Parameters and return value
1997 * are as for raw_notifier_call_chain().
1998 */
1999
2000int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2001{
2002 return call_netdevice_notifiers_extack(val, dev, NULL);
2003}
2004EXPORT_SYMBOL(call_netdevice_notifiers);
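
/*
 * Illustrative sketch, not from this file: a stacked driver in the bonding
 * style may emit an event on the chains itself. The caller must hold RTNL,
 * as call_netdevice_notifiers_info() asserts; "slave_dev" is a hypothetical
 * lower device.
 *
 *	ASSERT_RTNL();
 *	call_netdevice_notifiers(NETDEV_JOIN, slave_dev);
 */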
2005
2006/**
2007 * call_netdevice_notifiers_mtu - call all network notifier blocks
2008 * @val: value passed unmodified to notifier function
2009 * @dev: net_device pointer passed unmodified to notifier function
2010 * @arg: additional u32 argument passed to the notifier function
2011 *
2012 * Call all network notifier blocks. Parameters and return value
2013 * are as for raw_notifier_call_chain().
2014 */
2015static int call_netdevice_notifiers_mtu(unsigned long val,
2016 struct net_device *dev, u32 arg)
2017{
2018 struct netdev_notifier_info_ext info = {
2019 .info.dev = dev,
2020 .ext.mtu = arg,
2021 };
2022
2023 BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2024
2025 return call_netdevice_notifiers_info(val, info: &info.info);
2026}
2027
2028#ifdef CONFIG_NET_INGRESS
2029static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2030
2031void net_inc_ingress_queue(void)
2032{
2033 static_branch_inc(&ingress_needed_key);
2034}
2035EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2036
2037void net_dec_ingress_queue(void)
2038{
2039 static_branch_dec(&ingress_needed_key);
2040}
2041EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2042#endif
2043
2044#ifdef CONFIG_NET_EGRESS
2045static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2046
2047void net_inc_egress_queue(void)
2048{
2049 static_branch_inc(&egress_needed_key);
2050}
2051EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2052
2053void net_dec_egress_queue(void)
2054{
2055 static_branch_dec(&egress_needed_key);
2056}
2057EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2058#endif
2059
2060DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2061EXPORT_SYMBOL(netstamp_needed_key);
2062#ifdef CONFIG_JUMP_LABEL
2063static atomic_t netstamp_needed_deferred;
2064static atomic_t netstamp_wanted;
2065static void netstamp_clear(struct work_struct *work)
2066{
2067 int deferred = atomic_xchg(v: &netstamp_needed_deferred, new: 0);
2068 int wanted;
2069
2070 wanted = atomic_add_return(i: deferred, v: &netstamp_wanted);
2071 if (wanted > 0)
2072 static_branch_enable(&netstamp_needed_key);
2073 else
2074 static_branch_disable(&netstamp_needed_key);
2075}
2076static DECLARE_WORK(netstamp_work, netstamp_clear);
2077#endif
2078
2079void net_enable_timestamp(void)
2080{
2081#ifdef CONFIG_JUMP_LABEL
2082 int wanted = atomic_read(v: &netstamp_wanted);
2083
2084 while (wanted > 0) {
2085 if (atomic_try_cmpxchg(v: &netstamp_wanted, old: &wanted, new: wanted + 1))
2086 return;
2087 }
2088 atomic_inc(v: &netstamp_needed_deferred);
2089 schedule_work(work: &netstamp_work);
2090#else
2091 static_branch_inc(&netstamp_needed_key);
2092#endif
2093}
2094EXPORT_SYMBOL(net_enable_timestamp);
2095
2096void net_disable_timestamp(void)
2097{
2098#ifdef CONFIG_JUMP_LABEL
2099 int wanted = atomic_read(v: &netstamp_wanted);
2100
2101 while (wanted > 1) {
2102 if (atomic_try_cmpxchg(v: &netstamp_wanted, old: &wanted, new: wanted - 1))
2103 return;
2104 }
2105 atomic_dec(v: &netstamp_needed_deferred);
2106 schedule_work(work: &netstamp_work);
2107#else
2108 static_branch_dec(&netstamp_needed_key);
2109#endif
2110}
2111EXPORT_SYMBOL(net_disable_timestamp);
2112
2113static inline void net_timestamp_set(struct sk_buff *skb)
2114{
2115 skb->tstamp = 0;
2116 skb->mono_delivery_time = 0;
2117 if (static_branch_unlikely(&netstamp_needed_key))
2118 skb->tstamp = ktime_get_real();
2119}
2120
2121#define net_timestamp_check(COND, SKB) \
2122 if (static_branch_unlikely(&netstamp_needed_key)) { \
2123 if ((COND) && !(SKB)->tstamp) \
2124 (SKB)->tstamp = ktime_get_real(); \
2125 } \
2126
2127bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2128{
2129 return __is_skb_forwardable(dev, skb, check_mtu: true);
2130}
2131EXPORT_SYMBOL_GPL(is_skb_forwardable);
2132
2133static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
2134 bool check_mtu)
2135{
2136 int ret = ____dev_forward_skb(dev, skb, check_mtu);
2137
2138 if (likely(!ret)) {
2139 skb->protocol = eth_type_trans(skb, dev);
2140 skb_postpull_rcsum(skb, start: eth_hdr(skb), ETH_HLEN);
2141 }
2142
2143 return ret;
2144}
2145
2146int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2147{
2148 return __dev_forward_skb2(dev, skb, check_mtu: true);
2149}
2150EXPORT_SYMBOL_GPL(__dev_forward_skb);
2151
2152/**
2153 * dev_forward_skb - loopback an skb to another netif
2154 *
2155 * @dev: destination network device
2156 * @skb: buffer to forward
2157 *
2158 * return values:
2159 * NET_RX_SUCCESS (no congestion)
2160 * NET_RX_DROP (packet was dropped, but freed)
2161 *
2162 * dev_forward_skb can be used for injecting an skb from the
2163 * start_xmit function of one device into the receive queue
2164 * of another device.
2165 *
2166 * The receiving device may be in another namespace, so
2167 * we have to clear all information in the skb that could
2168 * impact namespace isolation.
2169 */
2170int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2171{
2172 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2173}
2174EXPORT_SYMBOL_GPL(dev_forward_skb);
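
/*
 * Illustrative sketch (hypothetical "pair" device, veth-style): forwarding
 * an skb from one device's start_xmit into its peer's receive path. The
 * example_* names are made up for the example.
 *
 *	static netdev_tx_t example_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		struct net_device *peer = example_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */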
2175
2176int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
2177{
2178 return __dev_forward_skb2(dev, skb, check_mtu: false) ?: netif_rx_internal(skb);
2179}
2180
2181static inline int deliver_skb(struct sk_buff *skb,
2182 struct packet_type *pt_prev,
2183 struct net_device *orig_dev)
2184{
2185 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2186 return -ENOMEM;
2187 refcount_inc(r: &skb->users);
2188 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2189}
2190
2191static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2192 struct packet_type **pt,
2193 struct net_device *orig_dev,
2194 __be16 type,
2195 struct list_head *ptype_list)
2196{
2197 struct packet_type *ptype, *pt_prev = *pt;
2198
2199 list_for_each_entry_rcu(ptype, ptype_list, list) {
2200 if (ptype->type != type)
2201 continue;
2202 if (pt_prev)
2203 deliver_skb(skb, pt_prev, orig_dev);
2204 pt_prev = ptype;
2205 }
2206 *pt = pt_prev;
2207}
2208
2209static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2210{
2211 if (!ptype->af_packet_priv || !skb->sk)
2212 return false;
2213
2214 if (ptype->id_match)
2215 return ptype->id_match(ptype, skb->sk);
2216 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2217 return true;
2218
2219 return false;
2220}
2221
2222/**
2223 * dev_nit_active - return true if any network interface taps are in use
2224 *
2225 * @dev: network device to check for the presence of taps
2226 */
2227bool dev_nit_active(struct net_device *dev)
2228{
2229 return !list_empty(head: &net_hotdata.ptype_all) ||
2230 !list_empty(head: &dev->ptype_all);
2231}
2232EXPORT_SYMBOL_GPL(dev_nit_active);
2233
2234/*
2235 * Support routine. Sends outgoing frames to any network
2236 * taps currently in use.
2237 */
2238
2239void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2240{
2241 struct list_head *ptype_list = &net_hotdata.ptype_all;
2242 struct packet_type *ptype, *pt_prev = NULL;
2243 struct sk_buff *skb2 = NULL;
2244
2245 rcu_read_lock();
2246again:
2247 list_for_each_entry_rcu(ptype, ptype_list, list) {
2248 if (READ_ONCE(ptype->ignore_outgoing))
2249 continue;
2250
2251 /* Never send packets back to the socket
2252 * they originated from - MvS (miquels@drinkel.ow.org)
2253 */
2254 if (skb_loop_sk(ptype, skb))
2255 continue;
2256
2257 if (pt_prev) {
2258 deliver_skb(skb: skb2, pt_prev, orig_dev: skb->dev);
2259 pt_prev = ptype;
2260 continue;
2261 }
2262
2263 /* need to clone skb, done only once */
2264 skb2 = skb_clone(skb, GFP_ATOMIC);
2265 if (!skb2)
2266 goto out_unlock;
2267
2268 net_timestamp_set(skb: skb2);
2269
2270 /* The network header should be correctly
2271 * set by the sender, so the check below is
2272 * just protection against buggy protocols.
2273 */
2274 skb_reset_mac_header(skb: skb2);
2275
2276 if (skb_network_header(skb: skb2) < skb2->data ||
2277 skb_network_header(skb: skb2) > skb_tail_pointer(skb: skb2)) {
2278 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2279 ntohs(skb2->protocol),
2280 dev->name);
2281 skb_reset_network_header(skb: skb2);
2282 }
2283
2284 skb2->transport_header = skb2->network_header;
2285 skb2->pkt_type = PACKET_OUTGOING;
2286 pt_prev = ptype;
2287 }
2288
2289 if (ptype_list == &net_hotdata.ptype_all) {
2290 ptype_list = &dev->ptype_all;
2291 goto again;
2292 }
2293out_unlock:
2294 if (pt_prev) {
2295 if (!skb_orphan_frags_rx(skb: skb2, GFP_ATOMIC))
2296 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2297 else
2298 kfree_skb(skb: skb2);
2299 }
2300 rcu_read_unlock();
2301}
2302EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2303
2304/**
2305 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2306 * @dev: Network device
2307 * @txq: number of queues available
2308 *
2309 * If real_num_tx_queues is changed the tc mappings may no longer be
2310 * valid. To resolve this, verify that each tc mapping remains valid and,
2311 * if it is not, NULL the mapping. With no priorities mapping to this
2312 * offset/count pair it will no longer be used. In the worst case, if TC0
2313 * is invalid, nothing can be done, so priority mappings are disabled. It is
2314 * expected that drivers will fix this mapping, if they can, before
2315 * calling netif_set_real_num_tx_queues.
2316 */
2317static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2318{
2319 int i;
2320 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2321
2322 /* If TC0 is invalidated disable TC mapping */
2323 if (tc->offset + tc->count > txq) {
2324 netdev_warn(dev, format: "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2325 dev->num_tc = 0;
2326 return;
2327 }
2328
2329 /* Invalidated prio to tc mappings set to TC0 */
2330 for (i = 1; i < TC_BITMASK + 1; i++) {
2331 int q = netdev_get_prio_tc_map(dev, prio: i);
2332
2333 tc = &dev->tc_to_txq[q];
2334 if (tc->offset + tc->count > txq) {
2335 netdev_warn(dev, format: "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2336 i, q);
2337 netdev_set_prio_tc_map(dev, prio: i, tc: 0);
2338 }
2339 }
2340}
2341
2342int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2343{
2344 if (dev->num_tc) {
2345 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2346 int i;
2347
2348 /* walk through the TCs and see if it falls into any of them */
2349 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2350 if ((txq - tc->offset) < tc->count)
2351 return i;
2352 }
2353
2354 /* didn't find it, just return -1 to indicate no match */
2355 return -1;
2356 }
2357
2358 return 0;
2359}
2360EXPORT_SYMBOL(netdev_txq_to_tc);
2361
2362#ifdef CONFIG_XPS
2363static struct static_key xps_needed __read_mostly;
2364static struct static_key xps_rxqs_needed __read_mostly;
2365static DEFINE_MUTEX(xps_map_mutex);
2366#define xmap_dereference(P) \
2367 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2368
2369static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2370 struct xps_dev_maps *old_maps, int tci, u16 index)
2371{
2372 struct xps_map *map = NULL;
2373 int pos;
2374
2375 map = xmap_dereference(dev_maps->attr_map[tci]);
2376 if (!map)
2377 return false;
2378
2379 for (pos = map->len; pos--;) {
2380 if (map->queues[pos] != index)
2381 continue;
2382
2383 if (map->len > 1) {
2384 map->queues[pos] = map->queues[--map->len];
2385 break;
2386 }
2387
2388 if (old_maps)
2389 RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
2390 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2391 kfree_rcu(map, rcu);
2392 return false;
2393 }
2394
2395 return true;
2396}
2397
2398static bool remove_xps_queue_cpu(struct net_device *dev,
2399 struct xps_dev_maps *dev_maps,
2400 int cpu, u16 offset, u16 count)
2401{
2402 int num_tc = dev_maps->num_tc;
2403 bool active = false;
2404 int tci;
2405
2406 for (tci = cpu * num_tc; num_tc--; tci++) {
2407 int i, j;
2408
2409 for (i = count, j = offset; i--; j++) {
2410 if (!remove_xps_queue(dev_maps, NULL, tci, index: j))
2411 break;
2412 }
2413
2414 active |= i < 0;
2415 }
2416
2417 return active;
2418}
2419
2420static void reset_xps_maps(struct net_device *dev,
2421 struct xps_dev_maps *dev_maps,
2422 enum xps_map_type type)
2423{
2424 static_key_slow_dec_cpuslocked(key: &xps_needed);
2425 if (type == XPS_RXQS)
2426 static_key_slow_dec_cpuslocked(key: &xps_rxqs_needed);
2427
2428 RCU_INIT_POINTER(dev->xps_maps[type], NULL);
2429
2430 kfree_rcu(dev_maps, rcu);
2431}
2432
2433static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
2434 u16 offset, u16 count)
2435{
2436 struct xps_dev_maps *dev_maps;
2437 bool active = false;
2438 int i, j;
2439
2440 dev_maps = xmap_dereference(dev->xps_maps[type]);
2441 if (!dev_maps)
2442 return;
2443
2444 for (j = 0; j < dev_maps->nr_ids; j++)
2445 active |= remove_xps_queue_cpu(dev, dev_maps, cpu: j, offset, count);
2446 if (!active)
2447 reset_xps_maps(dev, dev_maps, type);
2448
2449 if (type == XPS_CPUS) {
2450 for (i = offset + (count - 1); count--; i--)
2451 netdev_queue_numa_node_write(
2452 q: netdev_get_tx_queue(dev, index: i), NUMA_NO_NODE);
2453 }
2454}
2455
2456static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2457 u16 count)
2458{
2459 if (!static_key_false(key: &xps_needed))
2460 return;
2461
2462 cpus_read_lock();
2463 mutex_lock(&xps_map_mutex);
2464
2465 if (static_key_false(key: &xps_rxqs_needed))
2466 clean_xps_maps(dev, type: XPS_RXQS, offset, count);
2467
2468 clean_xps_maps(dev, type: XPS_CPUS, offset, count);
2469
2470 mutex_unlock(lock: &xps_map_mutex);
2471 cpus_read_unlock();
2472}
2473
2474static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2475{
2476 netif_reset_xps_queues(dev, offset: index, count: dev->num_tx_queues - index);
2477}
2478
2479static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2480 u16 index, bool is_rxqs_map)
2481{
2482 struct xps_map *new_map;
2483 int alloc_len = XPS_MIN_MAP_ALLOC;
2484 int i, pos;
2485
2486 for (pos = 0; map && pos < map->len; pos++) {
2487 if (map->queues[pos] != index)
2488 continue;
2489 return map;
2490 }
2491
2492 /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2493 if (map) {
2494 if (pos < map->alloc_len)
2495 return map;
2496
2497 alloc_len = map->alloc_len * 2;
2498 }
2499
2500 /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2501 * map
2502 */
2503 if (is_rxqs_map)
2504 new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2505 else
2506 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2507 cpu_to_node(cpu: attr_index));
2508 if (!new_map)
2509 return NULL;
2510
2511 for (i = 0; i < pos; i++)
2512 new_map->queues[i] = map->queues[i];
2513 new_map->alloc_len = alloc_len;
2514 new_map->len = pos;
2515
2516 return new_map;
2517}
2518
2519/* Copy xps maps at a given index */
2520static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
2521 struct xps_dev_maps *new_dev_maps, int index,
2522 int tc, bool skip_tc)
2523{
2524 int i, tci = index * dev_maps->num_tc;
2525 struct xps_map *map;
2526
2527 /* copy maps belonging to foreign traffic classes */
2528 for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2529 if (i == tc && skip_tc)
2530 continue;
2531
2532 /* fill in the new device map from the old device map */
2533 map = xmap_dereference(dev_maps->attr_map[tci]);
2534 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2535 }
2536}
2537
2538/* Must be called under cpus_read_lock */
2539int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2540 u16 index, enum xps_map_type type)
2541{
2542 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
2543 const unsigned long *online_mask = NULL;
2544 bool active = false, copy = false;
2545 int i, j, tci, numa_node_id = -2;
2546 int maps_sz, num_tc = 1, tc = 0;
2547 struct xps_map *map, *new_map;
2548 unsigned int nr_ids;
2549
2550 WARN_ON_ONCE(index >= dev->num_tx_queues);
2551
2552 if (dev->num_tc) {
2553 /* Do not allow XPS on subordinate device directly */
2554 num_tc = dev->num_tc;
2555 if (num_tc < 0)
2556 return -EINVAL;
2557
2558 /* If queue belongs to subordinate dev use its map */
2559 dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2560
2561 tc = netdev_txq_to_tc(dev, index);
2562 if (tc < 0)
2563 return -EINVAL;
2564 }
2565
2566 mutex_lock(&xps_map_mutex);
2567
2568 dev_maps = xmap_dereference(dev->xps_maps[type]);
2569 if (type == XPS_RXQS) {
2570 maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2571 nr_ids = dev->num_rx_queues;
2572 } else {
2573 maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2574 if (num_possible_cpus() > 1)
2575 online_mask = cpumask_bits(cpu_online_mask);
2576 nr_ids = nr_cpu_ids;
2577 }
2578
2579 if (maps_sz < L1_CACHE_BYTES)
2580 maps_sz = L1_CACHE_BYTES;
2581
2582 /* The old dev_maps could be larger or smaller than the one we're
2583 * setting up now, as dev->num_tc or nr_ids could have been updated in
2584 * between. We could try to be smart, but let's be safe instead and only
2585 * copy foreign traffic classes if the two map sizes match.
2586 */
2587 if (dev_maps &&
2588 dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
2589 copy = true;
2590
2591 /* allocate memory for queue storage */
2592 for (j = -1; j = netif_attrmask_next_and(n: j, src1p: online_mask, src2p: mask, nr_bits: nr_ids),
2593 j < nr_ids;) {
2594 if (!new_dev_maps) {
2595 new_dev_maps = kzalloc(size: maps_sz, GFP_KERNEL);
2596 if (!new_dev_maps) {
2597 mutex_unlock(lock: &xps_map_mutex);
2598 return -ENOMEM;
2599 }
2600
2601 new_dev_maps->nr_ids = nr_ids;
2602 new_dev_maps->num_tc = num_tc;
2603 }
2604
2605 tci = j * num_tc + tc;
2606 map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
2607
2608 map = expand_xps_map(map, attr_index: j, index, is_rxqs_map: type == XPS_RXQS);
2609 if (!map)
2610 goto error;
2611
2612 RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2613 }
2614
2615 if (!new_dev_maps)
2616 goto out_no_new_maps;
2617
2618 if (!dev_maps) {
2619 /* Increment static keys at most once per type */
2620 static_key_slow_inc_cpuslocked(key: &xps_needed);
2621 if (type == XPS_RXQS)
2622 static_key_slow_inc_cpuslocked(key: &xps_rxqs_needed);
2623 }
2624
2625 for (j = 0; j < nr_ids; j++) {
2626 bool skip_tc = false;
2627
2628 tci = j * num_tc + tc;
2629 if (netif_attr_test_mask(j, mask, nr_bits: nr_ids) &&
2630 netif_attr_test_online(j, online_mask, nr_bits: nr_ids)) {
2631 /* add tx-queue to CPU/rx-queue maps */
2632 int pos = 0;
2633
2634 skip_tc = true;
2635
2636 map = xmap_dereference(new_dev_maps->attr_map[tci]);
2637 while ((pos < map->len) && (map->queues[pos] != index))
2638 pos++;
2639
2640 if (pos == map->len)
2641 map->queues[map->len++] = index;
2642#ifdef CONFIG_NUMA
2643 if (type == XPS_CPUS) {
2644 if (numa_node_id == -2)
2645 numa_node_id = cpu_to_node(cpu: j);
2646 else if (numa_node_id != cpu_to_node(cpu: j))
2647 numa_node_id = -1;
2648 }
2649#endif
2650 }
2651
2652 if (copy)
2653 xps_copy_dev_maps(dev_maps, new_dev_maps, index: j, tc,
2654 skip_tc);
2655 }
2656
2657 rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
2658
2659 /* Cleanup old maps */
2660 if (!dev_maps)
2661 goto out_no_old_maps;
2662
2663 for (j = 0; j < dev_maps->nr_ids; j++) {
2664 for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
2665 map = xmap_dereference(dev_maps->attr_map[tci]);
2666 if (!map)
2667 continue;
2668
2669 if (copy) {
2670 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2671 if (map == new_map)
2672 continue;
2673 }
2674
2675 RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2676 kfree_rcu(map, rcu);
2677 }
2678 }
2679
2680 old_dev_maps = dev_maps;
2681
2682out_no_old_maps:
2683 dev_maps = new_dev_maps;
2684 active = true;
2685
2686out_no_new_maps:
2687 if (type == XPS_CPUS)
2688 /* update Tx queue numa node */
2689 netdev_queue_numa_node_write(q: netdev_get_tx_queue(dev, index),
2690 node: (numa_node_id >= 0) ?
2691 numa_node_id : NUMA_NO_NODE);
2692
2693 if (!dev_maps)
2694 goto out_no_maps;
2695
2696 /* removes tx-queue from unused CPUs/rx-queues */
2697 for (j = 0; j < dev_maps->nr_ids; j++) {
2698 tci = j * dev_maps->num_tc;
2699
2700 for (i = 0; i < dev_maps->num_tc; i++, tci++) {
2701 if (i == tc &&
2702 netif_attr_test_mask(j, mask, nr_bits: dev_maps->nr_ids) &&
2703 netif_attr_test_online(j, online_mask, nr_bits: dev_maps->nr_ids))
2704 continue;
2705
2706 active |= remove_xps_queue(dev_maps,
2707 old_maps: copy ? old_dev_maps : NULL,
2708 tci, index);
2709 }
2710 }
2711
2712 if (old_dev_maps)
2713 kfree_rcu(old_dev_maps, rcu);
2714
2715 /* free map if not active */
2716 if (!active)
2717 reset_xps_maps(dev, dev_maps, type);
2718
2719out_no_maps:
2720 mutex_unlock(lock: &xps_map_mutex);
2721
2722 return 0;
2723error:
2724 /* remove any maps that we added */
2725 for (j = 0; j < nr_ids; j++) {
2726 for (i = num_tc, tci = j * num_tc; i--; tci++) {
2727 new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2728 map = copy ?
2729 xmap_dereference(dev_maps->attr_map[tci]) :
2730 NULL;
2731 if (new_map && new_map != map)
2732 kfree(objp: new_map);
2733 }
2734 }
2735
2736 mutex_unlock(lock: &xps_map_mutex);
2737
2738 kfree(objp: new_dev_maps);
2739 return -ENOMEM;
2740}
2741EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2742
2743int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2744 u16 index)
2745{
2746 int ret;
2747
2748 cpus_read_lock();
2749 ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
2750 cpus_read_unlock();
2751
2752 return ret;
2753}
2754EXPORT_SYMBOL(netif_set_xps_queue);
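
/*
 * Illustrative sketch (hypothetical driver): pinning each TX queue to the
 * CPU whose interrupt services it, assuming one TX queue was allocated per
 * online CPU.
 *
 *	for (i = 0; i < dev->real_num_tx_queues; i++)
 *		netif_set_xps_queue(dev, cpumask_of(i), i);
 */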
2755
2756#endif
2757static void netdev_unbind_all_sb_channels(struct net_device *dev)
2758{
2759 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2760
2761 /* Unbind any subordinate channels */
2762 while (txq-- != &dev->_tx[0]) {
2763 if (txq->sb_dev)
2764 netdev_unbind_sb_channel(dev, sb_dev: txq->sb_dev);
2765 }
2766}
2767
2768void netdev_reset_tc(struct net_device *dev)
2769{
2770#ifdef CONFIG_XPS
2771 netif_reset_xps_queues_gt(dev, index: 0);
2772#endif
2773 netdev_unbind_all_sb_channels(dev);
2774
2775 /* Reset TC configuration of device */
2776 dev->num_tc = 0;
2777 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2778 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2779}
2780EXPORT_SYMBOL(netdev_reset_tc);
2781
2782int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2783{
2784 if (tc >= dev->num_tc)
2785 return -EINVAL;
2786
2787#ifdef CONFIG_XPS
2788 netif_reset_xps_queues(dev, offset, count);
2789#endif
2790 dev->tc_to_txq[tc].count = count;
2791 dev->tc_to_txq[tc].offset = offset;
2792 return 0;
2793}
2794EXPORT_SYMBOL(netdev_set_tc_queue);
2795
2796int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2797{
2798 if (num_tc > TC_MAX_QUEUE)
2799 return -EINVAL;
2800
2801#ifdef CONFIG_XPS
2802 netif_reset_xps_queues_gt(dev, index: 0);
2803#endif
2804 netdev_unbind_all_sb_channels(dev);
2805
2806 dev->num_tc = num_tc;
2807 return 0;
2808}
2809EXPORT_SYMBOL(netdev_set_num_tc);
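
/*
 * Illustrative sketch (hypothetical mqprio-style offload): a driver exposing
 * two traffic classes over eight TX queues could partition them as below.
 *
 *	netdev_set_num_tc(dev, 2);
 *	netdev_set_tc_queue(dev, 0, 4, 0);	(TC0: queues 0-3)
 *	netdev_set_tc_queue(dev, 1, 4, 4);	(TC1: queues 4-7)
 *	netdev_set_prio_tc_map(dev, 0, 0);
 *	netdev_set_prio_tc_map(dev, 1, 1);
 */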
2810
2811void netdev_unbind_sb_channel(struct net_device *dev,
2812 struct net_device *sb_dev)
2813{
2814 struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2815
2816#ifdef CONFIG_XPS
2817 netif_reset_xps_queues_gt(dev: sb_dev, index: 0);
2818#endif
2819 memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2820 memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2821
2822 while (txq-- != &dev->_tx[0]) {
2823 if (txq->sb_dev == sb_dev)
2824 txq->sb_dev = NULL;
2825 }
2826}
2827EXPORT_SYMBOL(netdev_unbind_sb_channel);
2828
2829int netdev_bind_sb_channel_queue(struct net_device *dev,
2830 struct net_device *sb_dev,
2831 u8 tc, u16 count, u16 offset)
2832{
2833 /* Make certain the sb_dev and dev are already configured */
2834 if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2835 return -EINVAL;
2836
2837 /* We cannot hand out queues we don't have */
2838 if ((offset + count) > dev->real_num_tx_queues)
2839 return -EINVAL;
2840
2841 /* Record the mapping */
2842 sb_dev->tc_to_txq[tc].count = count;
2843 sb_dev->tc_to_txq[tc].offset = offset;
2844
2845 /* Provide a way for Tx queue to find the tc_to_txq map or
2846 * XPS map for itself.
2847 */
2848 while (count--)
2849 netdev_get_tx_queue(dev, index: count + offset)->sb_dev = sb_dev;
2850
2851 return 0;
2852}
2853EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2854
2855int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2856{
2857 /* Do not use a multiqueue device to represent a subordinate channel */
2858 if (netif_is_multiqueue(dev))
2859 return -ENODEV;
2860
2861 /* We allow channels 1 - 32767 to be used for subordinate channels.
2862 * Channel 0 is meant to be "native" mode and used only to represent
2863 * the main root device. We allow writing 0 to reset the device back
2864 * to normal mode after being used as a subordinate channel.
2865 */
2866 if (channel > S16_MAX)
2867 return -EINVAL;
2868
2869 dev->num_tc = -channel;
2870
2871 return 0;
2872}
2873EXPORT_SYMBOL(netdev_set_sb_channel);
2874
2875/*
2876 * Routine to help set real_num_tx_queues. To avoid leaving skbs mapped to
2877 * queues beyond the new real_num_tx_queues, stale skbs on the qdisc must be flushed.
2878 */
2879int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2880{
2881 bool disabling;
2882 int rc;
2883
2884 disabling = txq < dev->real_num_tx_queues;
2885
2886 if (txq < 1 || txq > dev->num_tx_queues)
2887 return -EINVAL;
2888
2889 if (dev->reg_state == NETREG_REGISTERED ||
2890 dev->reg_state == NETREG_UNREGISTERING) {
2891 ASSERT_RTNL();
2892
2893 rc = netdev_queue_update_kobjects(net: dev, old_num: dev->real_num_tx_queues,
2894 new_num: txq);
2895 if (rc)
2896 return rc;
2897
2898 if (dev->num_tc)
2899 netif_setup_tc(dev, txq);
2900
2901 dev_qdisc_change_real_num_tx(dev, new_real_tx: txq);
2902
2903 dev->real_num_tx_queues = txq;
2904
2905 if (disabling) {
2906 synchronize_net();
2907 qdisc_reset_all_tx_gt(dev, i: txq);
2908#ifdef CONFIG_XPS
2909 netif_reset_xps_queues_gt(dev, index: txq);
2910#endif
2911 }
2912 } else {
2913 dev->real_num_tx_queues = txq;
2914 }
2915
2916 return 0;
2917}
2918EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2919
2920#ifdef CONFIG_SYSFS
2921/**
2922 * netif_set_real_num_rx_queues - set actual number of RX queues used
2923 * @dev: Network device
2924 * @rxq: Actual number of RX queues
2925 *
2926 * This must be called either with the rtnl_lock held or before
2927 * registration of the net device. Returns 0 on success, or a
2928 * negative error code. If called before registration, it always
2929 * succeeds.
2930 */
2931int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2932{
2933 int rc;
2934
2935 if (rxq < 1 || rxq > dev->num_rx_queues)
2936 return -EINVAL;
2937
2938 if (dev->reg_state == NETREG_REGISTERED) {
2939 ASSERT_RTNL();
2940
2941 rc = net_rx_queue_update_kobjects(dev, old_num: dev->real_num_rx_queues,
2942 new_num: rxq);
2943 if (rc)
2944 return rc;
2945 }
2946
2947 dev->real_num_rx_queues = rxq;
2948 return 0;
2949}
2950EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2951#endif
2952
2953/**
2954 * netif_set_real_num_queues - set actual number of RX and TX queues used
2955 * @dev: Network device
2956 * @txq: Actual number of TX queues
2957 * @rxq: Actual number of RX queues
2958 *
2959 * Set the real number of both TX and RX queues.
2960 * Does nothing if the number of queues is already correct.
2961 */
2962int netif_set_real_num_queues(struct net_device *dev,
2963 unsigned int txq, unsigned int rxq)
2964{
2965 unsigned int old_rxq = dev->real_num_rx_queues;
2966 int err;
2967
2968 if (txq < 1 || txq > dev->num_tx_queues ||
2969 rxq < 1 || rxq > dev->num_rx_queues)
2970 return -EINVAL;
2971
2972 /* Start from increases, so the error path only does decreases -
2973 * decreases can't fail.
2974 */
2975 if (rxq > dev->real_num_rx_queues) {
2976 err = netif_set_real_num_rx_queues(dev, rxq);
2977 if (err)
2978 return err;
2979 }
2980 if (txq > dev->real_num_tx_queues) {
2981 err = netif_set_real_num_tx_queues(dev, txq);
2982 if (err)
2983 goto undo_rx;
2984 }
2985 if (rxq < dev->real_num_rx_queues)
2986 WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
2987 if (txq < dev->real_num_tx_queues)
2988 WARN_ON(netif_set_real_num_tx_queues(dev, txq));
2989
2990 return 0;
2991undo_rx:
2992 WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
2993 return err;
2994}
2995EXPORT_SYMBOL(netif_set_real_num_queues);
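
/*
 * Illustrative sketch (hypothetical driver open path): after negotiating the
 * channel count with firmware, update both queue counts in one call so the
 * error path stays simple. nr_tx_channels/nr_rx_channels are assumed names.
 *
 *	err = netif_set_real_num_queues(dev, nr_tx_channels, nr_rx_channels);
 *	if (err)
 *		goto err_free_channels;
 */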
2996
2997/**
2998 * netif_set_tso_max_size() - set the max size of TSO frames supported
2999 * @dev: netdev to update
3000 * @size: max skb->len of a TSO frame
3001 *
3002 * Set the limit on the size of TSO super-frames the device can handle.
3003 * Unless explicitly set the stack will assume the value of
3004 * %GSO_LEGACY_MAX_SIZE.
3005 */
3006void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
3007{
3008 dev->tso_max_size = min(GSO_MAX_SIZE, size);
3009 if (size < READ_ONCE(dev->gso_max_size))
3010 netif_set_gso_max_size(dev, size);
3011 if (size < READ_ONCE(dev->gso_ipv4_max_size))
3012 netif_set_gso_ipv4_max_size(dev, size);
3013}
3014EXPORT_SYMBOL(netif_set_tso_max_size);
3015
3016/**
3017 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
3018 * @dev: netdev to update
3019 * @segs: max number of TCP segments
3020 *
3021 * Set the limit on the number of TCP segments the device can generate from
3022 * a single TSO super-frame.
3023 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
3024 */
3025void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
3026{
3027 dev->tso_max_segs = segs;
3028 if (segs < READ_ONCE(dev->gso_max_segs))
3029 netif_set_gso_max_segs(dev, segs);
3030}
3031EXPORT_SYMBOL(netif_set_tso_max_segs);
3032
3033/**
3034 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
3035 * @to: netdev to update
3036 * @from: netdev from which to copy the limits
3037 */
3038void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
3039{
3040 netif_set_tso_max_size(to, from->tso_max_size);
3041 netif_set_tso_max_segs(to, from->tso_max_segs);
3042}
3043EXPORT_SYMBOL(netif_inherit_tso_max);
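
/*
 * Illustrative sketch (hypothetical probe path): a device whose DMA engine
 * caps a single TSO job at 256 KB and 64 descriptors would advertise that
 * here; the gso_max_* values are clamped down automatically if needed.
 *
 *	netif_set_tso_max_size(dev, SZ_256K);
 *	netif_set_tso_max_segs(dev, 64);
 */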
3044
3045/**
3046 * netif_get_num_default_rss_queues - default number of RSS queues
3047 *
3048 * Default value is the number of physical cores if there are only 1 or 2;
3049 * otherwise it is half the number of physical cores, rounded up.
3050 */
3051int netif_get_num_default_rss_queues(void)
3052{
3053 cpumask_var_t cpus;
3054 int cpu, count = 0;
3055
3056 if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
3057 return 1;
3058
3059 cpumask_copy(dstp: cpus, cpu_online_mask);
3060 for_each_cpu(cpu, cpus) {
3061 ++count;
3062 cpumask_andnot(dstp: cpus, src1p: cpus, topology_sibling_cpumask(cpu));
3063 }
3064 free_cpumask_var(mask: cpus);
3065
3066 return count > 2 ? DIV_ROUND_UP(count, 2) : count;
3067}
3068EXPORT_SYMBOL(netif_get_num_default_rss_queues);
3069
3070static void __netif_reschedule(struct Qdisc *q)
3071{
3072 struct softnet_data *sd;
3073 unsigned long flags;
3074
3075 local_irq_save(flags);
3076 sd = this_cpu_ptr(&softnet_data);
3077 q->next_sched = NULL;
3078 *sd->output_queue_tailp = q;
3079 sd->output_queue_tailp = &q->next_sched;
3080 raise_softirq_irqoff(nr: NET_TX_SOFTIRQ);
3081 local_irq_restore(flags);
3082}
3083
3084void __netif_schedule(struct Qdisc *q)
3085{
3086 if (!test_and_set_bit(nr: __QDISC_STATE_SCHED, addr: &q->state))
3087 __netif_reschedule(q);
3088}
3089EXPORT_SYMBOL(__netif_schedule);
3090
3091struct dev_kfree_skb_cb {
3092 enum skb_drop_reason reason;
3093};
3094
3095static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3096{
3097 return (struct dev_kfree_skb_cb *)skb->cb;
3098}
3099
3100void netif_schedule_queue(struct netdev_queue *txq)
3101{
3102 rcu_read_lock();
3103 if (!netif_xmit_stopped(dev_queue: txq)) {
3104 struct Qdisc *q = rcu_dereference(txq->qdisc);
3105
3106 __netif_schedule(q);
3107 }
3108 rcu_read_unlock();
3109}
3110EXPORT_SYMBOL(netif_schedule_queue);
3111
3112void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3113{
3114 if (test_and_clear_bit(nr: __QUEUE_STATE_DRV_XOFF, addr: &dev_queue->state)) {
3115 struct Qdisc *q;
3116
3117 rcu_read_lock();
3118 q = rcu_dereference(dev_queue->qdisc);
3119 __netif_schedule(q);
3120 rcu_read_unlock();
3121 }
3122}
3123EXPORT_SYMBOL(netif_tx_wake_queue);
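
/*
 * Illustrative sketch (hypothetical driver): the usual pattern these helpers
 * support is stopping a queue when the TX ring fills in ndo_start_xmit and
 * waking it from the completion handler once descriptors are reclaimed.
 * example_ring_free() and EXAMPLE_WAKE_THRESH are assumed names.
 *
 *	In ndo_start_xmit, after posting a descriptor:
 *		if (example_ring_free(ring) < MAX_SKB_FRAGS + 1)
 *			netif_tx_stop_queue(txq);
 *
 *	In the TX completion path:
 *		if (netif_tx_queue_stopped(txq) &&
 *		    example_ring_free(ring) > EXAMPLE_WAKE_THRESH)
 *			netif_tx_wake_queue(txq);
 */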
3124
3125void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3126{
3127 unsigned long flags;
3128
3129 if (unlikely(!skb))
3130 return;
3131
3132 if (likely(refcount_read(&skb->users) == 1)) {
3133 smp_rmb();
3134 refcount_set(r: &skb->users, n: 0);
3135 } else if (likely(!refcount_dec_and_test(&skb->users))) {
3136 return;
3137 }
3138 get_kfree_skb_cb(skb)->reason = reason;
3139 local_irq_save(flags);
3140 skb->next = __this_cpu_read(softnet_data.completion_queue);
3141 __this_cpu_write(softnet_data.completion_queue, skb);
3142 raise_softirq_irqoff(nr: NET_TX_SOFTIRQ);
3143 local_irq_restore(flags);
3144}
3145EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
3146
3147void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
3148{
3149 if (in_hardirq() || irqs_disabled())
3150 dev_kfree_skb_irq_reason(skb, reason);
3151 else
3152 kfree_skb_reason(skb, reason);
3153}
3154EXPORT_SYMBOL(dev_kfree_skb_any_reason);
3155
3156
3157/**
3158 * netif_device_detach - mark device as removed
3159 * @dev: network device
3160 *
3161 * Mark device as removed from system and therefore no longer available.
3162 */
3163void netif_device_detach(struct net_device *dev)
3164{
3165 if (test_and_clear_bit(nr: __LINK_STATE_PRESENT, addr: &dev->state) &&
3166 netif_running(dev)) {
3167 netif_tx_stop_all_queues(dev);
3168 }
3169}
3170EXPORT_SYMBOL(netif_device_detach);
3171
3172/**
3173 * netif_device_attach - mark device as attached
3174 * @dev: network device
3175 *
3176 * Mark device as attached to the system and restart if needed.
3177 */
3178void netif_device_attach(struct net_device *dev)
3179{
3180 if (!test_and_set_bit(nr: __LINK_STATE_PRESENT, addr: &dev->state) &&
3181 netif_running(dev)) {
3182 netif_tx_wake_all_queues(dev);
3183 __netdev_watchdog_up(dev);
3184 }
3185}
3186EXPORT_SYMBOL(netif_device_attach);
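
/*
 * Illustrative sketch (hypothetical PM callbacks): detach around suspend and
 * re-attach on resume so the stack stops handing packets to the driver while
 * the hardware is powered down. The example_hw_* helpers are assumed names.
 *
 *	static int example_suspend(struct device *d)
 *	{
 *		netif_device_detach(priv->netdev);
 *		example_hw_power_down(priv);
 *		return 0;
 *	}
 *
 *	static int example_resume(struct device *d)
 *	{
 *		example_hw_power_up(priv);
 *		netif_device_attach(priv->netdev);
 *		return 0;
 *	}
 */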
3187
3188/*
3189 * Returns a Tx hash based on the given packet descriptor and a Tx queue
3190 * count to be used as a distribution range.
3191 */
3192static u16 skb_tx_hash(const struct net_device *dev,
3193 const struct net_device *sb_dev,
3194 struct sk_buff *skb)
3195{
3196 u32 hash;
3197 u16 qoffset = 0;
3198 u16 qcount = dev->real_num_tx_queues;
3199
3200 if (dev->num_tc) {
3201 u8 tc = netdev_get_prio_tc_map(dev, prio: skb->priority);
3202
3203 qoffset = sb_dev->tc_to_txq[tc].offset;
3204 qcount = sb_dev->tc_to_txq[tc].count;
3205 if (unlikely(!qcount)) {
3206 net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
3207 sb_dev->name, qoffset, tc);
3208 qoffset = 0;
3209 qcount = dev->real_num_tx_queues;
3210 }
3211 }
3212
3213 if (skb_rx_queue_recorded(skb)) {
3214 DEBUG_NET_WARN_ON_ONCE(qcount == 0);
3215 hash = skb_get_rx_queue(skb);
3216 if (hash >= qoffset)
3217 hash -= qoffset;
3218 while (unlikely(hash >= qcount))
3219 hash -= qcount;
3220 return hash + qoffset;
3221 }
3222
3223 return (u16) reciprocal_scale(val: skb_get_hash(skb), ep_ro: qcount) + qoffset;
3224}
3225
3226void skb_warn_bad_offload(const struct sk_buff *skb)
3227{
3228 static const netdev_features_t null_features;
3229 struct net_device *dev = skb->dev;
3230 const char *name = "";
3231
3232 if (!net_ratelimit())
3233 return;
3234
3235 if (dev) {
3236 if (dev->dev.parent)
3237 name = dev_driver_string(dev: dev->dev.parent);
3238 else
3239 name = netdev_name(dev);
3240 }
3241 skb_dump(KERN_WARNING, skb, full_pkt: false);
3242 WARN(1, "%s: caps=(%pNF, %pNF)\n",
3243 name, dev ? &dev->features : &null_features,
3244 skb->sk ? &skb->sk->sk_route_caps : &null_features);
3245}
3246
3247/*
3248 * Invalidate hardware checksum when packet is to be mangled, and
3249 * complete checksum manually on outgoing path.
3250 */
3251int skb_checksum_help(struct sk_buff *skb)
3252{
3253 __wsum csum;
3254 int ret = 0, offset;
3255
3256 if (skb->ip_summed == CHECKSUM_COMPLETE)
3257 goto out_set_summed;
3258
3259 if (unlikely(skb_is_gso(skb))) {
3260 skb_warn_bad_offload(skb);
3261 return -EINVAL;
3262 }
3263
3264 /* Before computing a checksum, we should make sure no frag could
3265 * be modified by an external entity: the checksum could be wrong.
3266 */
3267 if (skb_has_shared_frag(skb)) {
3268 ret = __skb_linearize(skb);
3269 if (ret)
3270 goto out;
3271 }
3272
3273 offset = skb_checksum_start_offset(skb);
3274 ret = -EINVAL;
3275 if (unlikely(offset >= skb_headlen(skb))) {
3276 DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3277 WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
3278 offset, skb_headlen(skb));
3279 goto out;
3280 }
3281 csum = skb_checksum(skb, offset, len: skb->len - offset, csum: 0);
3282
3283 offset += skb->csum_offset;
3284 if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
3285 DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
3286 WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
3287 offset + sizeof(__sum16), skb_headlen(skb));
3288 goto out;
3289 }
3290 ret = skb_ensure_writable(skb, write_len: offset + sizeof(__sum16));
3291 if (ret)
3292 goto out;
3293
3294 *(__sum16 *)(skb->data + offset) = csum_fold(sum: csum) ?: CSUM_MANGLED_0;
3295out_set_summed:
3296 skb->ip_summed = CHECKSUM_NONE;
3297out:
3298 return ret;
3299}
3300EXPORT_SYMBOL(skb_checksum_help);
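
/*
 * Illustrative sketch (hypothetical xmit path): a driver whose hardware
 * cannot checksum a given protocol resolves CHECKSUM_PARTIAL in software
 * before handing the frame over. example_hw_can_csum() is an assumed helper.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !example_hw_can_csum(skb) &&
 *	    skb_checksum_help(skb))
 *		goto drop;
 */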
3301
3302int skb_crc32c_csum_help(struct sk_buff *skb)
3303{
3304 __le32 crc32c_csum;
3305 int ret = 0, offset, start;
3306
3307 if (skb->ip_summed != CHECKSUM_PARTIAL)
3308 goto out;
3309
3310 if (unlikely(skb_is_gso(skb)))
3311 goto out;
3312
3313 /* Before computing a checksum, we should make sure no frag could
3314 * be modified by an external entity: the checksum could be wrong.
3315 */
3316 if (unlikely(skb_has_shared_frag(skb))) {
3317 ret = __skb_linearize(skb);
3318 if (ret)
3319 goto out;
3320 }
3321 start = skb_checksum_start_offset(skb);
3322 offset = start + offsetof(struct sctphdr, checksum);
3323 if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3324 ret = -EINVAL;
3325 goto out;
3326 }
3327
3328 ret = skb_ensure_writable(skb, write_len: offset + sizeof(__le32));
3329 if (ret)
3330 goto out;
3331
3332 crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3333 skb->len - start, ~(__u32)0,
3334 crc32c_csum_stub));
3335 *(__le32 *)(skb->data + offset) = crc32c_csum;
3336 skb_reset_csum_not_inet(skb);
3337out:
3338 return ret;
3339}
3340
3341__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3342{
3343 __be16 type = skb->protocol;
3344
3345 /* Tunnel gso handlers can set protocol to ethernet. */
3346 if (type == htons(ETH_P_TEB)) {
3347 struct ethhdr *eth;
3348
3349 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3350 return 0;
3351
3352 eth = (struct ethhdr *)skb->data;
3353 type = eth->h_proto;
3354 }
3355
3356 return vlan_get_protocol_and_depth(skb, type, depth);
3357}
3358
3359
3360/* Take action when hardware reception checksum errors are detected. */
3361#ifdef CONFIG_BUG
3362static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3363{
3364 netdev_err(dev, format: "hw csum failure\n");
3365 skb_dump(KERN_ERR, skb, full_pkt: true);
3366 dump_stack();
3367}
3368
3369void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3370{
3371 DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
3372}
3373EXPORT_SYMBOL(netdev_rx_csum_fault);
3374#endif
3375
3376/* XXX: check that highmem exists at all on the given machine. */
3377static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3378{
3379#ifdef CONFIG_HIGHMEM
3380 int i;
3381
3382 if (!(dev->features & NETIF_F_HIGHDMA)) {
3383 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3384 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3385
3386 if (PageHighMem(skb_frag_page(frag)))
3387 return 1;
3388 }
3389 }
3390#endif
3391 return 0;
3392}
3393
3394/* If MPLS offload request, verify we are testing hardware MPLS features
3395 * instead of standard features for the netdev.
3396 */
3397#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3398static netdev_features_t net_mpls_features(struct sk_buff *skb,
3399 netdev_features_t features,
3400 __be16 type)
3401{
3402 if (eth_p_mpls(eth_type: type))
3403 features &= skb->dev->mpls_features;
3404
3405 return features;
3406}
3407#else
3408static netdev_features_t net_mpls_features(struct sk_buff *skb,
3409 netdev_features_t features,
3410 __be16 type)
3411{
3412 return features;
3413}
3414#endif
3415
3416static netdev_features_t harmonize_features(struct sk_buff *skb,
3417 netdev_features_t features)
3418{
3419 __be16 type;
3420
3421 type = skb_network_protocol(skb, NULL);
3422 features = net_mpls_features(skb, features, type);
3423
3424 if (skb->ip_summed != CHECKSUM_NONE &&
3425 !can_checksum_protocol(features, protocol: type)) {
3426 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3427 }
3428 if (illegal_highdma(dev: skb->dev, skb))
3429 features &= ~NETIF_F_SG;
3430
3431 return features;
3432}
3433
3434netdev_features_t passthru_features_check(struct sk_buff *skb,
3435 struct net_device *dev,
3436 netdev_features_t features)
3437{
3438 return features;
3439}
3440EXPORT_SYMBOL(passthru_features_check);
3441
3442static netdev_features_t dflt_features_check(struct sk_buff *skb,
3443 struct net_device *dev,
3444 netdev_features_t features)
3445{
3446 return vlan_features_check(skb, features);
3447}
3448
3449static netdev_features_t gso_features_check(const struct sk_buff *skb,
3450 struct net_device *dev,
3451 netdev_features_t features)
3452{
3453 u16 gso_segs = skb_shinfo(skb)->gso_segs;
3454
3455 if (gso_segs > READ_ONCE(dev->gso_max_segs))
3456 return features & ~NETIF_F_GSO_MASK;
3457
3458 if (unlikely(skb->len >= READ_ONCE(dev->gso_max_size)))
3459 return features & ~NETIF_F_GSO_MASK;
3460
3461 if (!skb_shinfo(skb)->gso_type) {
3462 skb_warn_bad_offload(skb);
3463 return features & ~NETIF_F_GSO_MASK;
3464 }
3465
3466 /* Support for GSO partial features requires software
3467 * intervention before we can actually process the packets
3468 * so we need to strip support for any partial features now
3469 * and we can pull them back in after we have partially
3470 * segmented the frame.
3471 */
3472 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3473 features &= ~dev->gso_partial_features;
3474
3475 /* Make sure to clear the IPv4 ID mangling feature if the
3476 * IPv4 header has the potential to be fragmented.
3477 */
3478 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3479 struct iphdr *iph = skb->encapsulation ?
3480 inner_ip_hdr(skb) : ip_hdr(skb);
3481
3482 if (!(iph->frag_off & htons(IP_DF)))
3483 features &= ~NETIF_F_TSO_MANGLEID;
3484 }
3485
3486 return features;
3487}
3488
3489netdev_features_t netif_skb_features(struct sk_buff *skb)
3490{
3491 struct net_device *dev = skb->dev;
3492 netdev_features_t features = dev->features;
3493
3494 if (skb_is_gso(skb))
3495 features = gso_features_check(skb, dev, features);
3496
3497 /* If encapsulation offload request, verify we are testing
3498 * hardware encapsulation features instead of standard
3499 * features for the netdev
3500 */
3501 if (skb->encapsulation)
3502 features &= dev->hw_enc_features;
3503
3504 if (skb_vlan_tagged(skb))
3505 features = netdev_intersect_features(f1: features,
3506 f2: dev->vlan_features |
3507 NETIF_F_HW_VLAN_CTAG_TX |
3508 NETIF_F_HW_VLAN_STAG_TX);
3509
3510 if (dev->netdev_ops->ndo_features_check)
3511 features &= dev->netdev_ops->ndo_features_check(skb, dev,
3512 features);
3513 else
3514 features &= dflt_features_check(skb, dev, features);
3515
3516 return harmonize_features(skb, features);
3517}
3518EXPORT_SYMBOL(netif_skb_features);
3519
3520static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3521 struct netdev_queue *txq, bool more)
3522{
3523 unsigned int len;
3524 int rc;
3525
3526 if (dev_nit_active(dev))
3527 dev_queue_xmit_nit(skb, dev);
3528
3529 len = skb->len;
3530 trace_net_dev_start_xmit(skb, dev);
3531 rc = netdev_start_xmit(skb, dev, txq, more);
3532 trace_net_dev_xmit(skb, rc, dev, skb_len: len);
3533
3534 return rc;
3535}
3536
3537struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3538 struct netdev_queue *txq, int *ret)
3539{
3540 struct sk_buff *skb = first;
3541 int rc = NETDEV_TX_OK;
3542
3543 while (skb) {
3544 struct sk_buff *next = skb->next;
3545
3546 skb_mark_not_on_list(skb);
3547 rc = xmit_one(skb, dev, txq, more: next != NULL);
3548 if (unlikely(!dev_xmit_complete(rc))) {
3549 skb->next = next;
3550 goto out;
3551 }
3552
3553 skb = next;
3554 if (netif_tx_queue_stopped(dev_queue: txq) && skb) {
3555 rc = NETDEV_TX_BUSY;
3556 break;
3557 }
3558 }
3559
3560out:
3561 *ret = rc;
3562 return skb;
3563}
3564
3565static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3566 netdev_features_t features)
3567{
3568 if (skb_vlan_tag_present(skb) &&
3569 !vlan_hw_offload_capable(features, proto: skb->vlan_proto))
3570 skb = __vlan_hwaccel_push_inside(skb);
3571 return skb;
3572}
3573
3574int skb_csum_hwoffload_help(struct sk_buff *skb,
3575 const netdev_features_t features)
3576{
3577 if (unlikely(skb_csum_is_sctp(skb)))
3578 return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3579 skb_crc32c_csum_help(skb);
3580
3581 if (features & NETIF_F_HW_CSUM)
3582 return 0;
3583
3584 if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
3585 switch (skb->csum_offset) {
3586 case offsetof(struct tcphdr, check):
3587 case offsetof(struct udphdr, check):
3588 return 0;
3589 }
3590 }
3591
3592 return skb_checksum_help(skb);
3593}
3594EXPORT_SYMBOL(skb_csum_hwoffload_help);
3595
3596static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3597{
3598 netdev_features_t features;
3599
3600 features = netif_skb_features(skb);
3601 skb = validate_xmit_vlan(skb, features);
3602 if (unlikely(!skb))
3603 goto out_null;
3604
3605 skb = sk_validate_xmit_skb(skb, dev);
3606 if (unlikely(!skb))
3607 goto out_null;
3608
3609 if (netif_needs_gso(skb, features)) {
3610 struct sk_buff *segs;
3611
3612 segs = skb_gso_segment(skb, features);
3613 if (IS_ERR(ptr: segs)) {
3614 goto out_kfree_skb;
3615 } else if (segs) {
3616 consume_skb(skb);
3617 skb = segs;
3618 }
3619 } else {
3620 if (skb_needs_linearize(skb, features) &&
3621 __skb_linearize(skb))
3622 goto out_kfree_skb;
3623
3624 /* If packet is not checksummed and device does not
3625 * support checksumming for this protocol, complete
3626 * checksumming here.
3627 */
3628 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3629 if (skb->encapsulation)
3630 skb_set_inner_transport_header(skb,
3631 offset: skb_checksum_start_offset(skb));
3632 else
3633 skb_set_transport_header(skb,
3634 offset: skb_checksum_start_offset(skb));
3635 if (skb_csum_hwoffload_help(skb, features))
3636 goto out_kfree_skb;
3637 }
3638 }
3639
3640 skb = validate_xmit_xfrm(skb, features, again);
3641
3642 return skb;
3643
3644out_kfree_skb:
3645 kfree_skb(skb);
3646out_null:
3647 dev_core_stats_tx_dropped_inc(dev);
3648 return NULL;
3649}
3650
3651struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3652{
3653 struct sk_buff *next, *head = NULL, *tail;
3654
3655 for (; skb != NULL; skb = next) {
3656 next = skb->next;
3657 skb_mark_not_on_list(skb);
3658
3659 /* in case skb won't be segmented, point to itself */
3660 skb->prev = skb;
3661
3662 skb = validate_xmit_skb(skb, dev, again);
3663 if (!skb)
3664 continue;
3665
3666 if (!head)
3667 head = skb;
3668 else
3669 tail->