// SPDX-License-Identifier: GPL-2.0

#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/inetdevice.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>

#include <net/netfilter/nf_nat_masquerade.h>

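/* One deferred cleanup request: iterate the conntrack table of @net and
 * drop entries that match the downed device (@ifindex) and, for address
 * removal events, the removed address (@addr), using the @iter callback.
 */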
struct masq_dev_work {
	struct work_struct work;
	struct net *net;
	netns_tracker ns_tracker;
	union nf_inet_addr addr;
	int ifindex;
	int (*iter)(struct nf_conn *i, void *data);
};

#define MAX_MASQ_WORKER_COUNT	16

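/* masq_refcnt counts the users sharing the single set of notifiers below
 * and is protected by masq_mutex; masq_worker_count bounds the number of
 * in-flight cleanup work items.
 */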
static DEFINE_MUTEX(masq_mutex);
static unsigned int masq_refcnt __read_mostly;
static atomic_t masq_worker_count __read_mostly;

unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
		       const struct nf_nat_range2 *range,
		       const struct net_device *out)
{
	struct nf_conn *ct;
	struct nf_conn_nat *nat;
	enum ip_conntrack_info ctinfo;
	struct nf_nat_range2 newrange;
	const struct rtable *rt;
	__be32 newsrc, nh;

	WARN_ON(hooknum != NF_INET_POST_ROUTING);

	ct = nf_ct_get(skb, &ctinfo);

	WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
			 ctinfo == IP_CT_RELATED_REPLY)));

	/* Source address is 0.0.0.0 - locally generated packet that is
	 * probably not supposed to be masqueraded.
	 */
	if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
		return NF_ACCEPT;

	rt = skb_rtable(skb);
	nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
	newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
	if (!newsrc) {
		pr_info("%s ate my IP address\n", out->name);
		return NF_DROP;
	}

	nat = nf_ct_nat_ext_add(ct);
	if (nat)
		nat->masq_index = out->ifindex;

	/* Transfer from original range. */
	memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
	memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
	newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
	newrange.min_addr.ip = newsrc;
	newrange.max_addr.ip = newsrc;
	newrange.min_proto = range->min_proto;
	newrange.max_proto = range->max_proto;

	/* Hand modified range to generic setup. */
	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
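
/* Usage sketch (illustrative, not part of this file): a MASQUERADE
 * implementation is expected to call this helper from its
 * NF_INET_POST_ROUTING hook, roughly:
 *
 *	struct nf_nat_range2 range;
 *
 *	memset(&range, 0, sizeof(range));
 *	range.flags = 0;	// plus any port-range flags the user set
 *	verdict = nf_nat_masquerade_ipv4(skb, NF_INET_POST_ROUTING,
 *					 &range, out);
 *
 * "skb", "out" and "verdict" stand in for whatever the caller's hook
 * provides; both the iptables MASQUERADE target and nft_masq follow this
 * pattern.
 */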
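/* Work handler: run the deferred conntrack walk described by the
 * masq_dev_work item, then release the references taken when it was queued.
 */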
static void iterate_cleanup_work(struct work_struct *work)
{
	struct nf_ct_iter_data iter_data = {};
	struct masq_dev_work *w;

	w = container_of(work, struct masq_dev_work, work);

	iter_data.net = w->net;
	iter_data.data = (void *)w;
	nf_ct_iterate_cleanup_net(w->iter, &iter_data);

	put_net_track(w->net, &w->ns_tracker);
	kfree(w);
	atomic_dec(&masq_worker_count);
	module_put(THIS_MODULE);
}

/* Iterate the conntrack table in the background and remove conntrack entries
 * that use the device/address being removed.
 *
 * If too many work items are already queued or memory allocation fails, the
 * iteration is skipped; affected conntrack entries will time out eventually.
 */
static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr,
				 int ifindex,
				 int (*iter)(struct nf_conn *i, void *data),
				 gfp_t gfp_flags)
{
	struct masq_dev_work *w;

	if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT)
		return;

	net = maybe_get_net(net);
	if (!net)
		return;

	if (!try_module_get(THIS_MODULE))
		goto err_module;

	w = kzalloc(sizeof(*w), gfp_flags);
	if (w) {
		/* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */
		atomic_inc(&masq_worker_count);

		INIT_WORK(&w->work, iterate_cleanup_work);
		w->ifindex = ifindex;
		w->net = net;
		netns_tracker_alloc(net, &w->ns_tracker, gfp_flags);
		w->iter = iter;
		if (addr)
			w->addr = *addr;
		schedule_work(&w->work);
		return;
	}

	module_put(THIS_MODULE);
err_module:
	put_net(net);
}

static int device_cmp(struct nf_conn *i, void *arg)
{
	const struct nf_conn_nat *nat = nfct_nat(i);
	const struct masq_dev_work *w = arg;

	if (!nat)
		return 0;
	return nat->masq_index == w->ifindex;
}

static int masq_device_event(struct notifier_block *this,
			     unsigned long event,
			     void *ptr)
{
	const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (event == NETDEV_DOWN) {
		/* Device was downed. Search entire table for
		 * conntracks which were associated with that device,
		 * and forget them.
		 */

		nf_nat_masq_schedule(net, NULL, dev->ifindex,
				     device_cmp, GFP_KERNEL);
	}

	return NOTIFY_DONE;
}

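/* Match conntrack entries whose reply-direction destination equals the
 * address being removed: masquerading rewrote the original source address,
 * so replies are addressed to the address we are flushing.
 */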
static int inet_cmp(struct nf_conn *ct, void *ptr)
{
	struct nf_conntrack_tuple *tuple;
	struct masq_dev_work *w = ptr;

	if (!device_cmp(ct, ptr))
		return 0;

	tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;

	return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3);
}

static int masq_inet_event(struct notifier_block *this,
			   unsigned long event,
			   void *ptr)
{
	const struct in_ifaddr *ifa = ptr;
	const struct in_device *idev;
	const struct net_device *dev;
	union nf_inet_addr addr;

	if (event != NETDEV_DOWN)
		return NOTIFY_DONE;

	/* The masq_dev_notifier will catch the case of the device going
	 * down. So if the inetdev is dead and being destroyed we have
	 * no work to do. Otherwise this is an individual address removal
	 * and we have to perform the flush.
	 */
	idev = ifa->ifa_dev;
	if (idev->dead)
		return NOTIFY_DONE;

	memset(&addr, 0, sizeof(addr));

	addr.ip = ifa->ifa_address;

	dev = idev->dev;
	nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex,
			     inet_cmp, GFP_KERNEL);

	return NOTIFY_DONE;
}

static struct notifier_block masq_dev_notifier = {
	.notifier_call = masq_device_event,
};

static struct notifier_block masq_inet_notifier = {
	.notifier_call = masq_inet_event,
};

#if IS_ENABLED(CONFIG_IPV6)
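/* With CONFIG_IPV6=m this (possibly built-in) code cannot call
 * ipv6_dev_get_saddr() directly, so the call goes through the nf_ipv6_ops
 * indirection; with IPv6 built in, the direct call is used instead.
 */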
static int
nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev,
		       const struct in6_addr *daddr, unsigned int srcprefs,
		       struct in6_addr *saddr)
{
#ifdef CONFIG_IPV6_MODULE
	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();

	if (!v6_ops)
		return -EHOSTUNREACH;

	return v6_ops->dev_get_saddr(net, dev, daddr, srcprefs, saddr);
#else
	return ipv6_dev_get_saddr(net, dev, daddr, srcprefs, saddr);
#endif
}

unsigned int
nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
		       const struct net_device *out)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn_nat *nat;
	struct in6_addr src;
	struct nf_conn *ct;
	struct nf_nat_range2 newrange;

	ct = nf_ct_get(skb, &ctinfo);
	WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
			 ctinfo == IP_CT_RELATED_REPLY)));

	if (nat_ipv6_dev_get_saddr(nf_ct_net(ct), out,
				   &ipv6_hdr(skb)->daddr, 0, &src) < 0)
		return NF_DROP;

	nat = nf_ct_nat_ext_add(ct);
	if (nat)
		nat->masq_index = out->ifindex;

	newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
	newrange.min_addr.in6 = src;
	newrange.max_addr.in6 = src;
	newrange.min_proto = range->min_proto;
	newrange.max_proto = range->max_proto;

	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6);
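
/* Usage sketch (illustrative): same pattern as the IPv4 helper, without
 * the explicit hook number:
 *
 *	verdict = nf_nat_masquerade_ipv6(skb, &range, out);
 */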

/* Atomic notifier; we can't call nf_ct_iterate_cleanup_net() here as it
 * can sleep.
 *
 * Defer it to the system workqueue.
 *
 * As we can have 'a lot' of inet_events (depending on the number of IPv6
 * addresses being deleted), we also need to limit the work item queue.
 */
static int masq_inet6_event(struct notifier_block *this,
			    unsigned long event, void *ptr)
{
	struct inet6_ifaddr *ifa = ptr;
	const struct net_device *dev;
	union nf_inet_addr addr;

	if (event != NETDEV_DOWN)
		return NOTIFY_DONE;

	dev = ifa->idev->dev;

	memset(&addr, 0, sizeof(addr));

	addr.in6 = ifa->addr;

	nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp,
			     GFP_ATOMIC);
	return NOTIFY_DONE;
}

static struct notifier_block masq_inet6_notifier = {
	.notifier_call = masq_inet6_event,
};

static int nf_nat_masquerade_ipv6_register_notifier(void)
{
	return register_inet6addr_notifier(&masq_inet6_notifier);
}
#else
static inline int nf_nat_masquerade_ipv6_register_notifier(void) { return 0; }
#endif

int nf_nat_masquerade_inet_register_notifiers(void)
{
	int ret = 0;

	mutex_lock(&masq_mutex);
	if (WARN_ON_ONCE(masq_refcnt == UINT_MAX)) {
		ret = -EOVERFLOW;
		goto out_unlock;
	}

	/* check if the notifier was already set */
	if (++masq_refcnt > 1)
		goto out_unlock;

	/* Register for device down reports */
	ret = register_netdevice_notifier(&masq_dev_notifier);
	if (ret)
		goto err_dec;
	/* Register IP address change reports */
	ret = register_inetaddr_notifier(&masq_inet_notifier);
	if (ret)
		goto err_unregister;

	ret = nf_nat_masquerade_ipv6_register_notifier();
	if (ret)
		goto err_unreg_inet;

	mutex_unlock(&masq_mutex);
	return ret;
err_unreg_inet:
	unregister_inetaddr_notifier(&masq_inet_notifier);
err_unregister:
	unregister_netdevice_notifier(&masq_dev_notifier);
err_dec:
	masq_refcnt--;
out_unlock:
	mutex_unlock(&masq_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_register_notifiers);
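
/* Usage sketch (illustrative): users of the masquerade helpers are expected
 * to pair these calls at module load/unload time, e.g.:
 *
 *	static int __init masq_user_init(void)
 *	{
 *		return nf_nat_masquerade_inet_register_notifiers();
 *	}
 *
 *	static void __exit masq_user_exit(void)
 *	{
 *		nf_nat_masquerade_inet_unregister_notifiers();
 *	}
 *
 * Multiple users share a single set of notifiers via masq_refcnt.
 */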

void nf_nat_masquerade_inet_unregister_notifiers(void)
{
	mutex_lock(&masq_mutex);
	/* check if the notifiers still have clients */
	if (--masq_refcnt > 0)
		goto out_unlock;

	unregister_netdevice_notifier(&masq_dev_notifier);
	unregister_inetaddr_notifier(&masq_inet_notifier);
#if IS_ENABLED(CONFIG_IPV6)
	unregister_inet6addr_notifier(&masq_inet6_notifier);
#endif
out_unlock:
	mutex_unlock(&masq_mutex);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_unregister_notifiers);