1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * (C) 1999-2001 Paul `Rusty' Russell |
4 | * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> |
5 | * (C) 2011 Patrick McHardy <kaber@trash.net> |
6 | */ |
7 | |
8 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
9 | |
10 | #include <linux/module.h> |
11 | #include <linux/types.h> |
12 | #include <linux/timer.h> |
13 | #include <linux/skbuff.h> |
14 | #include <linux/gfp.h> |
15 | #include <net/xfrm.h> |
16 | #include <linux/siphash.h> |
17 | #include <linux/rtnetlink.h> |
18 | |
19 | #include <net/netfilter/nf_conntrack_bpf.h> |
20 | #include <net/netfilter/nf_conntrack_core.h> |
21 | #include <net/netfilter/nf_conntrack_helper.h> |
22 | #include <net/netfilter/nf_conntrack_seqadj.h> |
23 | #include <net/netfilter/nf_conntrack_zones.h> |
24 | #include <net/netfilter/nf_nat.h> |
25 | #include <net/netfilter/nf_nat_helper.h> |
26 | #include <uapi/linux/netfilter/nf_nat.h> |
27 | |
28 | #include "nf_internals.h" |
29 | |
/* Upper bound on per-packet tuple-search iterations (we run in softirq). */
#define NF_NAT_MAX_ATTEMPTS 128
/* With fewer attempts left than this, nf_nat_used_tuple_harder() may evict. */
#define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4)

/* Protect writers of the bysource hash chains; chosen by srchash % CONNTRACK_LOCKS. */
static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];

/* NOTE(review): used outside this chunk; presumably serializes NAT
 * hook/protocol (un)registration — confirm against the rest of the file.
 */
static DEFINE_MUTEX(nf_nat_proto_mutex);
/* Per-netns id used to look up struct nat_net (registered elsewhere). */
static unsigned int nat_net_id __read_mostly;

/* Extra hash: conntracks chained by their original source tuple (hash_by_src). */
static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
/* siphash key for hash_by_src(); lazily initialized via get_random_once(). */
static siphash_aligned_key_t nf_nat_hash_rnd;

/* Hook private data: RCU-managed list of NAT lookup hook entries. */
struct nf_nat_lookup_hook_priv {
	struct nf_hook_entries __rcu *entries;

	struct rcu_head rcu_head;
};

/* Refcounted NAT hook ops registration for one protocol family. */
struct nf_nat_hooks_net {
	struct nf_hook_ops *nat_hook_ops;
	unsigned int users;
};

/* Per-netns NAT state, one slot per NFPROTO_* family. */
struct nat_net {
	struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
};
56 | |
57 | #ifdef CONFIG_XFRM |
58 | static void nf_nat_ipv4_decode_session(struct sk_buff *skb, |
59 | const struct nf_conn *ct, |
60 | enum ip_conntrack_dir dir, |
61 | unsigned long statusbit, |
62 | struct flowi *fl) |
63 | { |
64 | const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; |
65 | struct flowi4 *fl4 = &fl->u.ip4; |
66 | |
67 | if (ct->status & statusbit) { |
68 | fl4->daddr = t->dst.u3.ip; |
69 | if (t->dst.protonum == IPPROTO_TCP || |
70 | t->dst.protonum == IPPROTO_UDP || |
71 | t->dst.protonum == IPPROTO_UDPLITE || |
72 | t->dst.protonum == IPPROTO_DCCP || |
73 | t->dst.protonum == IPPROTO_SCTP) |
74 | fl4->fl4_dport = t->dst.u.all; |
75 | } |
76 | |
77 | statusbit ^= IPS_NAT_MASK; |
78 | |
79 | if (ct->status & statusbit) { |
80 | fl4->saddr = t->src.u3.ip; |
81 | if (t->dst.protonum == IPPROTO_TCP || |
82 | t->dst.protonum == IPPROTO_UDP || |
83 | t->dst.protonum == IPPROTO_UDPLITE || |
84 | t->dst.protonum == IPPROTO_DCCP || |
85 | t->dst.protonum == IPPROTO_SCTP) |
86 | fl4->fl4_sport = t->src.u.all; |
87 | } |
88 | } |
89 | |
90 | static void nf_nat_ipv6_decode_session(struct sk_buff *skb, |
91 | const struct nf_conn *ct, |
92 | enum ip_conntrack_dir dir, |
93 | unsigned long statusbit, |
94 | struct flowi *fl) |
95 | { |
96 | #if IS_ENABLED(CONFIG_IPV6) |
97 | const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; |
98 | struct flowi6 *fl6 = &fl->u.ip6; |
99 | |
100 | if (ct->status & statusbit) { |
101 | fl6->daddr = t->dst.u3.in6; |
102 | if (t->dst.protonum == IPPROTO_TCP || |
103 | t->dst.protonum == IPPROTO_UDP || |
104 | t->dst.protonum == IPPROTO_UDPLITE || |
105 | t->dst.protonum == IPPROTO_DCCP || |
106 | t->dst.protonum == IPPROTO_SCTP) |
107 | fl6->fl6_dport = t->dst.u.all; |
108 | } |
109 | |
110 | statusbit ^= IPS_NAT_MASK; |
111 | |
112 | if (ct->status & statusbit) { |
113 | fl6->saddr = t->src.u3.in6; |
114 | if (t->dst.protonum == IPPROTO_TCP || |
115 | t->dst.protonum == IPPROTO_UDP || |
116 | t->dst.protonum == IPPROTO_UDPLITE || |
117 | t->dst.protonum == IPPROTO_DCCP || |
118 | t->dst.protonum == IPPROTO_SCTP) |
119 | fl6->fl6_sport = t->src.u.all; |
120 | } |
121 | #endif |
122 | } |
123 | |
124 | static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) |
125 | { |
126 | const struct nf_conn *ct; |
127 | enum ip_conntrack_info ctinfo; |
128 | enum ip_conntrack_dir dir; |
129 | unsigned long statusbit; |
130 | u8 family; |
131 | |
132 | ct = nf_ct_get(skb, ctinfo: &ctinfo); |
133 | if (ct == NULL) |
134 | return; |
135 | |
136 | family = nf_ct_l3num(ct); |
137 | dir = CTINFO2DIR(ctinfo); |
138 | if (dir == IP_CT_DIR_ORIGINAL) |
139 | statusbit = IPS_DST_NAT; |
140 | else |
141 | statusbit = IPS_SRC_NAT; |
142 | |
143 | switch (family) { |
144 | case NFPROTO_IPV4: |
145 | nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl); |
146 | return; |
147 | case NFPROTO_IPV6: |
148 | nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl); |
149 | return; |
150 | } |
151 | } |
152 | #endif /* CONFIG_XFRM */ |
153 | |
/* We keep an extra hash for each conntrack, for fast searching. */
static unsigned int
hash_by_src(const struct net *net,
	    const struct nf_conntrack_zone *zone,
	    const struct nf_conntrack_tuple *tuple)
{
	unsigned int hash;
	/* Keyed exactly like this — the field order and the zeroed padding
	 * (memset below) are part of the hash input, so they must not change
	 * without also accepting a different hash distribution.
	 */
	struct {
		struct nf_conntrack_man src;
		u32 net_mix;
		u32 protonum;
		u32 zone;
	} __aligned(SIPHASH_ALIGNMENT) combined;

	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));

	/* Zero padding/holes so siphash input is fully deterministic. */
	memset(&combined, 0, sizeof(combined));

	/* Original src, to ensure we map it consistently if poss. */
	combined.src = tuple->src;
	combined.net_mix = net_hash_mix(net);
	combined.protonum = tuple->dst.protonum;

	/* Zone ID can be used provided it's valid for both directions */
	if (zone->dir == NF_CT_DEFAULT_ZONE_DIR)
		combined.zone = zone->id;

	hash = siphash(data: &combined, len: sizeof(combined), key: &nf_nat_hash_rnd);

	/* Map the 32-bit hash onto [0, nf_nat_htable_size) without division. */
	return reciprocal_scale(val: hash, ep_ro: nf_nat_htable_size);
}
185 | |
186 | /* Is this tuple already taken? (not by us) */ |
187 | static int |
188 | nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, |
189 | const struct nf_conn *ignored_conntrack) |
190 | { |
191 | /* Conntrack tracking doesn't keep track of outgoing tuples; only |
192 | * incoming ones. NAT means they don't have a fixed mapping, |
193 | * so we invert the tuple and look for the incoming reply. |
194 | * |
195 | * We could keep a separate hash if this proves too slow. |
196 | */ |
197 | struct nf_conntrack_tuple reply; |
198 | |
199 | nf_ct_invert_tuple(inverse: &reply, orig: tuple); |
200 | return nf_conntrack_tuple_taken(tuple: &reply, ignored_conntrack); |
201 | } |
202 | |
203 | static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) |
204 | { |
205 | static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT | |
206 | IPS_DYING; |
207 | static const unsigned long flags_needed = IPS_SRC_NAT; |
208 | enum tcp_conntrack old_state; |
209 | |
210 | old_state = READ_ONCE(ct->proto.tcp.state); |
211 | if (old_state < TCP_CONNTRACK_TIME_WAIT) |
212 | return false; |
213 | |
214 | if (flags & flags_refuse) |
215 | return false; |
216 | |
217 | return (flags & flags_needed) == flags_needed; |
218 | } |
219 | |
220 | /* reverse direction will send packets to new source, so |
221 | * make sure such packets are invalid. |
222 | */ |
223 | static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new) |
224 | { |
225 | return (__s32)(new->proto.tcp.seen[0].td_end - |
226 | old->proto.tcp.seen[0].td_end) > 0; |
227 | } |
228 | |
/* Like nf_nat_used_tuple(), but for the last few attempts of a fresh TCP
 * SYN it may destructively evict a colliding stale entry. Returns nonzero
 * if the tuple is (still) taken.
 */
static int
nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack,
			 unsigned int attempts_left)
{
	static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD;
	struct nf_conntrack_tuple_hash *thash;
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple reply;
	unsigned long flags;
	struct nf_conn *ct;
	bool taken = true;
	struct net *net;

	nf_ct_invert_tuple(inverse: &reply, orig: tuple);

	/* Fast path: plain availability check unless we are down to the
	 * last NF_NAT_HARDER_THRESH attempts for a new outgoing TCP flow.
	 */
	if (attempts_left > NF_NAT_HARDER_THRESH ||
	    tuple->dst.protonum != IPPROTO_TCP ||
	    ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT)
		return nf_conntrack_tuple_taken(tuple: &reply, ignored_conntrack);

	/* Last few attempts to find a free tcp port. Destructive
	 * action: evict colliding if its in timewait state and the
	 * tcp sequence number has advanced past the one used by the
	 * old entry.
	 */
	net = nf_ct_net(ct: ignored_conntrack);
	zone = nf_ct_zone(ct: ignored_conntrack);

	/* Takes a reference on the found entry; released via nf_ct_put(). */
	thash = nf_conntrack_find_get(net, zone, tuple: &reply);
	if (!thash)
		return false;

	ct = nf_ct_tuplehash_to_ctrack(hash: thash);

	/* Matched an entry's original direction — leave it alone
	 * (taken stays true).
	 */
	if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL)
		goto out;

	if (WARN_ON_ONCE(ct == ignored_conntrack))
		goto out;

	/* Snapshot status once; nf_nat_may_kill() and the offload check
	 * below must agree on the same view.
	 */
	flags = READ_ONCE(ct->status);
	if (!nf_nat_may_kill(ct, flags))
		goto out;

	if (!nf_seq_has_advanced(old: ct, new: ignored_conntrack))
		goto out;

	/* Even if we can evict do not reuse if entry is offloaded. */
	if (nf_ct_kill(ct))
		taken = flags & flags_offload;
out:
	nf_ct_put(ct);
	return taken;
}
284 | |
285 | static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, |
286 | const struct nf_nat_range2 *range) |
287 | { |
288 | if (t->src.l3num == NFPROTO_IPV4) |
289 | return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && |
290 | ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); |
291 | |
292 | return ipv6_addr_cmp(a1: &t->src.u3.in6, a2: &range->min_addr.in6) >= 0 && |
293 | ipv6_addr_cmp(a1: &t->src.u3.in6, a2: &range->max_addr.in6) <= 0; |
294 | } |
295 | |
296 | /* Is the manipable part of the tuple between min and max incl? */ |
297 | static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, |
298 | enum nf_nat_manip_type maniptype, |
299 | const union nf_conntrack_man_proto *min, |
300 | const union nf_conntrack_man_proto *max) |
301 | { |
302 | __be16 port; |
303 | |
304 | switch (tuple->dst.protonum) { |
305 | case IPPROTO_ICMP: |
306 | case IPPROTO_ICMPV6: |
307 | return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && |
308 | ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); |
309 | case IPPROTO_GRE: /* all fall though */ |
310 | case IPPROTO_TCP: |
311 | case IPPROTO_UDP: |
312 | case IPPROTO_UDPLITE: |
313 | case IPPROTO_DCCP: |
314 | case IPPROTO_SCTP: |
315 | if (maniptype == NF_NAT_MANIP_SRC) |
316 | port = tuple->src.u.all; |
317 | else |
318 | port = tuple->dst.u.all; |
319 | |
320 | return ntohs(port) >= ntohs(min->all) && |
321 | ntohs(port) <= ntohs(max->all); |
322 | default: |
323 | return true; |
324 | } |
325 | } |
326 | |
327 | /* If we source map this tuple so reply looks like reply_tuple, will |
328 | * that meet the constraints of range. |
329 | */ |
330 | static int nf_in_range(const struct nf_conntrack_tuple *tuple, |
331 | const struct nf_nat_range2 *range) |
332 | { |
333 | /* If we are supposed to map IPs, then we must be in the |
334 | * range specified, otherwise let this drag us onto a new src IP. |
335 | */ |
336 | if (range->flags & NF_NAT_RANGE_MAP_IPS && |
337 | !nf_nat_inet_in_range(t: tuple, range)) |
338 | return 0; |
339 | |
340 | if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) |
341 | return 1; |
342 | |
343 | return l4proto_in_range(tuple, maniptype: NF_NAT_MANIP_SRC, |
344 | min: &range->min_proto, max: &range->max_proto); |
345 | } |
346 | |
347 | static inline int |
348 | same_src(const struct nf_conn *ct, |
349 | const struct nf_conntrack_tuple *tuple) |
350 | { |
351 | const struct nf_conntrack_tuple *t; |
352 | |
353 | t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; |
354 | return (t->dst.protonum == tuple->dst.protonum && |
355 | nf_inet_addr_cmp(a1: &t->src.u3, a2: &tuple->src.u3) && |
356 | t->src.u.all == tuple->src.u.all); |
357 | } |
358 | |
/* Only called for SRC manip */
/* Look for an existing SNAT mapping of the same src/proto and, if its
 * induced tuple satisfies @range, reuse it (written to @result).
 * Returns 1 on success, 0 if no suitable mapping exists.
 */
static int
find_appropriate_src(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *tuple,
		     struct nf_conntrack_tuple *result,
		     const struct nf_nat_range2 *range)
{
	unsigned int h = hash_by_src(net, zone, tuple);
	const struct nf_conn *ct;

	/* RCU walk of the bysource chain; callers must hold the RCU
	 * read lock (NOTE(review): not visible here — confirm at call sites).
	 */
	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
		if (same_src(ct, tuple) &&
		    net_eq(net1: net, net2: nf_ct_net(ct)) &&
		    nf_ct_zone_equal(a: ct, b: zone, dir: IP_CT_DIR_ORIGINAL)) {
			/* Copy source part from reply tuple. */
			nf_ct_invert_tuple(inverse: result,
					   orig: &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
			result->dst = tuple->dst;

			/* Only reuse the mapping if it meets the caller's
			 * range constraints.
			 */
			if (nf_in_range(tuple: result, range))
				return 1;
		}
	}
	return 0;
}
385 | |
/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
		    struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range2 *range,
		    const struct nf_conn *ct,
		    enum nf_nat_manip_type maniptype)
{
	union nf_inet_addr *var_ipp;
	unsigned int i, max;
	/* Host order */
	u32 minip, maxip, j, dist;
	bool full_range;

	/* No IP mapping? Do nothing. */
	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
		return;

	/* Pick the tuple side that this manip type rewrites. */
	if (maniptype == NF_NAT_MANIP_SRC)
		var_ipp = &tuple->src.u3;
	else
		var_ipp = &tuple->dst.u3;

	/* Fast path: only one choice. */
	if (nf_inet_addr_cmp(a1: &range->min_addr, a2: &range->max_addr)) {
		*var_ipp = range->min_addr;
		return;
	}

	/* max = index of the last 32-bit word: 0 for IPv4, 3 for IPv6. */
	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
		max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
	else
		max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

	/* Hashing source and destination IPs gives a fairly even
	 * spread in practice (if there are a small number of IPs
	 * involved, there usually aren't that many connections
	 * anyway). The consistency means that servers see the same
	 * client coming from the same IP (some Internet Banking sites
	 * like this), even across reboots.
	 */
	j = jhash2(k: (u32 *)&tuple->src.u3, length: sizeof(tuple->src.u3) / sizeof(u32),
		   initval: range->flags & NF_NAT_RANGE_PERSISTENT ?
			0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);

	/* Scale the hash into the address range one 32-bit word at a
	 * time, most significant word first.
	 */
	full_range = false;
	for (i = 0; i <= max; i++) {
		/* If first bytes of the address are at the maximum, use the
		 * distance. Otherwise use the full range.
		 */
		if (!full_range) {
			minip = ntohl((__force __be32)range->min_addr.all[i]);
			maxip = ntohl((__force __be32)range->max_addr.all[i]);
			dist  = maxip - minip + 1;
		} else {
			minip = 0;
			dist  = ~0;
		}

		var_ipp->all[i] = (__force __u32)
			htonl(minip + reciprocal_scale(j, dist));
		/* Once a word lands below the range maximum, all less
		 * significant words may use their full 2^32 span.
		 */
		if (var_ipp->all[i] != range->max_addr.all[i])
			full_range = true;

		if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
			j ^= (__force u32)tuple->dst.u3.all[i];
	}
}
459 | |
/* Alter the per-proto part of the tuple (depending on maniptype), to
 * give a unique tuple in the given range if possible.
 *
 * Per-protocol part of tuple is initialized to the incoming packet.
 */
static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
					const struct nf_nat_range2 *range,
					enum nf_nat_manip_type maniptype,
					const struct nf_conn *ct)
{
	unsigned int range_size, min, max, i, attempts;
	__be16 *keyptr;
	u16 off;

	/* First determine which 16-bit field to rewrite (keyptr) and the
	 * candidate value range [min, min + range_size).
	 */
	switch (tuple->dst.protonum) {
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		/* id is same for either direction... */
		keyptr = &tuple->src.u.icmp.id;
		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 0;
			range_size = 65536;
		} else {
			min = ntohs(range->min_proto.icmp.id);
			range_size = ntohs(range->max_proto.icmp.id) -
				     ntohs(range->min_proto.icmp.id) + 1;
		}
		goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
	case IPPROTO_GRE:
		/* If there is no master conntrack we are not PPTP,
		   do not change tuples */
		if (!ct->master)
			return;

		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.gre.key;
		else
			keyptr = &tuple->dst.u.gre.key;

		if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
			min = 1;
			range_size = 65535;
		} else {
			min = ntohs(range->min_proto.gre.key);
			range_size = ntohs(range->max_proto.gre.key) - min + 1;
		}
		goto find_free_id;
#endif
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
	case IPPROTO_TCP:
	case IPPROTO_SCTP:
	case IPPROTO_DCCP:
		if (maniptype == NF_NAT_MANIP_SRC)
			keyptr = &tuple->src.u.all;
		else
			keyptr = &tuple->dst.u.all;

		break;
	default:
		return;
	}

	/* If no range specified... */
	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
		/* If it's dst rewrite, can't change port */
		if (maniptype == NF_NAT_MANIP_DST)
			return;

		/* Keep the rewritten source port in the same privilege
		 * class as the original one.
		 */
		if (ntohs(*keyptr) < 1024) {
			/* Loose convention: >> 512 is credential passing */
			if (ntohs(*keyptr) < 512) {
				min = 1;
				range_size = 511 - min + 1;
			} else {
				min = 600;
				range_size = 1023 - min + 1;
			}
		} else {
			min = 1024;
			range_size = 65535 - 1024 + 1;
		}
	} else {
		min = ntohs(range->min_proto.all);
		max = ntohs(range->max_proto.all);
		/* Tolerate a caller that swapped min/max. */
		if (unlikely(max < min))
			swap(max, min);
		range_size = max - min + 1;
	}

find_free_id:
	/* Choose the starting offset into the range: fixed mapping for
	 * PROTO_OFFSET, random for SNAT (or explicit random flags),
	 * 0 otherwise.
	 */
	if (range->flags & NF_NAT_RANGE_PROTO_OFFSET)
		off = (ntohs(*keyptr) - ntohs(range->base_proto.all));
	else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) ||
		 maniptype != NF_NAT_MANIP_DST)
		off = get_random_u16();
	else
		off = 0;

	attempts = range_size;
	if (attempts > NF_NAT_MAX_ATTEMPTS)
		attempts = NF_NAT_MAX_ATTEMPTS;

	/* We are in softirq; doing a search of the entire range risks
	 * soft lockup when all tuples are already used.
	 *
	 * If we can't find any free port from first offset, pick a new
	 * one and try again, with ever smaller search window.
	 */
another_round:
	for (i = 0; i < attempts; i++, off++) {
		*keyptr = htons(min + off % range_size);
		if (!nf_nat_used_tuple_harder(tuple, ignored_conntrack: ct, attempts_left: attempts - i))
			return;
	}

	/* Give up once the window is exhausted or would shrink below 16.
	 * On failure *keyptr keeps the last candidate; the duplicate is
	 * caught later at conntrack confirmation time.
	 */
	if (attempts >= range_size || attempts < 16)
		return;
	attempts /= 2;
	off = get_random_u16();
	goto another_round;
}
583 | |
/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __nf_conntrack_confirm and drop the packet. */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
		 const struct nf_conntrack_tuple *orig_tuple,
		 const struct nf_nat_range2 *range,
		 struct nf_conn *ct,
		 enum nf_nat_manip_type maniptype)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);

	zone = nf_ct_zone(ct);

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	 * and that same mapping gives a unique tuple within the given
	 * range, use that.
	 *
	 * This is only required for source (ie. NAT/masq) mappings.
	 * So far, we don't do local source mappings, so multiple
	 * manips not an issue.
	 */
	if (maniptype == NF_NAT_MANIP_SRC &&
	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		/* try the original tuple first */
		if (nf_in_range(tuple: orig_tuple, range)) {
			if (!nf_nat_used_tuple(tuple: orig_tuple, ignored_conntrack: ct)) {
				*tuple = *orig_tuple;
				return;
			}
		} else if (find_appropriate_src(net, zone,
						tuple: orig_tuple, result: tuple, range)) {
			pr_debug("get_unique_tuple: Found current src map\n" );
			if (!nf_nat_used_tuple(tuple, ignored_conntrack: ct))
				return;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given range */
	*tuple = *orig_tuple;
	find_best_ips_proto(zone, tuple, range, ct, maniptype);

	/* 3) The per-protocol part of the manip is made to map into
	 * the range to make a unique tuple.
	 */

	/* Only bother mapping if it's not already in range and unique */
	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
			/* A single-value range (min == max) is accepted even
			 * if taken — there is nothing else to try anyway.
			 */
			if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
			    l4proto_in_range(tuple, maniptype,
					     min: &range->min_proto,
					     max: &range->max_proto) &&
			    (range->min_proto.all == range->max_proto.all ||
			     !nf_nat_used_tuple(tuple, ignored_conntrack: ct)))
				return;
		} else if (!nf_nat_used_tuple(tuple, ignored_conntrack: ct)) {
			return;
		}
	}

	/* Last chance: get protocol to try to obtain unique tuple. */
	nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}
652 | |
653 | struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) |
654 | { |
655 | struct nf_conn_nat *nat = nfct_nat(ct); |
656 | if (nat) |
657 | return nat; |
658 | |
659 | if (!nf_ct_is_confirmed(ct)) |
660 | nat = nf_ct_ext_add(ct, id: NF_CT_EXT_NAT, GFP_ATOMIC); |
661 | |
662 | return nat; |
663 | } |
664 | EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); |
665 | |
/* Install a NAT binding of type @maniptype on @ct constrained by @range.
 * Returns NF_ACCEPT on success (or if @ct is already confirmed),
 * NF_DROP on failure.
 */
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
		  const struct nf_nat_range2 *range,
		  enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_tuple curr_tuple, new_tuple;

	/* Can't setup nat info for confirmed ct. */
	if (nf_ct_is_confirmed(ct))
		return NF_ACCEPT;

	WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
		maniptype != NF_NAT_MANIP_DST);

	/* Setting up the same manip type twice is a caller bug. */
	if (WARN_ON(nf_nat_initialized(ct, maniptype)))
		return NF_DROP;

	/* What we've got will look like inverse of reply. Normally
	 * this is what is in the conntrack, except for prior
	 * manipulations (future optimization: if num_manips == 0,
	 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
	 */
	nf_ct_invert_tuple(inverse: &curr_tuple,
			   orig: &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	get_unique_tuple(tuple: &new_tuple, orig_tuple: &curr_tuple, range, ct, maniptype);

	if (!nf_ct_tuple_equal(t1: &new_tuple, t2: &curr_tuple)) {
		struct nf_conntrack_tuple reply;

		/* Alter conntrack table so will recognize replies. */
		nf_ct_invert_tuple(inverse: &reply, orig: &new_tuple);
		nf_conntrack_alter_reply(ct, newreply: &reply);

		/* Non-atomic: we own this at the moment. */
		if (maniptype == NF_NAT_MANIP_SRC)
			ct->status |= IPS_SRC_NAT;
		else
			ct->status |= IPS_DST_NAT;

		/* Helpers may mangle payload sizes; make sure the seqadj
		 * extension exists so TCP sequence numbers can be fixed up.
		 */
		if (nfct_help(ct) && !nfct_seqadj(ct))
			if (!nfct_seqadj_ext_add(ct))
				return NF_DROP;
	}

	if (maniptype == NF_NAT_MANIP_SRC) {
		unsigned int srchash;
		spinlock_t *lock;

		/* Insert into the bysource hash so find_appropriate_src()
		 * can reuse this mapping for later flows from the same src.
		 */
		srchash = hash_by_src(net, zone: nf_ct_zone(ct),
				      tuple: &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
		spin_lock_bh(lock);
		hlist_add_head_rcu(n: &ct->nat_bysource,
				   h: &nf_nat_bysource[srchash]);
		spin_unlock_bh(lock);
	}

	/* It's done. */
	if (maniptype == NF_NAT_MANIP_DST)
		ct->status |= IPS_DST_NAT_DONE;
	else
		ct->status |= IPS_SRC_NAT_DONE;

	return NF_ACCEPT;
}
EXPORT_SYMBOL(nf_nat_setup_info);
734 | |
735 | static unsigned int |
736 | __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) |
737 | { |
738 | /* Force range to this IP; let proto decide mapping for |
739 | * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). |
740 | * Use reply in case it's already been mangled (eg local packet). |
741 | */ |
742 | union nf_inet_addr ip = |
743 | (manip == NF_NAT_MANIP_SRC ? |
744 | ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : |
745 | ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); |
746 | struct nf_nat_range2 range = { |
747 | .flags = NF_NAT_RANGE_MAP_IPS, |
748 | .min_addr = ip, |
749 | .max_addr = ip, |
750 | }; |
751 | return nf_nat_setup_info(ct, &range, manip); |
752 | } |
753 | |
/* Public wrapper: derive the manip type (SRC/DST) from the hook number
 * and install an identity binding so replies are still recognized when
 * no NAT rule matched.
 */
unsigned int
nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
	return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
760 | |
761 | /* Do packet manipulations according to nf_nat_setup_info. */ |
762 | unsigned int nf_nat_packet(struct nf_conn *ct, |
763 | enum ip_conntrack_info ctinfo, |
764 | unsigned int hooknum, |
765 | struct sk_buff *skb) |
766 | { |
767 | enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); |
768 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); |
769 | unsigned int verdict = NF_ACCEPT; |
770 | unsigned long statusbit; |
771 | |
772 | if (mtype == NF_NAT_MANIP_SRC) |
773 | statusbit = IPS_SRC_NAT; |
774 | else |
775 | statusbit = IPS_DST_NAT; |
776 | |
777 | /* Invert if this is reply dir. */ |
778 | if (dir == IP_CT_DIR_REPLY) |
779 | statusbit ^= IPS_NAT_MASK; |
780 | |
781 | /* Non-atomic: these bits don't change. */ |
782 | if (ct->status & statusbit) |
783 | verdict = nf_nat_manip_pkt(skb, ct, mtype, dir); |
784 | |
785 | return verdict; |
786 | } |
787 | EXPORT_SYMBOL_GPL(nf_nat_packet); |
788 | |
789 | static bool in_vrf_postrouting(const struct nf_hook_state *state) |
790 | { |
791 | #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) |
792 | if (state->hook == NF_INET_POST_ROUTING && |
793 | netif_is_l3_master(dev: state->out)) |
794 | return true; |
795 | #endif |
796 | return false; |
797 | } |
798 | |
/* Shared NAT hook entry point for IPv4/IPv6. For new/related conntracks
 * it runs the registered NAT lookup hooks (falling back to a null
 * binding), then applies the resulting manipulation to the packet.
 */
unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
	       const struct nf_hook_state *state)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conn_nat *nat;
	/* maniptype == SRC for postrouting. */
	enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);

	ct = nf_ct_get(skb, ctinfo: &ctinfo);
	/* Can't track? It's not due to stress, or conntrack would
	 * have dropped it. Hence it's the user's responsibilty to
	 * packet filter it out, or implement conntrack/NAT for that
	 * protocol. 8) --RR
	 */
	if (!ct || in_vrf_postrouting(state))
		return NF_ACCEPT;

	/* May be NULL; only dereferenced by nf_nat_oif_changed() below. */
	nat = nfct_nat(ct);

	switch (ctinfo) {
	case IP_CT_RELATED:
	case IP_CT_RELATED_REPLY:
		/* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */
	case IP_CT_NEW:
		/* Seen it before? This can happen for loopback, retrans,
		 * or local packets.
		 */
		if (!nf_nat_initialized(ct, manip: maniptype)) {
			struct nf_nat_lookup_hook_priv *lpriv = priv;
			struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
			unsigned int ret;
			int i;

			if (!e)
				goto null_bind;

			/* Run NAT rule-lookup hooks until one sets up a
			 * binding or returns a non-accept verdict.
			 */
			for (i = 0; i < e->num_hook_entries; i++) {
				ret = e->hooks[i].hook(e->hooks[i].priv, skb,
						       state);
				if (ret != NF_ACCEPT)
					return ret;
				if (nf_nat_initialized(ct, manip: maniptype))
					goto do_nat;
			}
null_bind:
			/* No rule matched: identity binding so replies
			 * are still recognized.
			 */
			ret = nf_nat_alloc_null_binding(ct, state->hook);
			if (ret != NF_ACCEPT)
				return ret;
		} else {
			pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n" ,
				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST" ,
				 ct, ct->status);
			if (nf_nat_oif_changed(hooknum: state->hook, ctinfo, nat,
					       out: state->out))
				goto oif_changed;
		}
		break;
	default:
		/* ESTABLISHED */
		WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
			ctinfo != IP_CT_ESTABLISHED_REPLY);
		if (nf_nat_oif_changed(hooknum: state->hook, ctinfo, nat, out: state->out))
			goto oif_changed;
	}
do_nat:
	return nf_nat_packet(ct, ctinfo, state->hook, skb);

oif_changed:
	/* Output interface changed (e.g. masquerade after route change):
	 * the mapping is stale, kill the conntrack and drop.
	 */
	nf_ct_kill_acct(ct, ctinfo, skb);
	return NF_DROP;
}
EXPORT_SYMBOL_GPL(nf_nat_inet_fn);
873 | |
/* Filter argument for the conntrack-iteration callbacks below;
 * a zero field matches any protocol.
 */
struct nf_nat_proto_clean {
	u8 l3proto;
	u8 l4proto;
};
878 | |
879 | /* kill conntracks with affected NAT section */ |
880 | static int nf_nat_proto_remove(struct nf_conn *i, void *data) |
881 | { |
882 | const struct nf_nat_proto_clean *clean = data; |
883 | |
884 | if ((clean->l3proto && nf_ct_l3num(ct: i) != clean->l3proto) || |
885 | (clean->l4proto && nf_ct_protonum(ct: i) != clean->l4proto)) |
886 | return 0; |
887 | |
888 | return i->status & IPS_NAT_MASK ? 1 : 0; |
889 | } |
890 | |
891 | static void nf_nat_cleanup_conntrack(struct nf_conn *ct) |
892 | { |
893 | unsigned int h; |
894 | |
895 | h = hash_by_src(net: nf_ct_net(ct), zone: nf_ct_zone(ct), tuple: &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); |
896 | spin_lock_bh(lock: &nf_nat_locks[h % CONNTRACK_LOCKS]); |
897 | hlist_del_rcu(n: &ct->nat_bysource); |
898 | spin_unlock_bh(lock: &nf_nat_locks[h % CONNTRACK_LOCKS]); |
899 | } |
900 | |
/* Iterator callback used on module removal: delete NAT'd conntracks
 * matching the filter, and unhash null-bound ones from bysource.
 */
static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
{
	if (nf_nat_proto_remove(i: ct, data))
		return 1;

	/* This module is being removed and conntrack has nat null binding.
	 * Remove it from bysource hash, as the table will be freed soon.
	 *
	 * Else, when the conntrack is destroyed, nf_nat_cleanup_conntrack()
	 * will delete entry from already-freed table.
	 */
	if (test_and_clear_bit(nr: IPS_SRC_NAT_DONE_BIT, addr: &ct->status))
		nf_nat_cleanup_conntrack(ct);

	/* don't delete conntrack. Although that would make things a lot
	 * simpler, we'd end up flushing all conntracks on nat rmmod.
	 */
	return 0;
}
920 | |
921 | #if IS_ENABLED(CONFIG_NF_CT_NETLINK) |
922 | |
923 | #include <linux/netfilter/nfnetlink.h> |
924 | #include <linux/netfilter/nfnetlink_conntrack.h> |
925 | |
/* Netlink attribute policy for the nested CTA_NAT_PROTO payload:
 * 16-bit min/max transport ports for the NAT range.
 */
static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
};
930 | |
931 | static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], |
932 | struct nf_nat_range2 *range) |
933 | { |
934 | if (tb[CTA_PROTONAT_PORT_MIN]) { |
935 | range->min_proto.all = nla_get_be16(nla: tb[CTA_PROTONAT_PORT_MIN]); |
936 | range->max_proto.all = range->min_proto.all; |
937 | range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; |
938 | } |
939 | if (tb[CTA_PROTONAT_PORT_MAX]) { |
940 | range->max_proto.all = nla_get_be16(nla: tb[CTA_PROTONAT_PORT_MAX]); |
941 | range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; |
942 | } |
943 | return 0; |
944 | } |
945 | |
946 | static int nfnetlink_parse_nat_proto(struct nlattr *attr, |
947 | const struct nf_conn *ct, |
948 | struct nf_nat_range2 *range) |
949 | { |
950 | struct nlattr *tb[CTA_PROTONAT_MAX+1]; |
951 | int err; |
952 | |
953 | err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, nla: attr, |
954 | policy: protonat_nla_policy, NULL); |
955 | if (err < 0) |
956 | return err; |
957 | |
958 | return nf_nat_l4proto_nlattr_to_range(tb, range); |
959 | } |
960 | |
/* Netlink attribute policy for CTA_NAT: v4/v6 address range bounds plus
 * an optional nested protocol (port) range.
 */
static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
	[CTA_NAT_V4_MINIP]	= { .type = NLA_U32 },
	[CTA_NAT_V4_MAXIP]	= { .type = NLA_U32 },
	[CTA_NAT_V6_MINIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_V6_MAXIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_PROTO]		= { .type = NLA_NESTED },
};
968 | |
969 | static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], |
970 | struct nf_nat_range2 *range) |
971 | { |
972 | if (tb[CTA_NAT_V4_MINIP]) { |
973 | range->min_addr.ip = nla_get_be32(nla: tb[CTA_NAT_V4_MINIP]); |
974 | range->flags |= NF_NAT_RANGE_MAP_IPS; |
975 | } |
976 | |
977 | if (tb[CTA_NAT_V4_MAXIP]) |
978 | range->max_addr.ip = nla_get_be32(nla: tb[CTA_NAT_V4_MAXIP]); |
979 | else |
980 | range->max_addr.ip = range->min_addr.ip; |
981 | |
982 | return 0; |
983 | } |
984 | |
985 | static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], |
986 | struct nf_nat_range2 *range) |
987 | { |
988 | if (tb[CTA_NAT_V6_MINIP]) { |
989 | nla_memcpy(dest: &range->min_addr.ip6, src: tb[CTA_NAT_V6_MINIP], |
990 | count: sizeof(struct in6_addr)); |
991 | range->flags |= NF_NAT_RANGE_MAP_IPS; |
992 | } |
993 | |
994 | if (tb[CTA_NAT_V6_MAXIP]) |
995 | nla_memcpy(dest: &range->max_addr.ip6, src: tb[CTA_NAT_V6_MAXIP], |
996 | count: sizeof(struct in6_addr)); |
997 | else |
998 | range->max_addr = range->min_addr; |
999 | |
1000 | return 0; |
1001 | } |
1002 | |
1003 | static int |
1004 | nfnetlink_parse_nat(const struct nlattr *nat, |
1005 | const struct nf_conn *ct, struct nf_nat_range2 *range) |
1006 | { |
1007 | struct nlattr *tb[CTA_NAT_MAX+1]; |
1008 | int err; |
1009 | |
1010 | memset(range, 0, sizeof(*range)); |
1011 | |
1012 | err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nla: nat, |
1013 | policy: nat_nla_policy, NULL); |
1014 | if (err < 0) |
1015 | return err; |
1016 | |
1017 | switch (nf_ct_l3num(ct)) { |
1018 | case NFPROTO_IPV4: |
1019 | err = nf_nat_ipv4_nlattr_to_range(tb, range); |
1020 | break; |
1021 | case NFPROTO_IPV6: |
1022 | err = nf_nat_ipv6_nlattr_to_range(tb, range); |
1023 | break; |
1024 | default: |
1025 | err = -EPROTONOSUPPORT; |
1026 | break; |
1027 | } |
1028 | |
1029 | if (err) |
1030 | return err; |
1031 | |
1032 | if (!tb[CTA_NAT_PROTO]) |
1033 | return 0; |
1034 | |
1035 | return nfnetlink_parse_nat_proto(attr: tb[CTA_NAT_PROTO], ct, range); |
1036 | } |
1037 | |
1038 | /* This function is called under rcu_read_lock() */ |
1039 | static int |
1040 | nfnetlink_parse_nat_setup(struct nf_conn *ct, |
1041 | enum nf_nat_manip_type manip, |
1042 | const struct nlattr *attr) |
1043 | { |
1044 | struct nf_nat_range2 range; |
1045 | int err; |
1046 | |
1047 | /* Should not happen, restricted to creating new conntracks |
1048 | * via ctnetlink. |
1049 | */ |
1050 | if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) |
1051 | return -EEXIST; |
1052 | |
1053 | /* No NAT information has been passed, allocate the null-binding */ |
1054 | if (attr == NULL) |
1055 | return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; |
1056 | |
1057 | err = nfnetlink_parse_nat(nat: attr, ct, range: &range); |
1058 | if (err < 0) |
1059 | return err; |
1060 | |
1061 | return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; |
1062 | } |
1063 | #else |
/* Stub when CONFIG_NF_CT_NETLINK is disabled: NAT setup via ctnetlink
 * is not supported.
 */
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	return -EOPNOTSUPP;
}
1071 | #endif |
1072 | |
1073 | static struct nf_ct_helper_expectfn follow_master_nat = { |
1074 | .name = "nat-follow-master" , |
1075 | .expectfn = nf_nat_follow_master, |
1076 | }; |
1077 | |
1078 | int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, |
1079 | const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) |
1080 | { |
1081 | struct nat_net *nat_net = net_generic(net, id: nat_net_id); |
1082 | struct nf_nat_hooks_net *nat_proto_net; |
1083 | struct nf_nat_lookup_hook_priv *priv; |
1084 | unsigned int hooknum = ops->hooknum; |
1085 | struct nf_hook_ops *nat_ops; |
1086 | int i, ret; |
1087 | |
1088 | if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net))) |
1089 | return -EINVAL; |
1090 | |
1091 | nat_proto_net = &nat_net->nat_proto_net[pf]; |
1092 | |
1093 | for (i = 0; i < ops_count; i++) { |
1094 | if (orig_nat_ops[i].hooknum == hooknum) { |
1095 | hooknum = i; |
1096 | break; |
1097 | } |
1098 | } |
1099 | |
1100 | if (WARN_ON_ONCE(i == ops_count)) |
1101 | return -EINVAL; |
1102 | |
1103 | mutex_lock(&nf_nat_proto_mutex); |
1104 | if (!nat_proto_net->nat_hook_ops) { |
1105 | WARN_ON(nat_proto_net->users != 0); |
1106 | |
1107 | nat_ops = kmemdup(p: orig_nat_ops, size: sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL); |
1108 | if (!nat_ops) { |
1109 | mutex_unlock(lock: &nf_nat_proto_mutex); |
1110 | return -ENOMEM; |
1111 | } |
1112 | |
1113 | for (i = 0; i < ops_count; i++) { |
1114 | priv = kzalloc(size: sizeof(*priv), GFP_KERNEL); |
1115 | if (priv) { |
1116 | nat_ops[i].priv = priv; |
1117 | continue; |
1118 | } |
1119 | mutex_unlock(lock: &nf_nat_proto_mutex); |
1120 | while (i) |
1121 | kfree(objp: nat_ops[--i].priv); |
1122 | kfree(objp: nat_ops); |
1123 | return -ENOMEM; |
1124 | } |
1125 | |
1126 | ret = nf_register_net_hooks(net, reg: nat_ops, n: ops_count); |
1127 | if (ret < 0) { |
1128 | mutex_unlock(lock: &nf_nat_proto_mutex); |
1129 | for (i = 0; i < ops_count; i++) |
1130 | kfree(objp: nat_ops[i].priv); |
1131 | kfree(objp: nat_ops); |
1132 | return ret; |
1133 | } |
1134 | |
1135 | nat_proto_net->nat_hook_ops = nat_ops; |
1136 | } |
1137 | |
1138 | nat_ops = nat_proto_net->nat_hook_ops; |
1139 | priv = nat_ops[hooknum].priv; |
1140 | if (WARN_ON_ONCE(!priv)) { |
1141 | mutex_unlock(lock: &nf_nat_proto_mutex); |
1142 | return -EOPNOTSUPP; |
1143 | } |
1144 | |
1145 | ret = nf_hook_entries_insert_raw(pp: &priv->entries, reg: ops); |
1146 | if (ret == 0) |
1147 | nat_proto_net->users++; |
1148 | |
1149 | mutex_unlock(lock: &nf_nat_proto_mutex); |
1150 | return ret; |
1151 | } |
1152 | |
1153 | void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, |
1154 | unsigned int ops_count) |
1155 | { |
1156 | struct nat_net *nat_net = net_generic(net, id: nat_net_id); |
1157 | struct nf_nat_hooks_net *nat_proto_net; |
1158 | struct nf_nat_lookup_hook_priv *priv; |
1159 | struct nf_hook_ops *nat_ops; |
1160 | int hooknum = ops->hooknum; |
1161 | int i; |
1162 | |
1163 | if (pf >= ARRAY_SIZE(nat_net->nat_proto_net)) |
1164 | return; |
1165 | |
1166 | nat_proto_net = &nat_net->nat_proto_net[pf]; |
1167 | |
1168 | mutex_lock(&nf_nat_proto_mutex); |
1169 | if (WARN_ON(nat_proto_net->users == 0)) |
1170 | goto unlock; |
1171 | |
1172 | nat_proto_net->users--; |
1173 | |
1174 | nat_ops = nat_proto_net->nat_hook_ops; |
1175 | for (i = 0; i < ops_count; i++) { |
1176 | if (nat_ops[i].hooknum == hooknum) { |
1177 | hooknum = i; |
1178 | break; |
1179 | } |
1180 | } |
1181 | if (WARN_ON_ONCE(i == ops_count)) |
1182 | goto unlock; |
1183 | priv = nat_ops[hooknum].priv; |
1184 | nf_hook_entries_delete_raw(pp: &priv->entries, reg: ops); |
1185 | |
1186 | if (nat_proto_net->users == 0) { |
1187 | nf_unregister_net_hooks(net, reg: nat_ops, n: ops_count); |
1188 | |
1189 | for (i = 0; i < ops_count; i++) { |
1190 | priv = nat_ops[i].priv; |
1191 | kfree_rcu(priv, rcu_head); |
1192 | } |
1193 | |
1194 | nat_proto_net->nat_hook_ops = NULL; |
1195 | kfree(objp: nat_ops); |
1196 | } |
1197 | unlock: |
1198 | mutex_unlock(lock: &nf_nat_proto_mutex); |
1199 | } |
1200 | |
/* Per-netns allocation of struct nat_net (hook ops bookkeeping),
 * keyed by nat_net_id via net_generic().
 */
static struct pernet_operations nat_net_ops = {
	.id = &nat_net_id,
	.size = sizeof(struct nat_net),
};
1205 | |
/* Callback table published through the global nf_nat_hook RCU pointer
 * so conntrack core can reach NAT functionality when this module is
 * loaded.
 */
static const struct nf_nat_hook nat_hook = {
	.parse_nat_setup	= nfnetlink_parse_nat_setup,
#ifdef CONFIG_XFRM
	.decode_session		= __nf_nat_decode_session,
#endif
	.manip_pkt		= nf_nat_manip_pkt,
	.remove_nat_bysrc	= nf_nat_cleanup_conntrack,
};
1214 | |
1215 | static int __init nf_nat_init(void) |
1216 | { |
1217 | int ret, i; |
1218 | |
1219 | /* Leave them the same for the moment. */ |
1220 | nf_nat_htable_size = nf_conntrack_htable_size; |
1221 | if (nf_nat_htable_size < CONNTRACK_LOCKS) |
1222 | nf_nat_htable_size = CONNTRACK_LOCKS; |
1223 | |
1224 | nf_nat_bysource = nf_ct_alloc_hashtable(sizep: &nf_nat_htable_size, nulls: 0); |
1225 | if (!nf_nat_bysource) |
1226 | return -ENOMEM; |
1227 | |
1228 | for (i = 0; i < CONNTRACK_LOCKS; i++) |
1229 | spin_lock_init(&nf_nat_locks[i]); |
1230 | |
1231 | ret = register_pernet_subsys(&nat_net_ops); |
1232 | if (ret < 0) { |
1233 | kvfree(addr: nf_nat_bysource); |
1234 | return ret; |
1235 | } |
1236 | |
1237 | nf_ct_helper_expectfn_register(n: &follow_master_nat); |
1238 | |
1239 | WARN_ON(nf_nat_hook != NULL); |
1240 | RCU_INIT_POINTER(nf_nat_hook, &nat_hook); |
1241 | |
1242 | ret = register_nf_nat_bpf(); |
1243 | if (ret < 0) { |
1244 | RCU_INIT_POINTER(nf_nat_hook, NULL); |
1245 | nf_ct_helper_expectfn_unregister(n: &follow_master_nat); |
1246 | synchronize_net(); |
1247 | unregister_pernet_subsys(&nat_net_ops); |
1248 | kvfree(addr: nf_nat_bysource); |
1249 | } |
1250 | |
1251 | return ret; |
1252 | } |
1253 | |
1254 | static void __exit nf_nat_cleanup(void) |
1255 | { |
1256 | struct nf_nat_proto_clean clean = {}; |
1257 | |
1258 | nf_ct_iterate_destroy(iter: nf_nat_proto_clean, data: &clean); |
1259 | |
1260 | nf_ct_helper_expectfn_unregister(n: &follow_master_nat); |
1261 | RCU_INIT_POINTER(nf_nat_hook, NULL); |
1262 | |
1263 | synchronize_net(); |
1264 | kvfree(addr: nf_nat_bysource); |
1265 | unregister_pernet_subsys(&nat_net_ops); |
1266 | } |
1267 | |
1268 | MODULE_LICENSE("GPL" ); |
1269 | MODULE_DESCRIPTION("Network address translation core" ); |
1270 | |
1271 | module_init(nf_nat_init); |
1272 | module_exit(nf_nat_cleanup); |
1273 | |