1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * IPVS An implementation of the IP virtual server support for the |
4 | * LINUX operating system. IPVS is now implemented as a module |
5 | * over the Netfilter framework. IPVS can be used to build a |
6 | * high-performance and highly available server based on a |
7 | * cluster of servers. |
8 | * |
9 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> |
10 | * Peter Kese <peter.kese@ijs.si> |
11 | * Julian Anastasov <ja@ssi.bg> |
12 | * |
13 | * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, |
14 | * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms |
15 | * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2. |
16 | * |
17 | * Changes: |
18 | */ |
19 | |
20 | #define KMSG_COMPONENT "IPVS" |
21 | #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt |
22 | |
23 | #include <linux/interrupt.h> |
24 | #include <linux/in.h> |
25 | #include <linux/inet.h> |
26 | #include <linux/net.h> |
27 | #include <linux/kernel.h> |
28 | #include <linux/module.h> |
29 | #include <linux/proc_fs.h> /* for proc_net_* */ |
30 | #include <linux/slab.h> |
31 | #include <linux/seq_file.h> |
32 | #include <linux/jhash.h> |
33 | #include <linux/random.h> |
34 | #include <linux/rcupdate_wait.h> |
35 | |
36 | #include <net/net_namespace.h> |
37 | #include <net/ip_vs.h> |
38 | |
39 | |
40 | #ifndef CONFIG_IP_VS_TAB_BITS |
41 | #define CONFIG_IP_VS_TAB_BITS 12 |
42 | #endif |
43 | |
44 | /* |
45 | * Connection hash size. Default is what was selected at compile time. |
46 | */ |
47 | static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; |
48 | module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); |
49 | MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size" ); |
50 | |
51 | /* size and mask values */ |
52 | int ip_vs_conn_tab_size __read_mostly; |
53 | static int ip_vs_conn_tab_mask __read_mostly; |
54 | |
55 | /* |
56 | * Connection hash table: for input and output packets lookups of IPVS |
57 | */ |
58 | static struct hlist_head *ip_vs_conn_tab __read_mostly; |
59 | |
60 | /* SLAB cache for IPVS connections */ |
61 | static struct kmem_cache *ip_vs_conn_cachep __read_mostly; |
62 | |
63 | /* counter for no client port connections */ |
64 | static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); |
65 | |
66 | /* random value for IPVS connection hash */ |
67 | static unsigned int ip_vs_conn_rnd __read_mostly; |
68 | |
/*
 * Fine locking granularity for big connection hash table: a bucket key
 * is masked with CT_LOCKARRAY_MASK to pick one of CT_LOCKARRAY_SIZE locks.
 */
#define CT_LOCKARRAY_BITS	5
#define CT_LOCKARRAY_SIZE	(1 << CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK	(CT_LOCKARRAY_SIZE - 1)

/* Address string length that is valid both with and without IPv6 support */
#ifdef CONFIG_IP_VS_IPV6
#define IP_VS_ADDRSTRLEN	INET6_ADDRSTRLEN
#else
#define IP_VS_ADDRSTRLEN	(8 + 1)
#endif
82 | |
83 | struct ip_vs_aligned_lock |
84 | { |
85 | spinlock_t l; |
86 | } __attribute__((__aligned__(SMP_CACHE_BYTES))); |
87 | |
88 | /* lock array for conn table */ |
89 | static struct ip_vs_aligned_lock |
90 | __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; |
91 | |
92 | static inline void ct_write_lock_bh(unsigned int key) |
93 | { |
94 | spin_lock_bh(lock: &__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); |
95 | } |
96 | |
97 | static inline void ct_write_unlock_bh(unsigned int key) |
98 | { |
99 | spin_unlock_bh(lock: &__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); |
100 | } |
101 | |
102 | static void ip_vs_conn_expire(struct timer_list *t); |
103 | |
104 | /* |
105 | * Returns hash value for IPVS connection entry |
106 | */ |
107 | static unsigned int ip_vs_conn_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, |
108 | const union nf_inet_addr *addr, |
109 | __be16 port) |
110 | { |
111 | #ifdef CONFIG_IP_VS_IPV6 |
112 | if (af == AF_INET6) |
113 | return (jhash_3words(a: jhash(key: addr, length: 16, initval: ip_vs_conn_rnd), |
114 | b: (__force u32)port, c: proto, initval: ip_vs_conn_rnd) ^ |
115 | ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; |
116 | #endif |
117 | return (jhash_3words(a: (__force u32)addr->ip, b: (__force u32)port, c: proto, |
118 | initval: ip_vs_conn_rnd) ^ |
119 | ((size_t)ipvs>>8)) & ip_vs_conn_tab_mask; |
120 | } |
121 | |
122 | static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, |
123 | bool inverse) |
124 | { |
125 | const union nf_inet_addr *addr; |
126 | __be16 port; |
127 | |
128 | if (p->pe_data && p->pe->hashkey_raw) |
129 | return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & |
130 | ip_vs_conn_tab_mask; |
131 | |
132 | if (likely(!inverse)) { |
133 | addr = p->caddr; |
134 | port = p->cport; |
135 | } else { |
136 | addr = p->vaddr; |
137 | port = p->vport; |
138 | } |
139 | |
140 | return ip_vs_conn_hashkey(ipvs: p->ipvs, af: p->af, proto: p->protocol, addr, port); |
141 | } |
142 | |
143 | static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) |
144 | { |
145 | struct ip_vs_conn_param p; |
146 | |
147 | ip_vs_conn_fill_param(ipvs: cp->ipvs, af: cp->af, protocol: cp->protocol, |
148 | caddr: &cp->caddr, cport: cp->cport, NULL, vport: 0, p: &p); |
149 | |
150 | if (cp->pe) { |
151 | p.pe = cp->pe; |
152 | p.pe_data = cp->pe_data; |
153 | p.pe_data_len = cp->pe_data_len; |
154 | } |
155 | |
156 | return ip_vs_conn_hashkey_param(p: &p, inverse: false); |
157 | } |
158 | |
159 | /* |
160 | * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. |
161 | * returns bool success. |
162 | */ |
163 | static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) |
164 | { |
165 | unsigned int hash; |
166 | int ret; |
167 | |
168 | if (cp->flags & IP_VS_CONN_F_ONE_PACKET) |
169 | return 0; |
170 | |
171 | /* Hash by protocol, client address and port */ |
172 | hash = ip_vs_conn_hashkey_conn(cp); |
173 | |
174 | ct_write_lock_bh(key: hash); |
175 | spin_lock(lock: &cp->lock); |
176 | |
177 | if (!(cp->flags & IP_VS_CONN_F_HASHED)) { |
178 | cp->flags |= IP_VS_CONN_F_HASHED; |
179 | refcount_inc(r: &cp->refcnt); |
180 | hlist_add_head_rcu(n: &cp->c_list, h: &ip_vs_conn_tab[hash]); |
181 | ret = 1; |
182 | } else { |
183 | pr_err("%s(): request for already hashed, called from %pS\n" , |
184 | __func__, __builtin_return_address(0)); |
185 | ret = 0; |
186 | } |
187 | |
188 | spin_unlock(lock: &cp->lock); |
189 | ct_write_unlock_bh(key: hash); |
190 | |
191 | return ret; |
192 | } |
193 | |
194 | |
195 | /* |
196 | * UNhashes ip_vs_conn from ip_vs_conn_tab. |
197 | * returns bool success. Caller should hold conn reference. |
198 | */ |
199 | static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) |
200 | { |
201 | unsigned int hash; |
202 | int ret; |
203 | |
204 | /* unhash it and decrease its reference counter */ |
205 | hash = ip_vs_conn_hashkey_conn(cp); |
206 | |
207 | ct_write_lock_bh(key: hash); |
208 | spin_lock(lock: &cp->lock); |
209 | |
210 | if (cp->flags & IP_VS_CONN_F_HASHED) { |
211 | hlist_del_rcu(n: &cp->c_list); |
212 | cp->flags &= ~IP_VS_CONN_F_HASHED; |
213 | refcount_dec(r: &cp->refcnt); |
214 | ret = 1; |
215 | } else |
216 | ret = 0; |
217 | |
218 | spin_unlock(lock: &cp->lock); |
219 | ct_write_unlock_bh(key: hash); |
220 | |
221 | return ret; |
222 | } |
223 | |
224 | /* Try to unlink ip_vs_conn from ip_vs_conn_tab. |
225 | * returns bool success. |
226 | */ |
227 | static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) |
228 | { |
229 | unsigned int hash; |
230 | bool ret = false; |
231 | |
232 | if (cp->flags & IP_VS_CONN_F_ONE_PACKET) |
233 | return refcount_dec_if_one(r: &cp->refcnt); |
234 | |
235 | hash = ip_vs_conn_hashkey_conn(cp); |
236 | |
237 | ct_write_lock_bh(key: hash); |
238 | spin_lock(lock: &cp->lock); |
239 | |
240 | if (cp->flags & IP_VS_CONN_F_HASHED) { |
241 | /* Decrease refcnt and unlink conn only if we are last user */ |
242 | if (refcount_dec_if_one(r: &cp->refcnt)) { |
243 | hlist_del_rcu(n: &cp->c_list); |
244 | cp->flags &= ~IP_VS_CONN_F_HASHED; |
245 | ret = true; |
246 | } |
247 | } |
248 | |
249 | spin_unlock(lock: &cp->lock); |
250 | ct_write_unlock_bh(key: hash); |
251 | |
252 | return ret; |
253 | } |
254 | |
255 | |
256 | /* |
257 | * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. |
258 | * Called for pkts coming from OUTside-to-INside. |
259 | * p->caddr, p->cport: pkt source address (foreign host) |
260 | * p->vaddr, p->vport: pkt dest address (load balancer) |
261 | */ |
262 | static inline struct ip_vs_conn * |
263 | __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) |
264 | { |
265 | unsigned int hash; |
266 | struct ip_vs_conn *cp; |
267 | |
268 | hash = ip_vs_conn_hashkey_param(p, inverse: false); |
269 | |
270 | rcu_read_lock(); |
271 | |
272 | hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { |
273 | if (p->cport == cp->cport && p->vport == cp->vport && |
274 | cp->af == p->af && |
275 | ip_vs_addr_equal(af: p->af, a: p->caddr, b: &cp->caddr) && |
276 | ip_vs_addr_equal(af: p->af, a: p->vaddr, b: &cp->vaddr) && |
277 | ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && |
278 | p->protocol == cp->protocol && |
279 | cp->ipvs == p->ipvs) { |
280 | if (!__ip_vs_conn_get(cp)) |
281 | continue; |
282 | /* HIT */ |
283 | rcu_read_unlock(); |
284 | return cp; |
285 | } |
286 | } |
287 | |
288 | rcu_read_unlock(); |
289 | |
290 | return NULL; |
291 | } |
292 | |
293 | struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) |
294 | { |
295 | struct ip_vs_conn *cp; |
296 | |
297 | cp = __ip_vs_conn_in_get(p); |
298 | if (!cp && atomic_read(v: &ip_vs_conn_no_cport_cnt)) { |
299 | struct ip_vs_conn_param cport_zero_p = *p; |
300 | cport_zero_p.cport = 0; |
301 | cp = __ip_vs_conn_in_get(p: &cport_zero_p); |
302 | } |
303 | |
304 | IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n" , |
305 | ip_vs_proto_name(p->protocol), |
306 | IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), |
307 | IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), |
308 | cp ? "hit" : "not hit" ); |
309 | |
310 | return cp; |
311 | } |
312 | |
313 | static int |
314 | ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs, |
315 | int af, const struct sk_buff *skb, |
316 | const struct ip_vs_iphdr *iph, |
317 | struct ip_vs_conn_param *p) |
318 | { |
319 | __be16 _ports[2], *pptr; |
320 | |
321 | pptr = frag_safe_skb_hp(skb, offset: iph->len, len: sizeof(_ports), buffer: _ports); |
322 | if (pptr == NULL) |
323 | return 1; |
324 | |
325 | if (likely(!ip_vs_iph_inverse(iph))) |
326 | ip_vs_conn_fill_param(ipvs, af, protocol: iph->protocol, caddr: &iph->saddr, |
327 | cport: pptr[0], vaddr: &iph->daddr, vport: pptr[1], p); |
328 | else |
329 | ip_vs_conn_fill_param(ipvs, af, protocol: iph->protocol, caddr: &iph->daddr, |
330 | cport: pptr[1], vaddr: &iph->saddr, vport: pptr[0], p); |
331 | return 0; |
332 | } |
333 | |
334 | struct ip_vs_conn * |
335 | ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af, |
336 | const struct sk_buff *skb, |
337 | const struct ip_vs_iphdr *iph) |
338 | { |
339 | struct ip_vs_conn_param p; |
340 | |
341 | if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, p: &p)) |
342 | return NULL; |
343 | |
344 | return ip_vs_conn_in_get(p: &p); |
345 | } |
346 | EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); |
347 | |
348 | /* Get reference to connection template */ |
349 | struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) |
350 | { |
351 | unsigned int hash; |
352 | struct ip_vs_conn *cp; |
353 | |
354 | hash = ip_vs_conn_hashkey_param(p, inverse: false); |
355 | |
356 | rcu_read_lock(); |
357 | |
358 | hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { |
359 | if (unlikely(p->pe_data && p->pe->ct_match)) { |
360 | if (cp->ipvs != p->ipvs) |
361 | continue; |
362 | if (p->pe == cp->pe && p->pe->ct_match(p, cp)) { |
363 | if (__ip_vs_conn_get(cp)) |
364 | goto out; |
365 | } |
366 | continue; |
367 | } |
368 | |
369 | if (cp->af == p->af && |
370 | ip_vs_addr_equal(af: p->af, a: p->caddr, b: &cp->caddr) && |
371 | /* protocol should only be IPPROTO_IP if |
372 | * p->vaddr is a fwmark */ |
373 | ip_vs_addr_equal(af: p->protocol == IPPROTO_IP ? AF_UNSPEC : |
374 | p->af, a: p->vaddr, b: &cp->vaddr) && |
375 | p->vport == cp->vport && p->cport == cp->cport && |
376 | cp->flags & IP_VS_CONN_F_TEMPLATE && |
377 | p->protocol == cp->protocol && |
378 | cp->ipvs == p->ipvs) { |
379 | if (__ip_vs_conn_get(cp)) |
380 | goto out; |
381 | } |
382 | } |
383 | cp = NULL; |
384 | |
385 | out: |
386 | rcu_read_unlock(); |
387 | |
388 | IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n" , |
389 | ip_vs_proto_name(p->protocol), |
390 | IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), |
391 | IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), |
392 | cp ? "hit" : "not hit" ); |
393 | |
394 | return cp; |
395 | } |
396 | |
397 | /* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. |
398 | * Called for pkts coming from inside-to-OUTside. |
399 | * p->caddr, p->cport: pkt source address (inside host) |
400 | * p->vaddr, p->vport: pkt dest address (foreign host) */ |
401 | struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) |
402 | { |
403 | unsigned int hash; |
404 | struct ip_vs_conn *cp, *ret=NULL; |
405 | const union nf_inet_addr *saddr; |
406 | __be16 sport; |
407 | |
408 | /* |
409 | * Check for "full" addressed entries |
410 | */ |
411 | hash = ip_vs_conn_hashkey_param(p, inverse: true); |
412 | |
413 | rcu_read_lock(); |
414 | |
415 | hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { |
416 | if (p->vport != cp->cport) |
417 | continue; |
418 | |
419 | if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { |
420 | sport = cp->vport; |
421 | saddr = &cp->vaddr; |
422 | } else { |
423 | sport = cp->dport; |
424 | saddr = &cp->daddr; |
425 | } |
426 | |
427 | if (p->cport == sport && cp->af == p->af && |
428 | ip_vs_addr_equal(af: p->af, a: p->vaddr, b: &cp->caddr) && |
429 | ip_vs_addr_equal(af: p->af, a: p->caddr, b: saddr) && |
430 | p->protocol == cp->protocol && |
431 | cp->ipvs == p->ipvs) { |
432 | if (!__ip_vs_conn_get(cp)) |
433 | continue; |
434 | /* HIT */ |
435 | ret = cp; |
436 | break; |
437 | } |
438 | } |
439 | |
440 | rcu_read_unlock(); |
441 | |
442 | IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n" , |
443 | ip_vs_proto_name(p->protocol), |
444 | IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), |
445 | IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), |
446 | ret ? "hit" : "not hit" ); |
447 | |
448 | return ret; |
449 | } |
450 | |
451 | struct ip_vs_conn * |
452 | ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, |
453 | const struct sk_buff *skb, |
454 | const struct ip_vs_iphdr *iph) |
455 | { |
456 | struct ip_vs_conn_param p; |
457 | |
458 | if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, p: &p)) |
459 | return NULL; |
460 | |
461 | return ip_vs_conn_out_get(p: &p); |
462 | } |
463 | EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); |
464 | |
465 | /* |
466 | * Put back the conn and restart its timer with its timeout |
467 | */ |
468 | static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp) |
469 | { |
470 | unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? |
471 | 0 : cp->timeout; |
472 | mod_timer(timer: &cp->timer, expires: jiffies+t); |
473 | |
474 | __ip_vs_conn_put(cp); |
475 | } |
476 | |
477 | void ip_vs_conn_put(struct ip_vs_conn *cp) |
478 | { |
479 | if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && |
480 | (refcount_read(r: &cp->refcnt) == 1) && |
481 | !timer_pending(timer: &cp->timer)) |
482 | /* expire connection immediately */ |
483 | ip_vs_conn_expire(t: &cp->timer); |
484 | else |
485 | __ip_vs_conn_put_timer(cp); |
486 | } |
487 | |
488 | /* |
489 | * Fill a no_client_port connection with a client port number |
490 | */ |
491 | void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) |
492 | { |
493 | if (ip_vs_conn_unhash(cp)) { |
494 | spin_lock_bh(lock: &cp->lock); |
495 | if (cp->flags & IP_VS_CONN_F_NO_CPORT) { |
496 | atomic_dec(v: &ip_vs_conn_no_cport_cnt); |
497 | cp->flags &= ~IP_VS_CONN_F_NO_CPORT; |
498 | cp->cport = cport; |
499 | } |
500 | spin_unlock_bh(lock: &cp->lock); |
501 | |
502 | /* hash on new dport */ |
503 | ip_vs_conn_hash(cp); |
504 | } |
505 | } |
506 | |
507 | |
508 | /* |
509 | * Bind a connection entry with the corresponding packet_xmit. |
510 | * Called by ip_vs_conn_new. |
511 | */ |
512 | static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) |
513 | { |
514 | switch (IP_VS_FWD_METHOD(cp)) { |
515 | case IP_VS_CONN_F_MASQ: |
516 | cp->packet_xmit = ip_vs_nat_xmit; |
517 | break; |
518 | |
519 | case IP_VS_CONN_F_TUNNEL: |
520 | #ifdef CONFIG_IP_VS_IPV6 |
521 | if (cp->daf == AF_INET6) |
522 | cp->packet_xmit = ip_vs_tunnel_xmit_v6; |
523 | else |
524 | #endif |
525 | cp->packet_xmit = ip_vs_tunnel_xmit; |
526 | break; |
527 | |
528 | case IP_VS_CONN_F_DROUTE: |
529 | cp->packet_xmit = ip_vs_dr_xmit; |
530 | break; |
531 | |
532 | case IP_VS_CONN_F_LOCALNODE: |
533 | cp->packet_xmit = ip_vs_null_xmit; |
534 | break; |
535 | |
536 | case IP_VS_CONN_F_BYPASS: |
537 | cp->packet_xmit = ip_vs_bypass_xmit; |
538 | break; |
539 | } |
540 | } |
541 | |
#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 counterpart of ip_vs_bind_xmit: select packet_xmit from the
 *	connection's forwarding method.
 */
static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
{
	switch (IP_VS_FWD_METHOD(cp)) {
	case IP_VS_CONN_F_MASQ:
		cp->packet_xmit = ip_vs_nat_xmit_v6;
		return;

	case IP_VS_CONN_F_TUNNEL:
		/* the tunnel transmitter follows the destination family */
		if (cp->daf == AF_INET6) {
			cp->packet_xmit = ip_vs_tunnel_xmit_v6;
			return;
		}
		cp->packet_xmit = ip_vs_tunnel_xmit;
		return;

	case IP_VS_CONN_F_DROUTE:
		cp->packet_xmit = ip_vs_dr_xmit_v6;
		return;

	case IP_VS_CONN_F_LOCALNODE:
		cp->packet_xmit = ip_vs_null_xmit;
		return;

	case IP_VS_CONN_F_BYPASS:
		cp->packet_xmit = ip_vs_bypass_xmit_v6;
		return;

	default:
		/* any other method leaves packet_xmit untouched */
		return;
	}
}
#endif
571 | |
572 | |
573 | static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) |
574 | { |
575 | return atomic_read(v: &dest->activeconns) |
576 | + atomic_read(v: &dest->inactconns); |
577 | } |
578 | |
579 | /* |
580 | * Bind a connection entry with a virtual service destination |
581 | * Called just after a new connection entry is created. |
582 | */ |
583 | static inline void |
584 | ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) |
585 | { |
586 | unsigned int conn_flags; |
587 | __u32 flags; |
588 | |
589 | /* if dest is NULL, then return directly */ |
590 | if (!dest) |
591 | return; |
592 | |
593 | /* Increase the refcnt counter of the dest */ |
594 | ip_vs_dest_hold(dest); |
595 | |
596 | conn_flags = atomic_read(v: &dest->conn_flags); |
597 | if (cp->protocol != IPPROTO_UDP) |
598 | conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; |
599 | flags = cp->flags; |
600 | /* Bind with the destination and its corresponding transmitter */ |
601 | if (flags & IP_VS_CONN_F_SYNC) { |
602 | /* if the connection is not template and is created |
603 | * by sync, preserve the activity flag. |
604 | */ |
605 | if (!(flags & IP_VS_CONN_F_TEMPLATE)) |
606 | conn_flags &= ~IP_VS_CONN_F_INACTIVE; |
607 | /* connections inherit forwarding method from dest */ |
608 | flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT); |
609 | } |
610 | flags |= conn_flags; |
611 | cp->flags = flags; |
612 | cp->dest = dest; |
613 | |
614 | IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " |
615 | "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " |
616 | "dest->refcnt:%d\n" , |
617 | ip_vs_proto_name(cp->protocol), |
618 | IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), |
619 | IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), |
620 | IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), |
621 | ip_vs_fwd_tag(cp), cp->state, |
622 | cp->flags, refcount_read(&cp->refcnt), |
623 | refcount_read(&dest->refcnt)); |
624 | |
625 | /* Update the connection counters */ |
626 | if (!(flags & IP_VS_CONN_F_TEMPLATE)) { |
627 | /* It is a normal connection, so modify the counters |
628 | * according to the flags, later the protocol can |
629 | * update them on state change |
630 | */ |
631 | if (!(flags & IP_VS_CONN_F_INACTIVE)) |
632 | atomic_inc(v: &dest->activeconns); |
633 | else |
634 | atomic_inc(v: &dest->inactconns); |
635 | } else { |
636 | /* It is a persistent connection/template, so increase |
637 | the persistent connection counter */ |
638 | atomic_inc(v: &dest->persistconns); |
639 | } |
640 | |
641 | if (dest->u_threshold != 0 && |
642 | ip_vs_dest_totalconns(dest) >= dest->u_threshold) |
643 | dest->flags |= IP_VS_DEST_F_OVERLOAD; |
644 | } |
645 | |
646 | |
647 | /* |
648 | * Check if there is a destination for the connection, if so |
649 | * bind the connection to the destination. |
650 | */ |
651 | void ip_vs_try_bind_dest(struct ip_vs_conn *cp) |
652 | { |
653 | struct ip_vs_dest *dest; |
654 | |
655 | rcu_read_lock(); |
656 | |
657 | /* This function is only invoked by the synchronization code. We do |
658 | * not currently support heterogeneous pools with synchronization, |
659 | * so we can make the assumption that the svc_af is the same as the |
660 | * dest_af |
661 | */ |
662 | dest = ip_vs_find_dest(ipvs: cp->ipvs, svc_af: cp->af, dest_af: cp->af, daddr: &cp->daddr, |
663 | dport: cp->dport, vaddr: &cp->vaddr, vport: cp->vport, |
664 | protocol: cp->protocol, fwmark: cp->fwmark, flags: cp->flags); |
665 | if (dest) { |
666 | struct ip_vs_proto_data *pd; |
667 | |
668 | spin_lock_bh(lock: &cp->lock); |
669 | if (cp->dest) { |
670 | spin_unlock_bh(lock: &cp->lock); |
671 | rcu_read_unlock(); |
672 | return; |
673 | } |
674 | |
675 | /* Applications work depending on the forwarding method |
676 | * but better to reassign them always when binding dest */ |
677 | if (cp->app) |
678 | ip_vs_unbind_app(cp); |
679 | |
680 | ip_vs_bind_dest(cp, dest); |
681 | spin_unlock_bh(lock: &cp->lock); |
682 | |
683 | /* Update its packet transmitter */ |
684 | cp->packet_xmit = NULL; |
685 | #ifdef CONFIG_IP_VS_IPV6 |
686 | if (cp->af == AF_INET6) |
687 | ip_vs_bind_xmit_v6(cp); |
688 | else |
689 | #endif |
690 | ip_vs_bind_xmit(cp); |
691 | |
692 | pd = ip_vs_proto_data_get(ipvs: cp->ipvs, proto: cp->protocol); |
693 | if (pd && atomic_read(v: &pd->appcnt)) |
694 | ip_vs_bind_app(cp, pp: pd->pp); |
695 | } |
696 | rcu_read_unlock(); |
697 | } |
698 | |
699 | |
700 | /* |
701 | * Unbind a connection entry with its VS destination |
702 | * Called by the ip_vs_conn_expire function. |
703 | */ |
704 | static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) |
705 | { |
706 | struct ip_vs_dest *dest = cp->dest; |
707 | |
708 | if (!dest) |
709 | return; |
710 | |
711 | IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " |
712 | "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " |
713 | "dest->refcnt:%d\n" , |
714 | ip_vs_proto_name(cp->protocol), |
715 | IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), |
716 | IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), |
717 | IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), |
718 | ip_vs_fwd_tag(cp), cp->state, |
719 | cp->flags, refcount_read(&cp->refcnt), |
720 | refcount_read(&dest->refcnt)); |
721 | |
722 | /* Update the connection counters */ |
723 | if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { |
724 | /* It is a normal connection, so decrease the inactconns |
725 | or activeconns counter */ |
726 | if (cp->flags & IP_VS_CONN_F_INACTIVE) { |
727 | atomic_dec(v: &dest->inactconns); |
728 | } else { |
729 | atomic_dec(v: &dest->activeconns); |
730 | } |
731 | } else { |
732 | /* It is a persistent connection/template, so decrease |
733 | the persistent connection counter */ |
734 | atomic_dec(v: &dest->persistconns); |
735 | } |
736 | |
737 | if (dest->l_threshold != 0) { |
738 | if (ip_vs_dest_totalconns(dest) < dest->l_threshold) |
739 | dest->flags &= ~IP_VS_DEST_F_OVERLOAD; |
740 | } else if (dest->u_threshold != 0) { |
741 | if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) |
742 | dest->flags &= ~IP_VS_DEST_F_OVERLOAD; |
743 | } else { |
744 | if (dest->flags & IP_VS_DEST_F_OVERLOAD) |
745 | dest->flags &= ~IP_VS_DEST_F_OVERLOAD; |
746 | } |
747 | |
748 | ip_vs_dest_put(dest); |
749 | } |
750 | |
/*
 * Returns non-zero when the expire_quiescent_template sysctl is set and
 * the weight of @dest is zero.  Always 0 without CONFIG_SYSCTL.
 */
static int expire_quiescent_template(struct netns_ipvs *ipvs,
				     struct ip_vs_dest *dest)
{
#ifdef CONFIG_SYSCTL
	if (!ipvs->sysctl_expire_quiescent_template)
		return 0;

	return atomic_read(&dest->weight) == 0;
#else
	return 0;
#endif
}
761 | |
762 | /* |
763 | * Checking if the destination of a connection template is available. |
764 | * If available, return 1, otherwise invalidate this connection |
765 | * template and return 0. |
766 | */ |
767 | int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest) |
768 | { |
769 | struct ip_vs_dest *dest = ct->dest; |
770 | struct netns_ipvs *ipvs = ct->ipvs; |
771 | |
772 | /* |
773 | * Checking the dest server status. |
774 | */ |
775 | if ((dest == NULL) || |
776 | !(dest->flags & IP_VS_DEST_F_AVAILABLE) || |
777 | expire_quiescent_template(ipvs, dest) || |
778 | (cdest && (dest != cdest))) { |
779 | IP_VS_DBG_BUF(9, "check_template: dest not available for " |
780 | "protocol %s s:%s:%d v:%s:%d " |
781 | "-> d:%s:%d\n" , |
782 | ip_vs_proto_name(ct->protocol), |
783 | IP_VS_DBG_ADDR(ct->af, &ct->caddr), |
784 | ntohs(ct->cport), |
785 | IP_VS_DBG_ADDR(ct->af, &ct->vaddr), |
786 | ntohs(ct->vport), |
787 | IP_VS_DBG_ADDR(ct->daf, &ct->daddr), |
788 | ntohs(ct->dport)); |
789 | |
790 | /* |
791 | * Invalidate the connection template |
792 | */ |
793 | if (ct->vport != htons(0xffff)) { |
794 | if (ip_vs_conn_unhash(cp: ct)) { |
795 | ct->dport = htons(0xffff); |
796 | ct->vport = htons(0xffff); |
797 | ct->cport = 0; |
798 | ip_vs_conn_hash(cp: ct); |
799 | } |
800 | } |
801 | |
802 | /* |
803 | * Simply decrease the refcnt of the template, |
804 | * don't restart its timer. |
805 | */ |
806 | __ip_vs_conn_put(cp: ct); |
807 | return 0; |
808 | } |
809 | return 1; |
810 | } |
811 | |
812 | static void ip_vs_conn_rcu_free(struct rcu_head *head) |
813 | { |
814 | struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn, |
815 | rcu_head); |
816 | |
817 | ip_vs_pe_put(cp->pe); |
818 | kfree(objp: cp->pe_data); |
819 | kmem_cache_free(s: ip_vs_conn_cachep, objp: cp); |
820 | } |
821 | |
822 | /* Try to delete connection while not holding reference */ |
823 | static void ip_vs_conn_del(struct ip_vs_conn *cp) |
824 | { |
825 | if (del_timer(timer: &cp->timer)) { |
826 | /* Drop cp->control chain too */ |
827 | if (cp->control) |
828 | cp->timeout = 0; |
829 | ip_vs_conn_expire(t: &cp->timer); |
830 | } |
831 | } |
832 | |
833 | /* Try to delete connection while holding reference */ |
834 | static void ip_vs_conn_del_put(struct ip_vs_conn *cp) |
835 | { |
836 | if (del_timer(timer: &cp->timer)) { |
837 | /* Drop cp->control chain too */ |
838 | if (cp->control) |
839 | cp->timeout = 0; |
840 | __ip_vs_conn_put(cp); |
841 | ip_vs_conn_expire(t: &cp->timer); |
842 | } else { |
843 | __ip_vs_conn_put(cp); |
844 | } |
845 | } |
846 | |
847 | static void ip_vs_conn_expire(struct timer_list *t) |
848 | { |
849 | struct ip_vs_conn *cp = from_timer(cp, t, timer); |
850 | struct netns_ipvs *ipvs = cp->ipvs; |
851 | |
852 | /* |
853 | * do I control anybody? |
854 | */ |
855 | if (atomic_read(v: &cp->n_control)) |
856 | goto expire_later; |
857 | |
858 | /* Unlink conn if not referenced anymore */ |
859 | if (likely(ip_vs_conn_unlink(cp))) { |
860 | struct ip_vs_conn *ct = cp->control; |
861 | |
862 | /* delete the timer if it is activated by other users */ |
863 | del_timer(timer: &cp->timer); |
864 | |
865 | /* does anybody control me? */ |
866 | if (ct) { |
867 | bool has_ref = !cp->timeout && __ip_vs_conn_get(cp: ct); |
868 | |
869 | ip_vs_control_del(cp); |
870 | /* Drop CTL or non-assured TPL if not used anymore */ |
871 | if (has_ref && !atomic_read(v: &ct->n_control) && |
872 | (!(ct->flags & IP_VS_CONN_F_TEMPLATE) || |
873 | !(ct->state & IP_VS_CTPL_S_ASSURED))) { |
874 | IP_VS_DBG(4, "drop controlling connection\n" ); |
875 | ip_vs_conn_del_put(cp: ct); |
876 | } else if (has_ref) { |
877 | __ip_vs_conn_put(cp: ct); |
878 | } |
879 | } |
880 | |
881 | if ((cp->flags & IP_VS_CONN_F_NFCT) && |
882 | !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) { |
883 | /* Do not access conntracks during subsys cleanup |
884 | * because nf_conntrack_find_get can not be used after |
885 | * conntrack cleanup for the net. |
886 | */ |
887 | smp_rmb(); |
888 | if (ipvs->enable) |
889 | ip_vs_conn_drop_conntrack(cp); |
890 | } |
891 | |
892 | if (unlikely(cp->app != NULL)) |
893 | ip_vs_unbind_app(cp); |
894 | ip_vs_unbind_dest(cp); |
895 | if (cp->flags & IP_VS_CONN_F_NO_CPORT) |
896 | atomic_dec(v: &ip_vs_conn_no_cport_cnt); |
897 | if (cp->flags & IP_VS_CONN_F_ONE_PACKET) |
898 | ip_vs_conn_rcu_free(head: &cp->rcu_head); |
899 | else |
900 | call_rcu(head: &cp->rcu_head, func: ip_vs_conn_rcu_free); |
901 | atomic_dec(v: &ipvs->conn_count); |
902 | return; |
903 | } |
904 | |
905 | expire_later: |
906 | IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n" , |
907 | refcount_read(&cp->refcnt), |
908 | atomic_read(&cp->n_control)); |
909 | |
910 | refcount_inc(r: &cp->refcnt); |
911 | cp->timeout = 60*HZ; |
912 | |
913 | if (ipvs->sync_state & IP_VS_STATE_MASTER) |
914 | ip_vs_sync_conn(ipvs, cp, pkts: sysctl_sync_threshold(ipvs)); |
915 | |
916 | __ip_vs_conn_put_timer(cp); |
917 | } |
918 | |
919 | /* Modify timer, so that it expires as soon as possible. |
920 | * Can be called without reference only if under RCU lock. |
921 | * We can have such chain of conns linked with ->control: DATA->CTL->TPL |
922 | * - DATA (eg. FTP) and TPL (persistence) can be present depending on setup |
923 | * - cp->timeout=0 indicates all conns from chain should be dropped but |
924 | * TPL is not dropped if in assured state |
925 | */ |
926 | void ip_vs_conn_expire_now(struct ip_vs_conn *cp) |
927 | { |
928 | /* Using mod_timer_pending will ensure the timer is not |
929 | * modified after the final del_timer in ip_vs_conn_expire. |
930 | */ |
931 | if (timer_pending(timer: &cp->timer) && |
932 | time_after(cp->timer.expires, jiffies)) |
933 | mod_timer_pending(timer: &cp->timer, expires: jiffies); |
934 | } |
935 | |
936 | |
937 | /* |
938 | * Create a new connection entry and hash it into the ip_vs_conn_tab |
939 | */ |
940 | struct ip_vs_conn * |
941 | ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, |
942 | const union nf_inet_addr *daddr, __be16 dport, unsigned int flags, |
943 | struct ip_vs_dest *dest, __u32 fwmark) |
944 | { |
945 | struct ip_vs_conn *cp; |
946 | struct netns_ipvs *ipvs = p->ipvs; |
947 | struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs: p->ipvs, |
948 | proto: p->protocol); |
949 | |
950 | cp = kmem_cache_alloc(cachep: ip_vs_conn_cachep, GFP_ATOMIC); |
951 | if (cp == NULL) { |
952 | IP_VS_ERR_RL("%s(): no memory\n" , __func__); |
953 | return NULL; |
954 | } |
955 | |
956 | INIT_HLIST_NODE(h: &cp->c_list); |
957 | timer_setup(&cp->timer, ip_vs_conn_expire, 0); |
958 | cp->ipvs = ipvs; |
959 | cp->af = p->af; |
960 | cp->daf = dest_af; |
961 | cp->protocol = p->protocol; |
962 | ip_vs_addr_set(af: p->af, dst: &cp->caddr, src: p->caddr); |
963 | cp->cport = p->cport; |
964 | /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */ |
965 | ip_vs_addr_set(af: p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, |
966 | dst: &cp->vaddr, src: p->vaddr); |
967 | cp->vport = p->vport; |
968 | ip_vs_addr_set(af: cp->daf, dst: &cp->daddr, src: daddr); |
969 | cp->dport = dport; |
970 | cp->flags = flags; |
971 | cp->fwmark = fwmark; |
972 | if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) { |
973 | ip_vs_pe_get(p->pe); |
974 | cp->pe = p->pe; |
975 | cp->pe_data = p->pe_data; |
976 | cp->pe_data_len = p->pe_data_len; |
977 | } else { |
978 | cp->pe = NULL; |
979 | cp->pe_data = NULL; |
980 | cp->pe_data_len = 0; |
981 | } |
982 | spin_lock_init(&cp->lock); |
983 | |
984 | /* |
985 | * Set the entry is referenced by the current thread before hashing |
986 | * it in the table, so that other thread run ip_vs_random_dropentry |
987 | * but cannot drop this entry. |
988 | */ |
989 | refcount_set(r: &cp->refcnt, n: 1); |
990 | |
991 | cp->control = NULL; |
992 | atomic_set(v: &cp->n_control, i: 0); |
993 | atomic_set(v: &cp->in_pkts, i: 0); |
994 | |
995 | cp->packet_xmit = NULL; |
996 | cp->app = NULL; |
997 | cp->app_data = NULL; |
998 | /* reset struct ip_vs_seq */ |
999 | cp->in_seq.delta = 0; |
1000 | cp->out_seq.delta = 0; |
1001 | |
1002 | atomic_inc(v: &ipvs->conn_count); |
1003 | if (flags & IP_VS_CONN_F_NO_CPORT) |
1004 | atomic_inc(v: &ip_vs_conn_no_cport_cnt); |
1005 | |
1006 | /* Bind the connection with a destination server */ |
1007 | cp->dest = NULL; |
1008 | ip_vs_bind_dest(cp, dest); |
1009 | |
1010 | /* Set its state and timeout */ |
1011 | cp->state = 0; |
1012 | cp->old_state = 0; |
1013 | cp->timeout = 3*HZ; |
1014 | cp->sync_endtime = jiffies & ~3UL; |
1015 | |
1016 | /* Bind its packet transmitter */ |
1017 | #ifdef CONFIG_IP_VS_IPV6 |
1018 | if (p->af == AF_INET6) |
1019 | ip_vs_bind_xmit_v6(cp); |
1020 | else |
1021 | #endif |
1022 | ip_vs_bind_xmit(cp); |
1023 | |
1024 | if (unlikely(pd && atomic_read(&pd->appcnt))) |
1025 | ip_vs_bind_app(cp, pp: pd->pp); |
1026 | |
1027 | /* |
1028 | * Allow conntrack to be preserved. By default, conntrack |
1029 | * is created and destroyed for every packet. |
1030 | * Sometimes keeping conntrack can be useful for |
1031 | * IP_VS_CONN_F_ONE_PACKET too. |
1032 | */ |
1033 | |
1034 | if (ip_vs_conntrack_enabled(ipvs)) |
1035 | cp->flags |= IP_VS_CONN_F_NFCT; |
1036 | |
1037 | /* Hash it in the ip_vs_conn_tab finally */ |
1038 | ip_vs_conn_hash(cp); |
1039 | |
1040 | return cp; |
1041 | } |
1042 | |
1043 | /* |
1044 | * /proc/net/ip_vs_conn entries |
1045 | */ |
1046 | #ifdef CONFIG_PROC_FS |
1047 | struct ip_vs_iter_state { |
1048 | struct seq_net_private p; |
1049 | struct hlist_head *l; |
1050 | }; |
1051 | |
1052 | static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) |
1053 | { |
1054 | int idx; |
1055 | struct ip_vs_conn *cp; |
1056 | struct ip_vs_iter_state *iter = seq->private; |
1057 | |
1058 | for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { |
1059 | hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { |
1060 | /* __ip_vs_conn_get() is not needed by |
1061 | * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show |
1062 | */ |
1063 | if (pos-- == 0) { |
1064 | iter->l = &ip_vs_conn_tab[idx]; |
1065 | return cp; |
1066 | } |
1067 | } |
1068 | cond_resched_rcu(); |
1069 | } |
1070 | |
1071 | return NULL; |
1072 | } |
1073 | |
1074 | static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) |
1075 | __acquires(RCU) |
1076 | { |
1077 | struct ip_vs_iter_state *iter = seq->private; |
1078 | |
1079 | iter->l = NULL; |
1080 | rcu_read_lock(); |
1081 | return *pos ? ip_vs_conn_array(seq, pos: *pos - 1) :SEQ_START_TOKEN; |
1082 | } |
1083 | |
1084 | static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
1085 | { |
1086 | struct ip_vs_conn *cp = v; |
1087 | struct ip_vs_iter_state *iter = seq->private; |
1088 | struct hlist_node *e; |
1089 | struct hlist_head *l = iter->l; |
1090 | int idx; |
1091 | |
1092 | ++*pos; |
1093 | if (v == SEQ_START_TOKEN) |
1094 | return ip_vs_conn_array(seq, pos: 0); |
1095 | |
1096 | /* more on same hash chain? */ |
1097 | e = rcu_dereference(hlist_next_rcu(&cp->c_list)); |
1098 | if (e) |
1099 | return hlist_entry(e, struct ip_vs_conn, c_list); |
1100 | |
1101 | idx = l - ip_vs_conn_tab; |
1102 | while (++idx < ip_vs_conn_tab_size) { |
1103 | hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { |
1104 | iter->l = &ip_vs_conn_tab[idx]; |
1105 | return cp; |
1106 | } |
1107 | cond_resched_rcu(); |
1108 | } |
1109 | iter->l = NULL; |
1110 | return NULL; |
1111 | } |
1112 | |
1113 | static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) |
1114 | __releases(RCU) |
1115 | { |
1116 | rcu_read_unlock(); |
1117 | } |
1118 | |
1119 | static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) |
1120 | { |
1121 | |
1122 | if (v == SEQ_START_TOKEN) |
1123 | seq_puts(m: seq, |
1124 | s: "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n" ); |
1125 | else { |
1126 | const struct ip_vs_conn *cp = v; |
1127 | struct net *net = seq_file_net(seq); |
1128 | char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; |
1129 | size_t len = 0; |
1130 | char dbuf[IP_VS_ADDRSTRLEN]; |
1131 | |
1132 | if (!net_eq(net1: cp->ipvs->net, net2: net)) |
1133 | return 0; |
1134 | if (cp->pe_data) { |
1135 | pe_data[0] = ' '; |
1136 | len = strlen(cp->pe->name); |
1137 | memcpy(pe_data + 1, cp->pe->name, len); |
1138 | pe_data[len + 1] = ' '; |
1139 | len += 2; |
1140 | len += cp->pe->show_pe_data(cp, pe_data + len); |
1141 | } |
1142 | pe_data[len] = '\0'; |
1143 | |
1144 | #ifdef CONFIG_IP_VS_IPV6 |
1145 | if (cp->daf == AF_INET6) |
1146 | snprintf(buf: dbuf, size: sizeof(dbuf), fmt: "%pI6" , &cp->daddr.in6); |
1147 | else |
1148 | #endif |
1149 | snprintf(buf: dbuf, size: sizeof(dbuf), fmt: "%08X" , |
1150 | ntohl(cp->daddr.ip)); |
1151 | |
1152 | #ifdef CONFIG_IP_VS_IPV6 |
1153 | if (cp->af == AF_INET6) |
1154 | seq_printf(m: seq, fmt: "%-3s %pI6 %04X %pI6 %04X " |
1155 | "%s %04X %-11s %7u%s\n" , |
1156 | ip_vs_proto_name(proto: cp->protocol), |
1157 | &cp->caddr.in6, ntohs(cp->cport), |
1158 | &cp->vaddr.in6, ntohs(cp->vport), |
1159 | dbuf, ntohs(cp->dport), |
1160 | ip_vs_state_name(cp), |
1161 | jiffies_delta_to_msecs(delta: cp->timer.expires - |
1162 | jiffies) / 1000, |
1163 | pe_data); |
1164 | else |
1165 | #endif |
1166 | seq_printf(m: seq, |
1167 | fmt: "%-3s %08X %04X %08X %04X" |
1168 | " %s %04X %-11s %7u%s\n" , |
1169 | ip_vs_proto_name(proto: cp->protocol), |
1170 | ntohl(cp->caddr.ip), ntohs(cp->cport), |
1171 | ntohl(cp->vaddr.ip), ntohs(cp->vport), |
1172 | dbuf, ntohs(cp->dport), |
1173 | ip_vs_state_name(cp), |
1174 | jiffies_delta_to_msecs(delta: cp->timer.expires - |
1175 | jiffies) / 1000, |
1176 | pe_data); |
1177 | } |
1178 | return 0; |
1179 | } |
1180 | |
1181 | static const struct seq_operations ip_vs_conn_seq_ops = { |
1182 | .start = ip_vs_conn_seq_start, |
1183 | .next = ip_vs_conn_seq_next, |
1184 | .stop = ip_vs_conn_seq_stop, |
1185 | .show = ip_vs_conn_seq_show, |
1186 | }; |
1187 | |
1188 | static const char *ip_vs_origin_name(unsigned int flags) |
1189 | { |
1190 | if (flags & IP_VS_CONN_F_SYNC) |
1191 | return "SYNC" ; |
1192 | else |
1193 | return "LOCAL" ; |
1194 | } |
1195 | |
1196 | static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) |
1197 | { |
1198 | char dbuf[IP_VS_ADDRSTRLEN]; |
1199 | |
1200 | if (v == SEQ_START_TOKEN) |
1201 | seq_puts(m: seq, |
1202 | s: "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n" ); |
1203 | else { |
1204 | const struct ip_vs_conn *cp = v; |
1205 | struct net *net = seq_file_net(seq); |
1206 | |
1207 | if (!net_eq(net1: cp->ipvs->net, net2: net)) |
1208 | return 0; |
1209 | |
1210 | #ifdef CONFIG_IP_VS_IPV6 |
1211 | if (cp->daf == AF_INET6) |
1212 | snprintf(buf: dbuf, size: sizeof(dbuf), fmt: "%pI6" , &cp->daddr.in6); |
1213 | else |
1214 | #endif |
1215 | snprintf(buf: dbuf, size: sizeof(dbuf), fmt: "%08X" , |
1216 | ntohl(cp->daddr.ip)); |
1217 | |
1218 | #ifdef CONFIG_IP_VS_IPV6 |
1219 | if (cp->af == AF_INET6) |
1220 | seq_printf(m: seq, fmt: "%-3s %pI6 %04X %pI6 %04X " |
1221 | "%s %04X %-11s %-6s %7u\n" , |
1222 | ip_vs_proto_name(proto: cp->protocol), |
1223 | &cp->caddr.in6, ntohs(cp->cport), |
1224 | &cp->vaddr.in6, ntohs(cp->vport), |
1225 | dbuf, ntohs(cp->dport), |
1226 | ip_vs_state_name(cp), |
1227 | ip_vs_origin_name(flags: cp->flags), |
1228 | jiffies_delta_to_msecs(delta: cp->timer.expires - |
1229 | jiffies) / 1000); |
1230 | else |
1231 | #endif |
1232 | seq_printf(m: seq, |
1233 | fmt: "%-3s %08X %04X %08X %04X " |
1234 | "%s %04X %-11s %-6s %7u\n" , |
1235 | ip_vs_proto_name(proto: cp->protocol), |
1236 | ntohl(cp->caddr.ip), ntohs(cp->cport), |
1237 | ntohl(cp->vaddr.ip), ntohs(cp->vport), |
1238 | dbuf, ntohs(cp->dport), |
1239 | ip_vs_state_name(cp), |
1240 | ip_vs_origin_name(flags: cp->flags), |
1241 | jiffies_delta_to_msecs(delta: cp->timer.expires - |
1242 | jiffies) / 1000); |
1243 | } |
1244 | return 0; |
1245 | } |
1246 | |
1247 | static const struct seq_operations ip_vs_conn_sync_seq_ops = { |
1248 | .start = ip_vs_conn_seq_start, |
1249 | .next = ip_vs_conn_seq_next, |
1250 | .stop = ip_vs_conn_seq_stop, |
1251 | .show = ip_vs_conn_sync_seq_show, |
1252 | }; |
1253 | #endif |
1254 | |
1255 | |
1256 | /* Randomly drop connection entries before running out of memory |
1257 | * Can be used for DATA and CTL conns. For TPL conns there are exceptions: |
1258 | * - traffic for services in OPS mode increases ct->in_pkts, so it is supported |
1259 | * - traffic for services not in OPS mode does not increase ct->in_pkts in |
1260 | * all cases, so it is not supported |
1261 | */ |
1262 | static inline int todrop_entry(struct ip_vs_conn *cp) |
1263 | { |
1264 | /* |
1265 | * The drop rate array needs tuning for real environments. |
1266 | * Called from timer bh only => no locking |
1267 | */ |
1268 | static const signed char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; |
1269 | static signed char todrop_counter[9] = {0}; |
1270 | int i; |
1271 | |
1272 | /* if the conn entry hasn't lasted for 60 seconds, don't drop it. |
1273 | This will leave enough time for normal connection to get |
1274 | through. */ |
1275 | if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) |
1276 | return 0; |
1277 | |
1278 | /* Don't drop the entry if its number of incoming packets is not |
1279 | located in [0, 8] */ |
1280 | i = atomic_read(v: &cp->in_pkts); |
1281 | if (i > 8 || i < 0) return 0; |
1282 | |
1283 | if (!todrop_rate[i]) return 0; |
1284 | if (--todrop_counter[i] > 0) return 0; |
1285 | |
1286 | todrop_counter[i] = todrop_rate[i]; |
1287 | return 1; |
1288 | } |
1289 | |
1290 | static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) |
1291 | { |
1292 | struct ip_vs_service *svc; |
1293 | |
1294 | if (!cp->dest) |
1295 | return false; |
1296 | svc = rcu_dereference(cp->dest->svc); |
1297 | return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); |
1298 | } |
1299 | |
1300 | /* Called from keventd and must protect itself from softirqs */ |
1301 | void ip_vs_random_dropentry(struct netns_ipvs *ipvs) |
1302 | { |
1303 | int idx; |
1304 | struct ip_vs_conn *cp; |
1305 | |
1306 | rcu_read_lock(); |
1307 | /* |
1308 | * Randomly scan 1/32 of the whole table every second |
1309 | */ |
1310 | for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { |
1311 | unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask; |
1312 | |
1313 | hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { |
1314 | if (cp->ipvs != ipvs) |
1315 | continue; |
1316 | if (atomic_read(v: &cp->n_control)) |
1317 | continue; |
1318 | if (cp->flags & IP_VS_CONN_F_TEMPLATE) { |
1319 | /* connection template of OPS */ |
1320 | if (ip_vs_conn_ops_mode(cp)) |
1321 | goto try_drop; |
1322 | if (!(cp->state & IP_VS_CTPL_S_ASSURED)) |
1323 | goto drop; |
1324 | continue; |
1325 | } |
1326 | if (cp->protocol == IPPROTO_TCP) { |
1327 | switch(cp->state) { |
1328 | case IP_VS_TCP_S_SYN_RECV: |
1329 | case IP_VS_TCP_S_SYNACK: |
1330 | break; |
1331 | |
1332 | case IP_VS_TCP_S_ESTABLISHED: |
1333 | if (todrop_entry(cp)) |
1334 | break; |
1335 | continue; |
1336 | |
1337 | default: |
1338 | continue; |
1339 | } |
1340 | } else if (cp->protocol == IPPROTO_SCTP) { |
1341 | switch (cp->state) { |
1342 | case IP_VS_SCTP_S_INIT1: |
1343 | case IP_VS_SCTP_S_INIT: |
1344 | break; |
1345 | case IP_VS_SCTP_S_ESTABLISHED: |
1346 | if (todrop_entry(cp)) |
1347 | break; |
1348 | continue; |
1349 | default: |
1350 | continue; |
1351 | } |
1352 | } else { |
1353 | try_drop: |
1354 | if (!todrop_entry(cp)) |
1355 | continue; |
1356 | } |
1357 | |
1358 | drop: |
1359 | IP_VS_DBG(4, "drop connection\n" ); |
1360 | ip_vs_conn_del(cp); |
1361 | } |
1362 | cond_resched_rcu(); |
1363 | } |
1364 | rcu_read_unlock(); |
1365 | } |
1366 | |
1367 | |
1368 | /* |
1369 | * Flush all the connection entries in the ip_vs_conn_tab |
1370 | */ |
1371 | static void ip_vs_conn_flush(struct netns_ipvs *ipvs) |
1372 | { |
1373 | int idx; |
1374 | struct ip_vs_conn *cp, *cp_c; |
1375 | |
1376 | flush_again: |
1377 | rcu_read_lock(); |
1378 | for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { |
1379 | |
1380 | hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { |
1381 | if (cp->ipvs != ipvs) |
1382 | continue; |
1383 | if (atomic_read(v: &cp->n_control)) |
1384 | continue; |
1385 | cp_c = cp->control; |
1386 | IP_VS_DBG(4, "del connection\n" ); |
1387 | ip_vs_conn_del(cp); |
1388 | if (cp_c && !atomic_read(v: &cp_c->n_control)) { |
1389 | IP_VS_DBG(4, "del controlling connection\n" ); |
1390 | ip_vs_conn_del(cp: cp_c); |
1391 | } |
1392 | } |
1393 | cond_resched_rcu(); |
1394 | } |
1395 | rcu_read_unlock(); |
1396 | |
1397 | /* the counter may be not NULL, because maybe some conn entries |
1398 | are run by slow timer handler or unhashed but still referred */ |
1399 | if (atomic_read(v: &ipvs->conn_count) != 0) { |
1400 | schedule(); |
1401 | goto flush_again; |
1402 | } |
1403 | } |
1404 | |
1405 | #ifdef CONFIG_SYSCTL |
1406 | void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) |
1407 | { |
1408 | int idx; |
1409 | struct ip_vs_conn *cp, *cp_c; |
1410 | struct ip_vs_dest *dest; |
1411 | |
1412 | rcu_read_lock(); |
1413 | for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { |
1414 | hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { |
1415 | if (cp->ipvs != ipvs) |
1416 | continue; |
1417 | |
1418 | dest = cp->dest; |
1419 | if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE)) |
1420 | continue; |
1421 | |
1422 | if (atomic_read(v: &cp->n_control)) |
1423 | continue; |
1424 | |
1425 | cp_c = cp->control; |
1426 | IP_VS_DBG(4, "del connection\n" ); |
1427 | ip_vs_conn_del(cp); |
1428 | if (cp_c && !atomic_read(v: &cp_c->n_control)) { |
1429 | IP_VS_DBG(4, "del controlling connection\n" ); |
1430 | ip_vs_conn_del(cp: cp_c); |
1431 | } |
1432 | } |
1433 | cond_resched_rcu(); |
1434 | |
1435 | /* netns clean up started, abort delayed work */ |
1436 | if (!ipvs->enable) |
1437 | break; |
1438 | } |
1439 | rcu_read_unlock(); |
1440 | } |
1441 | #endif |
1442 | |
1443 | /* |
1444 | * per netns init and exit |
1445 | */ |
1446 | int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs) |
1447 | { |
1448 | atomic_set(v: &ipvs->conn_count, i: 0); |
1449 | |
1450 | #ifdef CONFIG_PROC_FS |
1451 | if (!proc_create_net("ip_vs_conn" , 0, ipvs->net->proc_net, |
1452 | &ip_vs_conn_seq_ops, |
1453 | sizeof(struct ip_vs_iter_state))) |
1454 | goto err_conn; |
1455 | |
1456 | if (!proc_create_net("ip_vs_conn_sync" , 0, ipvs->net->proc_net, |
1457 | &ip_vs_conn_sync_seq_ops, |
1458 | sizeof(struct ip_vs_iter_state))) |
1459 | goto err_conn_sync; |
1460 | #endif |
1461 | |
1462 | return 0; |
1463 | |
1464 | #ifdef CONFIG_PROC_FS |
1465 | err_conn_sync: |
1466 | remove_proc_entry("ip_vs_conn" , ipvs->net->proc_net); |
1467 | err_conn: |
1468 | return -ENOMEM; |
1469 | #endif |
1470 | } |
1471 | |
1472 | void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs) |
1473 | { |
1474 | /* flush all the connection entries first */ |
1475 | ip_vs_conn_flush(ipvs); |
1476 | #ifdef CONFIG_PROC_FS |
1477 | remove_proc_entry("ip_vs_conn" , ipvs->net->proc_net); |
1478 | remove_proc_entry("ip_vs_conn_sync" , ipvs->net->proc_net); |
1479 | #endif |
1480 | } |
1481 | |
1482 | int __init ip_vs_conn_init(void) |
1483 | { |
1484 | size_t tab_array_size; |
1485 | int max_avail; |
1486 | #if BITS_PER_LONG > 32 |
1487 | int max = 27; |
1488 | #else |
1489 | int max = 20; |
1490 | #endif |
1491 | int min = 8; |
1492 | int idx; |
1493 | |
1494 | max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT; |
1495 | max_avail -= 2; /* ~4 in hash row */ |
1496 | max_avail -= 1; /* IPVS up to 1/2 of mem */ |
1497 | max_avail -= order_base_2(sizeof(struct ip_vs_conn)); |
1498 | max = clamp(max, min, max_avail); |
1499 | ip_vs_conn_tab_bits = clamp_val(ip_vs_conn_tab_bits, min, max); |
1500 | ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; |
1501 | ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; |
1502 | |
1503 | /* |
1504 | * Allocate the connection hash table and initialize its list heads |
1505 | */ |
1506 | tab_array_size = array_size(ip_vs_conn_tab_size, |
1507 | sizeof(*ip_vs_conn_tab)); |
1508 | ip_vs_conn_tab = kvmalloc_array(n: ip_vs_conn_tab_size, |
1509 | size: sizeof(*ip_vs_conn_tab), GFP_KERNEL); |
1510 | if (!ip_vs_conn_tab) |
1511 | return -ENOMEM; |
1512 | |
1513 | /* Allocate ip_vs_conn slab cache */ |
1514 | ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN); |
1515 | if (!ip_vs_conn_cachep) { |
1516 | kvfree(addr: ip_vs_conn_tab); |
1517 | return -ENOMEM; |
1518 | } |
1519 | |
1520 | pr_info("Connection hash table configured (size=%d, memory=%zdKbytes)\n" , |
1521 | ip_vs_conn_tab_size, tab_array_size / 1024); |
1522 | IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n" , |
1523 | sizeof(struct ip_vs_conn)); |
1524 | |
1525 | for (idx = 0; idx < ip_vs_conn_tab_size; idx++) |
1526 | INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); |
1527 | |
1528 | for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { |
1529 | spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l); |
1530 | } |
1531 | |
1532 | /* calculate the random value for connection hash */ |
1533 | get_random_bytes(buf: &ip_vs_conn_rnd, len: sizeof(ip_vs_conn_rnd)); |
1534 | |
1535 | return 0; |
1536 | } |
1537 | |
1538 | void ip_vs_conn_cleanup(void) |
1539 | { |
1540 | /* Wait all ip_vs_conn_rcu_free() callbacks to complete */ |
1541 | rcu_barrier(); |
1542 | /* Release the empty cache */ |
1543 | kmem_cache_destroy(s: ip_vs_conn_cachep); |
1544 | kvfree(addr: ip_vs_conn_tab); |
1545 | } |
1546 | |