1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * ip_vs_proto_tcp.c: TCP load balancing support for IPVS |
4 | * |
5 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> |
6 | * Julian Anastasov <ja@ssi.bg> |
7 | * |
8 | * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> |
9 | * |
10 | * Network namespace (netns) aware. |
11 | * Global data moved to netns, i.e. struct netns_ipvs. |
12 | * The tcp_timeouts table has a copy per netns, held in a hash table of |
13 | * per-protocol ip_vs_proto_data, and is handled per netns. |
14 | */ |
15 | |
16 | #define KMSG_COMPONENT "IPVS" |
17 | #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt |
18 | |
19 | #include <linux/kernel.h> |
20 | #include <linux/ip.h> |
21 | #include <linux/tcp.h> /* for tcphdr */ |
22 | #include <net/ip.h> |
23 | #include <net/tcp.h> /* for csum_tcpudp_magic */ |
24 | #include <net/ip6_checksum.h> |
25 | #include <linux/netfilter.h> |
26 | #include <linux/netfilter_ipv4.h> |
27 | #include <linux/indirect_call_wrapper.h> |
28 | |
29 | #include <net/ip_vs.h> |
30 | |
/* Forward declaration: the NAT handlers below use it before its definition. */
static int tcp_csum_check(int af, struct sk_buff *skb,
			  struct ip_vs_protocol *pp);
33 | |
34 | static int |
35 | tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, |
36 | struct ip_vs_proto_data *pd, |
37 | int *verdict, struct ip_vs_conn **cpp, |
38 | struct ip_vs_iphdr *iph) |
39 | { |
40 | struct ip_vs_service *svc; |
41 | struct tcphdr _tcph, *th; |
42 | __be16 _ports[2], *ports = NULL; |
43 | |
44 | /* In the event of icmp, we're only guaranteed to have the first 8 |
45 | * bytes of the transport header, so we only check the rest of the |
46 | * TCP packet for non-ICMP packets |
47 | */ |
48 | if (likely(!ip_vs_iph_icmp(iph))) { |
49 | th = skb_header_pointer(skb, offset: iph->len, len: sizeof(_tcph), buffer: &_tcph); |
50 | if (th) { |
51 | if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn)) |
52 | return 1; |
53 | ports = &th->source; |
54 | } |
55 | } else { |
56 | ports = skb_header_pointer( |
57 | skb, offset: iph->len, len: sizeof(_ports), buffer: &_ports); |
58 | } |
59 | |
60 | if (!ports) { |
61 | *verdict = NF_DROP; |
62 | return 0; |
63 | } |
64 | |
65 | /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ |
66 | |
67 | if (likely(!ip_vs_iph_inverse(iph))) |
68 | svc = ip_vs_service_find(ipvs, af, fwmark: skb->mark, protocol: iph->protocol, |
69 | vaddr: &iph->daddr, vport: ports[1]); |
70 | else |
71 | svc = ip_vs_service_find(ipvs, af, fwmark: skb->mark, protocol: iph->protocol, |
72 | vaddr: &iph->saddr, vport: ports[0]); |
73 | |
74 | if (svc) { |
75 | int ignored; |
76 | |
77 | if (ip_vs_todrop(ipvs)) { |
78 | /* |
79 | * It seems that we are very loaded. |
80 | * We have to drop this packet :( |
81 | */ |
82 | *verdict = NF_DROP; |
83 | return 0; |
84 | } |
85 | |
86 | /* |
87 | * Let the virtual server select a real server for the |
88 | * incoming connection, and create a connection entry. |
89 | */ |
90 | *cpp = ip_vs_schedule(svc, skb, pd, ignored: &ignored, iph); |
91 | if (!*cpp && ignored <= 0) { |
92 | if (!ignored) |
93 | *verdict = ip_vs_leave(svc, skb, pd, iph); |
94 | else |
95 | *verdict = NF_DROP; |
96 | return 0; |
97 | } |
98 | } |
99 | /* NF_ACCEPT */ |
100 | return 1; |
101 | } |
102 | |
103 | |
104 | static inline void |
105 | tcp_fast_csum_update(int af, struct tcphdr *tcph, |
106 | const union nf_inet_addr *oldip, |
107 | const union nf_inet_addr *newip, |
108 | __be16 oldport, __be16 newport) |
109 | { |
110 | #ifdef CONFIG_IP_VS_IPV6 |
111 | if (af == AF_INET6) |
112 | tcph->check = |
113 | csum_fold(sum: ip_vs_check_diff16(old: oldip->ip6, new: newip->ip6, |
114 | oldsum: ip_vs_check_diff2(old: oldport, new: newport, |
115 | oldsum: ~csum_unfold(n: tcph->check)))); |
116 | else |
117 | #endif |
118 | tcph->check = |
119 | csum_fold(sum: ip_vs_check_diff4(old: oldip->ip, new: newip->ip, |
120 | oldsum: ip_vs_check_diff2(old: oldport, new: newport, |
121 | oldsum: ~csum_unfold(n: tcph->check)))); |
122 | } |
123 | |
124 | |
125 | static inline void |
126 | tcp_partial_csum_update(int af, struct tcphdr *tcph, |
127 | const union nf_inet_addr *oldip, |
128 | const union nf_inet_addr *newip, |
129 | __be16 oldlen, __be16 newlen) |
130 | { |
131 | #ifdef CONFIG_IP_VS_IPV6 |
132 | if (af == AF_INET6) |
133 | tcph->check = |
134 | ~csum_fold(sum: ip_vs_check_diff16(old: oldip->ip6, new: newip->ip6, |
135 | oldsum: ip_vs_check_diff2(old: oldlen, new: newlen, |
136 | oldsum: csum_unfold(n: tcph->check)))); |
137 | else |
138 | #endif |
139 | tcph->check = |
140 | ~csum_fold(sum: ip_vs_check_diff4(old: oldip->ip, new: newip->ip, |
141 | oldsum: ip_vs_check_diff2(old: oldlen, new: newlen, |
142 | oldsum: csum_unfold(n: tcph->check)))); |
143 | } |
144 | |
145 | |
146 | INDIRECT_CALLABLE_SCOPE int |
147 | tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, |
148 | struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) |
149 | { |
150 | struct tcphdr *tcph; |
151 | unsigned int tcphoff = iph->len; |
152 | bool payload_csum = false; |
153 | int oldlen; |
154 | |
155 | #ifdef CONFIG_IP_VS_IPV6 |
156 | if (cp->af == AF_INET6 && iph->fragoffs) |
157 | return 1; |
158 | #endif |
159 | oldlen = skb->len - tcphoff; |
160 | |
161 | /* csum_check requires unshared skb */ |
162 | if (skb_ensure_writable(skb, write_len: tcphoff + sizeof(*tcph))) |
163 | return 0; |
164 | |
165 | if (unlikely(cp->app != NULL)) { |
166 | int ret; |
167 | |
168 | /* Some checks before mangling */ |
169 | if (!tcp_csum_check(af: cp->af, skb, pp)) |
170 | return 0; |
171 | |
172 | /* Call application helper if needed */ |
173 | if (!(ret = ip_vs_app_pkt_out(cp, skb, ipvsh: iph))) |
174 | return 0; |
175 | /* ret=2: csum update is needed after payload mangling */ |
176 | if (ret == 1) |
177 | oldlen = skb->len - tcphoff; |
178 | else |
179 | payload_csum = true; |
180 | } |
181 | |
182 | tcph = (void *)skb_network_header(skb) + tcphoff; |
183 | tcph->source = cp->vport; |
184 | |
185 | /* Adjust TCP checksums */ |
186 | if (skb->ip_summed == CHECKSUM_PARTIAL) { |
187 | tcp_partial_csum_update(af: cp->af, tcph, oldip: &cp->daddr, newip: &cp->vaddr, |
188 | htons(oldlen), |
189 | htons(skb->len - tcphoff)); |
190 | } else if (!payload_csum) { |
191 | /* Only port and addr are changed, do fast csum update */ |
192 | tcp_fast_csum_update(af: cp->af, tcph, oldip: &cp->daddr, newip: &cp->vaddr, |
193 | oldport: cp->dport, newport: cp->vport); |
194 | if (skb->ip_summed == CHECKSUM_COMPLETE) |
195 | skb->ip_summed = cp->app ? |
196 | CHECKSUM_UNNECESSARY : CHECKSUM_NONE; |
197 | } else { |
198 | /* full checksum calculation */ |
199 | tcph->check = 0; |
200 | skb->csum = skb_checksum(skb, offset: tcphoff, len: skb->len - tcphoff, csum: 0); |
201 | #ifdef CONFIG_IP_VS_IPV6 |
202 | if (cp->af == AF_INET6) |
203 | tcph->check = csum_ipv6_magic(saddr: &cp->vaddr.in6, |
204 | daddr: &cp->caddr.in6, |
205 | len: skb->len - tcphoff, |
206 | proto: cp->protocol, sum: skb->csum); |
207 | else |
208 | #endif |
209 | tcph->check = csum_tcpudp_magic(saddr: cp->vaddr.ip, |
210 | daddr: cp->caddr.ip, |
211 | len: skb->len - tcphoff, |
212 | proto: cp->protocol, |
213 | sum: skb->csum); |
214 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
215 | |
216 | IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n" , |
217 | pp->name, tcph->check, |
218 | (char*)&(tcph->check) - (char*)tcph); |
219 | } |
220 | return 1; |
221 | } |
222 | |
223 | |
224 | static int |
225 | tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, |
226 | struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) |
227 | { |
228 | struct tcphdr *tcph; |
229 | unsigned int tcphoff = iph->len; |
230 | bool payload_csum = false; |
231 | int oldlen; |
232 | |
233 | #ifdef CONFIG_IP_VS_IPV6 |
234 | if (cp->af == AF_INET6 && iph->fragoffs) |
235 | return 1; |
236 | #endif |
237 | oldlen = skb->len - tcphoff; |
238 | |
239 | /* csum_check requires unshared skb */ |
240 | if (skb_ensure_writable(skb, write_len: tcphoff + sizeof(*tcph))) |
241 | return 0; |
242 | |
243 | if (unlikely(cp->app != NULL)) { |
244 | int ret; |
245 | |
246 | /* Some checks before mangling */ |
247 | if (!tcp_csum_check(af: cp->af, skb, pp)) |
248 | return 0; |
249 | |
250 | /* |
251 | * Attempt ip_vs_app call. |
252 | * It will fix ip_vs_conn and iph ack_seq stuff |
253 | */ |
254 | if (!(ret = ip_vs_app_pkt_in(cp, skb, ipvsh: iph))) |
255 | return 0; |
256 | /* ret=2: csum update is needed after payload mangling */ |
257 | if (ret == 1) |
258 | oldlen = skb->len - tcphoff; |
259 | else |
260 | payload_csum = true; |
261 | } |
262 | |
263 | tcph = (void *)skb_network_header(skb) + tcphoff; |
264 | tcph->dest = cp->dport; |
265 | |
266 | /* |
267 | * Adjust TCP checksums |
268 | */ |
269 | if (skb->ip_summed == CHECKSUM_PARTIAL) { |
270 | tcp_partial_csum_update(af: cp->af, tcph, oldip: &cp->vaddr, newip: &cp->daddr, |
271 | htons(oldlen), |
272 | htons(skb->len - tcphoff)); |
273 | } else if (!payload_csum) { |
274 | /* Only port and addr are changed, do fast csum update */ |
275 | tcp_fast_csum_update(af: cp->af, tcph, oldip: &cp->vaddr, newip: &cp->daddr, |
276 | oldport: cp->vport, newport: cp->dport); |
277 | if (skb->ip_summed == CHECKSUM_COMPLETE) |
278 | skb->ip_summed = cp->app ? |
279 | CHECKSUM_UNNECESSARY : CHECKSUM_NONE; |
280 | } else { |
281 | /* full checksum calculation */ |
282 | tcph->check = 0; |
283 | skb->csum = skb_checksum(skb, offset: tcphoff, len: skb->len - tcphoff, csum: 0); |
284 | #ifdef CONFIG_IP_VS_IPV6 |
285 | if (cp->af == AF_INET6) |
286 | tcph->check = csum_ipv6_magic(saddr: &cp->caddr.in6, |
287 | daddr: &cp->daddr.in6, |
288 | len: skb->len - tcphoff, |
289 | proto: cp->protocol, sum: skb->csum); |
290 | else |
291 | #endif |
292 | tcph->check = csum_tcpudp_magic(saddr: cp->caddr.ip, |
293 | daddr: cp->daddr.ip, |
294 | len: skb->len - tcphoff, |
295 | proto: cp->protocol, |
296 | sum: skb->csum); |
297 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
298 | } |
299 | return 1; |
300 | } |
301 | |
302 | |
303 | static int |
304 | tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) |
305 | { |
306 | unsigned int tcphoff; |
307 | |
308 | #ifdef CONFIG_IP_VS_IPV6 |
309 | if (af == AF_INET6) |
310 | tcphoff = sizeof(struct ipv6hdr); |
311 | else |
312 | #endif |
313 | tcphoff = ip_hdrlen(skb); |
314 | |
315 | switch (skb->ip_summed) { |
316 | case CHECKSUM_NONE: |
317 | skb->csum = skb_checksum(skb, offset: tcphoff, len: skb->len - tcphoff, csum: 0); |
318 | fallthrough; |
319 | case CHECKSUM_COMPLETE: |
320 | #ifdef CONFIG_IP_VS_IPV6 |
321 | if (af == AF_INET6) { |
322 | if (csum_ipv6_magic(saddr: &ipv6_hdr(skb)->saddr, |
323 | daddr: &ipv6_hdr(skb)->daddr, |
324 | len: skb->len - tcphoff, |
325 | proto: ipv6_hdr(skb)->nexthdr, |
326 | sum: skb->csum)) { |
327 | IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, |
328 | "Failed checksum for" ); |
329 | return 0; |
330 | } |
331 | } else |
332 | #endif |
333 | if (csum_tcpudp_magic(saddr: ip_hdr(skb)->saddr, |
334 | daddr: ip_hdr(skb)->daddr, |
335 | len: skb->len - tcphoff, |
336 | proto: ip_hdr(skb)->protocol, |
337 | sum: skb->csum)) { |
338 | IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, |
339 | "Failed checksum for" ); |
340 | return 0; |
341 | } |
342 | break; |
343 | default: |
344 | /* No need to checksum. */ |
345 | break; |
346 | } |
347 | |
348 | return 1; |
349 | } |
350 | |
351 | |
352 | #define TCP_DIR_INPUT 0 |
353 | #define TCP_DIR_OUTPUT 4 |
354 | #define TCP_DIR_INPUT_ONLY 8 |
355 | |
356 | static const int tcp_state_off[IP_VS_DIR_LAST] = { |
357 | [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, |
358 | [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, |
359 | [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, |
360 | }; |
361 | |
362 | /* |
363 | * Timeout table[state] |
364 | */ |
365 | static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { |
366 | [IP_VS_TCP_S_NONE] = 2*HZ, |
367 | [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, |
368 | [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, |
369 | [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, |
370 | [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, |
371 | [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, |
372 | [IP_VS_TCP_S_CLOSE] = 10*HZ, |
373 | [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, |
374 | [IP_VS_TCP_S_LAST_ACK] = 30*HZ, |
375 | [IP_VS_TCP_S_LISTEN] = 2*60*HZ, |
376 | [IP_VS_TCP_S_SYNACK] = 120*HZ, |
377 | [IP_VS_TCP_S_LAST] = 2*HZ, |
378 | }; |
379 | |
380 | static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { |
381 | [IP_VS_TCP_S_NONE] = "NONE" , |
382 | [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED" , |
383 | [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT" , |
384 | [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV" , |
385 | [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT" , |
386 | [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT" , |
387 | [IP_VS_TCP_S_CLOSE] = "CLOSE" , |
388 | [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT" , |
389 | [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK" , |
390 | [IP_VS_TCP_S_LISTEN] = "LISTEN" , |
391 | [IP_VS_TCP_S_SYNACK] = "SYNACK" , |
392 | [IP_VS_TCP_S_LAST] = "BUG!" , |
393 | }; |
394 | |
395 | static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = { |
396 | [IP_VS_TCP_S_NONE] = false, |
397 | [IP_VS_TCP_S_ESTABLISHED] = true, |
398 | [IP_VS_TCP_S_SYN_SENT] = true, |
399 | [IP_VS_TCP_S_SYN_RECV] = true, |
400 | [IP_VS_TCP_S_FIN_WAIT] = false, |
401 | [IP_VS_TCP_S_TIME_WAIT] = false, |
402 | [IP_VS_TCP_S_CLOSE] = false, |
403 | [IP_VS_TCP_S_CLOSE_WAIT] = false, |
404 | [IP_VS_TCP_S_LAST_ACK] = false, |
405 | [IP_VS_TCP_S_LISTEN] = false, |
406 | [IP_VS_TCP_S_SYNACK] = true, |
407 | }; |
408 | |
/* Short aliases that keep the state transition tables below readable. */
#define sNO	IP_VS_TCP_S_NONE
#define sES	IP_VS_TCP_S_ESTABLISHED
#define sSS	IP_VS_TCP_S_SYN_SENT
#define sSR	IP_VS_TCP_S_SYN_RECV
#define sFW	IP_VS_TCP_S_FIN_WAIT
#define sTW	IP_VS_TCP_S_TIME_WAIT
#define sCL	IP_VS_TCP_S_CLOSE
#define sCW	IP_VS_TCP_S_CLOSE_WAIT
#define sLA	IP_VS_TCP_S_LAST_ACK
#define sLI	IP_VS_TCP_S_LISTEN
#define sSA	IP_VS_TCP_S_SYNACK
420 | |
421 | struct tcp_states_t { |
422 | int next_state[IP_VS_TCP_S_LAST]; |
423 | }; |
424 | |
425 | static const char * tcp_state_name(int state) |
426 | { |
427 | if (state >= IP_VS_TCP_S_LAST) |
428 | return "ERR!" ; |
429 | return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?" ; |
430 | } |
431 | |
432 | static bool tcp_state_active(int state) |
433 | { |
434 | if (state >= IP_VS_TCP_S_LAST) |
435 | return false; |
436 | return tcp_state_active_table[state]; |
437 | } |
438 | |
439 | static struct tcp_states_t tcp_states[] = { |
440 | /* INPUT */ |
441 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ |
442 | /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, |
443 | /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, |
444 | /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, |
445 | /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, |
446 | |
447 | /* OUTPUT */ |
448 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ |
449 | /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, |
450 | /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, |
451 | /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, |
452 | /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, |
453 | |
454 | /* INPUT-ONLY */ |
455 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ |
456 | /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, |
457 | /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, |
458 | /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, |
459 | /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, |
460 | }; |
461 | |
462 | static struct tcp_states_t tcp_states_dos[] = { |
463 | /* INPUT */ |
464 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ |
465 | /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, |
466 | /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, |
467 | /*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, |
468 | /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, |
469 | |
470 | /* OUTPUT */ |
471 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ |
472 | /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, |
473 | /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, |
474 | /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, |
475 | /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, |
476 | |
477 | /* INPUT-ONLY */ |
478 | /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ |
479 | /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, |
480 | /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, |
481 | /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, |
482 | /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, |
483 | }; |
484 | |
485 | static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags) |
486 | { |
487 | int on = (flags & 1); /* secure_tcp */ |
488 | |
489 | /* |
490 | ** FIXME: change secure_tcp to independent sysctl var |
491 | ** or make it per-service or per-app because it is valid |
492 | ** for most if not for all of the applications. Something |
493 | ** like "capabilities" (flags) for each object. |
494 | */ |
495 | pd->tcp_state_table = (on ? tcp_states_dos : tcp_states); |
496 | } |
497 | |
498 | static inline int tcp_state_idx(struct tcphdr *th) |
499 | { |
500 | if (th->rst) |
501 | return 3; |
502 | if (th->syn) |
503 | return 0; |
504 | if (th->fin) |
505 | return 1; |
506 | if (th->ack) |
507 | return 2; |
508 | return -1; |
509 | } |
510 | |
511 | static inline void |
512 | set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, |
513 | int direction, struct tcphdr *th) |
514 | { |
515 | int state_idx; |
516 | int new_state = IP_VS_TCP_S_CLOSE; |
517 | int state_off = tcp_state_off[direction]; |
518 | |
519 | /* |
520 | * Update state offset to INPUT_ONLY if necessary |
521 | * or delete NO_OUTPUT flag if output packet detected |
522 | */ |
523 | if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { |
524 | if (state_off == TCP_DIR_OUTPUT) |
525 | cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; |
526 | else |
527 | state_off = TCP_DIR_INPUT_ONLY; |
528 | } |
529 | |
530 | if ((state_idx = tcp_state_idx(th)) < 0) { |
531 | IP_VS_DBG(8, "tcp_state_idx=%d!!!\n" , state_idx); |
532 | goto tcp_state_out; |
533 | } |
534 | |
535 | new_state = |
536 | pd->tcp_state_table[state_off+state_idx].next_state[cp->state]; |
537 | |
538 | tcp_state_out: |
539 | if (new_state != cp->state) { |
540 | struct ip_vs_dest *dest = cp->dest; |
541 | |
542 | IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d " |
543 | "d:%s:%d state: %s->%s conn->refcnt:%d\n" , |
544 | pd->pp->name, |
545 | ((state_off == TCP_DIR_OUTPUT) ? |
546 | "output " : "input " ), |
547 | th->syn ? 'S' : '.', |
548 | th->fin ? 'F' : '.', |
549 | th->ack ? 'A' : '.', |
550 | th->rst ? 'R' : '.', |
551 | IP_VS_DBG_ADDR(cp->af, &cp->caddr), |
552 | ntohs(cp->cport), |
553 | IP_VS_DBG_ADDR(cp->af, &cp->vaddr), |
554 | ntohs(cp->vport), |
555 | IP_VS_DBG_ADDR(cp->daf, &cp->daddr), |
556 | ntohs(cp->dport), |
557 | tcp_state_name(cp->state), |
558 | tcp_state_name(new_state), |
559 | refcount_read(&cp->refcnt)); |
560 | |
561 | if (dest) { |
562 | if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && |
563 | !tcp_state_active(state: new_state)) { |
564 | atomic_dec(v: &dest->activeconns); |
565 | atomic_inc(v: &dest->inactconns); |
566 | cp->flags |= IP_VS_CONN_F_INACTIVE; |
567 | } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && |
568 | tcp_state_active(state: new_state)) { |
569 | atomic_inc(v: &dest->activeconns); |
570 | atomic_dec(v: &dest->inactconns); |
571 | cp->flags &= ~IP_VS_CONN_F_INACTIVE; |
572 | } |
573 | } |
574 | if (new_state == IP_VS_TCP_S_ESTABLISHED) |
575 | ip_vs_control_assure_ct(cp); |
576 | } |
577 | |
578 | if (likely(pd)) |
579 | cp->timeout = pd->timeout_table[cp->state = new_state]; |
580 | else /* What to do ? */ |
581 | cp->timeout = tcp_timeouts[cp->state = new_state]; |
582 | } |
583 | |
584 | /* |
585 | * Handle state transitions |
586 | */ |
587 | static void |
588 | tcp_state_transition(struct ip_vs_conn *cp, int direction, |
589 | const struct sk_buff *skb, |
590 | struct ip_vs_proto_data *pd) |
591 | { |
592 | struct tcphdr _tcph, *th; |
593 | |
594 | #ifdef CONFIG_IP_VS_IPV6 |
595 | int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); |
596 | #else |
597 | int ihl = ip_hdrlen(skb); |
598 | #endif |
599 | |
600 | th = skb_header_pointer(skb, offset: ihl, len: sizeof(_tcph), buffer: &_tcph); |
601 | if (th == NULL) |
602 | return; |
603 | |
604 | spin_lock_bh(lock: &cp->lock); |
605 | set_tcp_state(pd, cp, direction, th); |
606 | spin_unlock_bh(lock: &cp->lock); |
607 | } |
608 | |
609 | static inline __u16 tcp_app_hashkey(__be16 port) |
610 | { |
611 | return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) |
612 | & TCP_APP_TAB_MASK; |
613 | } |
614 | |
615 | |
616 | static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) |
617 | { |
618 | struct ip_vs_app *i; |
619 | __u16 hash; |
620 | __be16 port = inc->port; |
621 | int ret = 0; |
622 | struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); |
623 | |
624 | hash = tcp_app_hashkey(port); |
625 | |
626 | list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { |
627 | if (i->port == port) { |
628 | ret = -EEXIST; |
629 | goto out; |
630 | } |
631 | } |
632 | list_add_rcu(new: &inc->p_list, head: &ipvs->tcp_apps[hash]); |
633 | atomic_inc(v: &pd->appcnt); |
634 | |
635 | out: |
636 | return ret; |
637 | } |
638 | |
639 | |
640 | static void |
641 | tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) |
642 | { |
643 | struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); |
644 | |
645 | atomic_dec(v: &pd->appcnt); |
646 | list_del_rcu(entry: &inc->p_list); |
647 | } |
648 | |
649 | |
650 | static int |
651 | tcp_app_conn_bind(struct ip_vs_conn *cp) |
652 | { |
653 | struct netns_ipvs *ipvs = cp->ipvs; |
654 | int hash; |
655 | struct ip_vs_app *inc; |
656 | int result = 0; |
657 | |
658 | /* Default binding: bind app only for NAT */ |
659 | if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) |
660 | return 0; |
661 | |
662 | /* Lookup application incarnations and bind the right one */ |
663 | hash = tcp_app_hashkey(port: cp->vport); |
664 | |
665 | list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) { |
666 | if (inc->port == cp->vport) { |
667 | if (unlikely(!ip_vs_app_inc_get(inc))) |
668 | break; |
669 | |
670 | IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" |
671 | "%s:%u to app %s on port %u\n" , |
672 | __func__, |
673 | IP_VS_DBG_ADDR(cp->af, &cp->caddr), |
674 | ntohs(cp->cport), |
675 | IP_VS_DBG_ADDR(cp->af, &cp->vaddr), |
676 | ntohs(cp->vport), |
677 | inc->name, ntohs(inc->port)); |
678 | |
679 | cp->app = inc; |
680 | if (inc->init_conn) |
681 | result = inc->init_conn(inc, cp); |
682 | break; |
683 | } |
684 | } |
685 | |
686 | return result; |
687 | } |
688 | |
689 | |
690 | /* |
691 | * Set LISTEN timeout. (ip_vs_conn_put will setup timer) |
692 | */ |
693 | void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) |
694 | { |
695 | struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs: cp->ipvs, IPPROTO_TCP); |
696 | |
697 | spin_lock_bh(lock: &cp->lock); |
698 | cp->state = IP_VS_TCP_S_LISTEN; |
699 | cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] |
700 | : tcp_timeouts[IP_VS_TCP_S_LISTEN]); |
701 | spin_unlock_bh(lock: &cp->lock); |
702 | } |
703 | |
704 | /* --------------------------------------------- |
705 | * timeouts is netns related now. |
706 | * --------------------------------------------- |
707 | */ |
708 | static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) |
709 | { |
710 | ip_vs_init_hash_table(table: ipvs->tcp_apps, TCP_APP_TAB_SIZE); |
711 | pd->timeout_table = ip_vs_create_timeout_table(table: (int *)tcp_timeouts, |
712 | size: sizeof(tcp_timeouts)); |
713 | if (!pd->timeout_table) |
714 | return -ENOMEM; |
715 | pd->tcp_state_table = tcp_states; |
716 | return 0; |
717 | } |
718 | |
719 | static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) |
720 | { |
721 | kfree(objp: pd->timeout_table); |
722 | } |
723 | |
724 | |
725 | struct ip_vs_protocol ip_vs_protocol_tcp = { |
726 | .name = "TCP" , |
727 | .protocol = IPPROTO_TCP, |
728 | .num_states = IP_VS_TCP_S_LAST, |
729 | .dont_defrag = 0, |
730 | .init = NULL, |
731 | .exit = NULL, |
732 | .init_netns = __ip_vs_tcp_init, |
733 | .exit_netns = __ip_vs_tcp_exit, |
734 | .register_app = tcp_register_app, |
735 | .unregister_app = tcp_unregister_app, |
736 | .conn_schedule = tcp_conn_schedule, |
737 | .conn_in_get = ip_vs_conn_in_get_proto, |
738 | .conn_out_get = ip_vs_conn_out_get_proto, |
739 | .snat_handler = tcp_snat_handler, |
740 | .dnat_handler = tcp_dnat_handler, |
741 | .state_name = tcp_state_name, |
742 | .state_transition = tcp_state_transition, |
743 | .app_conn_bind = tcp_app_conn_bind, |
744 | .debug_packet = ip_vs_tcpudp_debug_packet, |
745 | .timeout_change = tcp_timeout_change, |
746 | }; |
747 | |