1 | /* |
2 | * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. |
3 | * |
4 | * This software is available to you under a choice of one of two |
5 | * licenses. You may choose to be licensed under the terms of the GNU |
6 | * General Public License (GPL) Version 2, available from the file |
7 | * COPYING in the main directory of this source tree, or the |
8 | * OpenIB.org BSD license below: |
9 | * |
10 | * Redistribution and use in source and binary forms, with or |
11 | * without modification, are permitted provided that the following |
12 | * conditions are met: |
13 | * |
14 | * - Redistributions of source code must retain the above |
15 | * copyright notice, this list of conditions and the following |
16 | * disclaimer. |
17 | * |
18 | * - Redistributions in binary form must reproduce the above |
19 | * copyright notice, this list of conditions and the following |
20 | * disclaimer in the documentation and/or other materials |
21 | * provided with the distribution. |
22 | * |
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
30 | * SOFTWARE. |
31 | * |
32 | */ |
33 | #include <linux/kernel.h> |
34 | #include <linux/in.h> |
35 | #include <net/tcp.h> |
36 | |
37 | #include "rds.h" |
38 | #include "tcp.h" |
39 | |
40 | void rds_tcp_state_change(struct sock *sk) |
41 | { |
42 | void (*state_change)(struct sock *sk); |
43 | struct rds_conn_path *cp; |
44 | struct rds_tcp_connection *tc; |
45 | |
46 | read_lock_bh(&sk->sk_callback_lock); |
47 | cp = sk->sk_user_data; |
48 | if (!cp) { |
49 | state_change = sk->sk_state_change; |
50 | goto out; |
51 | } |
52 | tc = cp->cp_transport_data; |
53 | state_change = tc->t_orig_state_change; |
54 | |
55 | rdsdebug("sock %p state_change to %d\n" , tc->t_sock, sk->sk_state); |
56 | |
57 | switch (sk->sk_state) { |
58 | /* ignore connecting sockets as they make progress */ |
59 | case TCP_SYN_SENT: |
60 | case TCP_SYN_RECV: |
61 | break; |
62 | case TCP_ESTABLISHED: |
63 | /* Force the peer to reconnect so that we have the |
64 | * TCP ports going from <smaller-ip>.<transient> to |
65 | * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the |
66 | * RDS connection as RDS_CONN_UP until the reconnect, |
67 | * to avoid RDS datagram loss. |
68 | */ |
69 | if (rds_addr_cmp(a1: &cp->cp_conn->c_laddr, |
70 | a2: &cp->cp_conn->c_faddr) >= 0 && |
71 | rds_conn_path_transition(cp, old: RDS_CONN_CONNECTING, |
72 | new: RDS_CONN_ERROR)) { |
73 | rds_conn_path_drop(cpath: cp, destroy: false); |
74 | } else { |
75 | rds_connect_path_complete(conn: cp, curr: RDS_CONN_CONNECTING); |
76 | } |
77 | break; |
78 | case TCP_CLOSE_WAIT: |
79 | case TCP_CLOSE: |
80 | rds_conn_path_drop(cpath: cp, destroy: false); |
81 | break; |
82 | default: |
83 | break; |
84 | } |
85 | out: |
86 | read_unlock_bh(&sk->sk_callback_lock); |
87 | state_change(sk); |
88 | } |
89 | |
90 | int rds_tcp_conn_path_connect(struct rds_conn_path *cp) |
91 | { |
92 | struct socket *sock = NULL; |
93 | struct sockaddr_in6 sin6; |
94 | struct sockaddr_in sin; |
95 | struct sockaddr *addr; |
96 | int addrlen; |
97 | bool isv6; |
98 | int ret; |
99 | struct rds_connection *conn = cp->cp_conn; |
100 | struct rds_tcp_connection *tc = cp->cp_transport_data; |
101 | |
102 | /* for multipath rds,we only trigger the connection after |
103 | * the handshake probe has determined the number of paths. |
104 | */ |
105 | if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2) |
106 | return -EAGAIN; |
107 | |
108 | mutex_lock(&tc->t_conn_path_lock); |
109 | |
110 | if (rds_conn_path_up(cp)) { |
111 | mutex_unlock(lock: &tc->t_conn_path_lock); |
112 | return 0; |
113 | } |
114 | if (ipv6_addr_v4mapped(a: &conn->c_laddr)) { |
115 | ret = sock_create_kern(net: rds_conn_net(conn), PF_INET, |
116 | type: SOCK_STREAM, IPPROTO_TCP, res: &sock); |
117 | isv6 = false; |
118 | } else { |
119 | ret = sock_create_kern(net: rds_conn_net(conn), PF_INET6, |
120 | type: SOCK_STREAM, IPPROTO_TCP, res: &sock); |
121 | isv6 = true; |
122 | } |
123 | |
124 | if (ret < 0) |
125 | goto out; |
126 | |
127 | if (!rds_tcp_tune(sock)) { |
128 | ret = -EINVAL; |
129 | goto out; |
130 | } |
131 | |
132 | if (isv6) { |
133 | sin6.sin6_family = AF_INET6; |
134 | sin6.sin6_addr = conn->c_laddr; |
135 | sin6.sin6_port = 0; |
136 | sin6.sin6_flowinfo = 0; |
137 | sin6.sin6_scope_id = conn->c_dev_if; |
138 | addr = (struct sockaddr *)&sin6; |
139 | addrlen = sizeof(sin6); |
140 | } else { |
141 | sin.sin_family = AF_INET; |
142 | sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; |
143 | sin.sin_port = 0; |
144 | addr = (struct sockaddr *)&sin; |
145 | addrlen = sizeof(sin); |
146 | } |
147 | |
148 | ret = kernel_bind(sock, addr, addrlen); |
149 | if (ret) { |
150 | rdsdebug("bind failed with %d at address %pI6c\n" , |
151 | ret, &conn->c_laddr); |
152 | goto out; |
153 | } |
154 | |
155 | if (isv6) { |
156 | sin6.sin6_family = AF_INET6; |
157 | sin6.sin6_addr = conn->c_faddr; |
158 | sin6.sin6_port = htons(RDS_TCP_PORT); |
159 | sin6.sin6_flowinfo = 0; |
160 | sin6.sin6_scope_id = conn->c_dev_if; |
161 | addr = (struct sockaddr *)&sin6; |
162 | addrlen = sizeof(sin6); |
163 | } else { |
164 | sin.sin_family = AF_INET; |
165 | sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; |
166 | sin.sin_port = htons(RDS_TCP_PORT); |
167 | addr = (struct sockaddr *)&sin; |
168 | addrlen = sizeof(sin); |
169 | } |
170 | |
171 | /* |
172 | * once we call connect() we can start getting callbacks and they |
173 | * own the socket |
174 | */ |
175 | rds_tcp_set_callbacks(sock, cp); |
176 | ret = kernel_connect(sock, addr, addrlen, O_NONBLOCK); |
177 | |
178 | rdsdebug("connect to address %pI6c returned %d\n" , &conn->c_faddr, ret); |
179 | if (ret == -EINPROGRESS) |
180 | ret = 0; |
181 | if (ret == 0) { |
182 | rds_tcp_keepalive(sock); |
183 | sock = NULL; |
184 | } else { |
185 | rds_tcp_restore_callbacks(sock, tc: cp->cp_transport_data); |
186 | } |
187 | |
188 | out: |
189 | mutex_unlock(lock: &tc->t_conn_path_lock); |
190 | if (sock) |
191 | sock_release(sock); |
192 | return ret; |
193 | } |
194 | |
195 | /* |
196 | * Before killing the tcp socket this needs to serialize with callbacks. The |
197 | * caller has already grabbed the sending sem so we're serialized with other |
198 | * senders. |
199 | * |
200 | * TCP calls the callbacks with the sock lock so we hold it while we reset the |
201 | * callbacks to those set by TCP. Our callbacks won't execute again once we |
202 | * hold the sock lock. |
203 | */ |
204 | void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) |
205 | { |
206 | struct rds_tcp_connection *tc = cp->cp_transport_data; |
207 | struct socket *sock = tc->t_sock; |
208 | |
209 | rdsdebug("shutting down conn %p tc %p sock %p\n" , |
210 | cp->cp_conn, tc, sock); |
211 | |
212 | if (sock) { |
213 | if (rds_destroy_pending(conn: cp->cp_conn)) |
214 | sock_no_linger(sk: sock->sk); |
215 | sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); |
216 | lock_sock(sk: sock->sk); |
217 | rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ |
218 | |
219 | release_sock(sk: sock->sk); |
220 | sock_release(sock); |
221 | } |
222 | |
223 | if (tc->t_tinc) { |
224 | rds_inc_put(inc: &tc->t_tinc->ti_inc); |
225 | tc->t_tinc = NULL; |
226 | } |
227 | tc->t_tinc_hdr_rem = sizeof(struct rds_header); |
228 | tc->t_tinc_data_rem = 0; |
229 | } |
230 | |