// SPDX-License-Identifier: GPL-2.0+
/*
 * IPv6 IOAM Lightweight Tunnel implementation
 *
 * Author:
 * Justin Iurman <justin.iurman@uliege.be>
 */

#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/ioam6.h>
#include <linux/ioam6_iptunnel.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/lwtunnel.h>
#include <net/ioam6.h>
#include <net/netlink.h>
#include <net/ipv6.h>
#include <net/dst_cache.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>

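/* Bits of the 24-bit IOAM trace type (kept in the upper bits of type_be32)
 * that correspond to 4-octet ("short") node data fields, resp. 8-octet
 * ("wide") fields; used below to precompute the per-node data length.
 */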
#define IOAM6_MASK_SHORT_FIELDS 0xff100000
#define IOAM6_MASK_WIDE_FIELDS 0xe00000

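/* Pre-built encapsulation block: a Hop-by-Hop header, a 2-octet PadN option
 * for 4n alignment, the IOAM option header and the pre-allocated trace header.
 * It is copied as-is in front of the packet payload at transmit time.
 */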
struct ioam6_lwt_encap {
	struct ipv6_hopopt_hdr eh;
	u8 pad[2];	/* 2-octet padding for 4n-alignment */
	struct ioam6_hdr ioamh;
	struct ioam6_trace_hdr traceh;
} __packed;

struct ioam6_lwt_freq {
	u32 k;
	u32 n;
};

struct ioam6_lwt {
	struct dst_cache cache;
	struct ioam6_lwt_freq freq;
	atomic_t pkt_cnt;
	u8 mode;
	struct in6_addr tundst;
	struct ioam6_lwt_encap tuninfo;
};

static const struct netlink_range_validation freq_range = {
	.min = IOAM6_IPTUNNEL_FREQ_MIN,
	.max = IOAM6_IPTUNNEL_FREQ_MAX,
};

static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt)
{
	return (struct ioam6_lwt *)lwt->data;
}

static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt)
{
	return &ioam6_lwt_state(lwt)->tuninfo;
}

static struct ioam6_trace_hdr *ioam6_lwt_trace(struct lwtunnel_state *lwt)
{
	return &(ioam6_lwt_state(lwt)->tuninfo.traceh);
}

static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = {
	[IOAM6_IPTUNNEL_FREQ_K] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
	[IOAM6_IPTUNNEL_FREQ_N] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range),
	[IOAM6_IPTUNNEL_MODE] = NLA_POLICY_RANGE(NLA_U8,
						 IOAM6_IPTUNNEL_MODE_MIN,
						 IOAM6_IPTUNNEL_MODE_MAX),
	[IOAM6_IPTUNNEL_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
	[IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN(sizeof(struct ioam6_trace_hdr)),
};

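/* Sanity-check a trace header coming from userspace: the trace type must not
 * be empty, the pre-allocated data area must fit in the option, and bits 12-21
 * of the trace type (not supported here) must be clear. On success, nodelen is
 * precomputed in 4-octet units from the requested fields.
 */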
static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace)
{
	u32 fields;

	if (!trace->type_be32 || !trace->remlen ||
	    trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4 ||
	    trace->type.bit12 | trace->type.bit13 | trace->type.bit14 |
	    trace->type.bit15 | trace->type.bit16 | trace->type.bit17 |
	    trace->type.bit18 | trace->type.bit19 | trace->type.bit20 |
	    trace->type.bit21)
		return false;

	trace->nodelen = 0;
	fields = be32_to_cpu(trace->type_be32);

	trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS)
			  * (sizeof(__be32) / 4);
	trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS)
			  * (sizeof(__be64) / 4);

	return true;
}

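/* Parse the netlink attributes of an IOAM6 encap route and build the
 * corresponding lwtunnel state: sampling frequency ("k out of n" packets),
 * insertion mode, optional tunnel destination and the pre-allocated trace.
 */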
static int ioam6_build_state(struct net *net, struct nlattr *nla,
			     unsigned int family, const void *cfg,
			     struct lwtunnel_state **ts,
			     struct netlink_ext_ack *extack)
{
	struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1];
	struct ioam6_lwt_encap *tuninfo;
	struct ioam6_trace_hdr *trace;
	struct lwtunnel_state *lwt;
	struct ioam6_lwt *ilwt;
	int len_aligned, err;
	u32 freq_k, freq_n;
	u8 mode;

	if (family != AF_INET6)
		return -EINVAL;

	err = nla_parse_nested(tb, IOAM6_IPTUNNEL_MAX, nla,
			       ioam6_iptunnel_policy, extack);
	if (err < 0)
		return err;

	if ((!tb[IOAM6_IPTUNNEL_FREQ_K] && tb[IOAM6_IPTUNNEL_FREQ_N]) ||
	    (tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N])) {
		NL_SET_ERR_MSG(extack, "freq: missing parameter");
		return -EINVAL;
	} else if (!tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N]) {
		freq_k = IOAM6_IPTUNNEL_FREQ_MIN;
		freq_n = IOAM6_IPTUNNEL_FREQ_MIN;
	} else {
		freq_k = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_K]);
		freq_n = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_N]);

		if (freq_k > freq_n) {
			NL_SET_ERR_MSG(extack, "freq: k > n is forbidden");
			return -EINVAL;
		}
	}

	if (!tb[IOAM6_IPTUNNEL_MODE])
		mode = IOAM6_IPTUNNEL_MODE_INLINE;
	else
		mode = nla_get_u8(tb[IOAM6_IPTUNNEL_MODE]);

	if (!tb[IOAM6_IPTUNNEL_DST] && mode != IOAM6_IPTUNNEL_MODE_INLINE) {
		NL_SET_ERR_MSG(extack, "this mode needs a tunnel destination");
		return -EINVAL;
	}

	if (!tb[IOAM6_IPTUNNEL_TRACE]) {
		NL_SET_ERR_MSG(extack, "missing trace");
		return -EINVAL;
	}

	trace = nla_data(tb[IOAM6_IPTUNNEL_TRACE]);
	if (!ioam6_validate_trace_hdr(trace)) {
		NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_TRACE],
				    "invalid trace validation");
		return -EINVAL;
	}

	len_aligned = ALIGN(trace->remlen * 4, 8);
	lwt = lwtunnel_state_alloc(sizeof(*ilwt) + len_aligned);
	if (!lwt)
		return -ENOMEM;

	ilwt = ioam6_lwt_state(lwt);
	err = dst_cache_init(&ilwt->cache, GFP_ATOMIC);
	if (err) {
		kfree(lwt);
		return err;
	}

	atomic_set(&ilwt->pkt_cnt, 0);
	ilwt->freq.k = freq_k;
	ilwt->freq.n = freq_n;

	ilwt->mode = mode;
	if (tb[IOAM6_IPTUNNEL_DST])
		ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]);

	tuninfo = ioam6_lwt_info(lwt);
	tuninfo->eh.hdrlen = ((sizeof(*tuninfo) + len_aligned) >> 3) - 1;
	tuninfo->pad[0] = IPV6_TLV_PADN;
	tuninfo->ioamh.type = IOAM6_TYPE_PREALLOC;
	tuninfo->ioamh.opt_type = IPV6_TLV_IOAM;
	tuninfo->ioamh.opt_len = sizeof(tuninfo->ioamh) - 2 + sizeof(*trace)
				 + trace->remlen * 4;

	memcpy(&tuninfo->traceh, trace, sizeof(*trace));

	if (len_aligned - trace->remlen * 4) {
		tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN;
		tuninfo->traceh.data[trace->remlen * 4 + 1] = 2;
	}

	lwt->type = LWTUNNEL_ENCAP_IOAM6;
	lwt->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;

	*ts = lwt;

	return 0;
}

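/* Fill the node data of the trace that was just inserted: the trace header
 * sits right after the Hop-by-Hop header, the PadN option and the IOAM option
 * header. Nothing is written if the IOAM namespace is unknown on this node.
 */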
static int ioam6_do_fill(struct net *net, struct sk_buff *skb)
{
	struct ioam6_trace_hdr *trace;
	struct ioam6_namespace *ns;

	trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb)
					   + sizeof(struct ipv6_hopopt_hdr) + 2
					   + sizeof(struct ioam6_hdr));

	ns = ioam6_namespace(net, trace->namespace_id);
	if (ns)
		ioam6_fill_trace_data(skb, ns, trace, false);

	return 0;
}

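/* Inline insertion (same packet): make room for the pre-built Hop-by-Hop
 * block right after the IPv6 header, chain it via nexthdr, and fix up the
 * payload length and checksum accordingly.
 */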
static int ioam6_do_inline(struct net *net, struct sk_buff *skb,
			   struct ioam6_lwt_encap *tuninfo)
{
	struct ipv6hdr *oldhdr, *hdr;
	int hdrlen, err;

	hdrlen = (tuninfo->eh.hdrlen + 1) << 3;

	err = skb_cow_head(skb, hdrlen + skb->mac_len);
	if (unlikely(err))
		return err;

	oldhdr = ipv6_hdr(skb);
	skb_pull(skb, sizeof(*oldhdr));
	skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(*oldhdr));

	skb_push(skb, sizeof(*oldhdr) + hdrlen);
	skb_reset_network_header(skb);
	skb_mac_header_rebuild(skb);

	hdr = ipv6_hdr(skb);
	memmove(hdr, oldhdr, sizeof(*oldhdr));
	tuninfo->eh.nexthdr = hdr->nexthdr;

	skb_set_transport_header(skb, sizeof(*hdr));
	skb_postpush_rcsum(skb, hdr, sizeof(*hdr) + hdrlen);

	memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);

	hdr->nexthdr = NEXTHDR_HOP;
	hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));

	return ioam6_do_fill(net, skb);
}

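/* Encapsulation (ip6ip6): prepend a new outer IPv6 header followed by the
 * pre-built Hop-by-Hop block. The outer destination is the configured tunnel
 * destination and the source is picked from the outgoing device.
 */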
static int ioam6_do_encap(struct net *net, struct sk_buff *skb,
			  struct ioam6_lwt_encap *tuninfo,
			  struct in6_addr *tundst)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr, *inner_hdr;
	int hdrlen, len, err;

	hdrlen = (tuninfo->eh.hdrlen + 1) << 3;
	len = sizeof(*hdr) + hdrlen;

	err = skb_cow_head(skb, len + skb->mac_len);
	if (unlikely(err))
		return err;

	inner_hdr = ipv6_hdr(skb);

	skb_push(skb, len);
	skb_reset_network_header(skb);
	skb_mac_header_rebuild(skb);
	skb_set_transport_header(skb, sizeof(*hdr));

	tuninfo->eh.nexthdr = NEXTHDR_IPV6;
	memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);

	hdr = ipv6_hdr(skb);
	memcpy(hdr, inner_hdr, sizeof(*hdr));

	hdr->nexthdr = NEXTHDR_HOP;
	hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));
	hdr->daddr = *tundst;
	ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr,
			   IPV6_PREFER_SRC_PUBLIC, &hdr->saddr);

	skb_postpush_rcsum(skb, hdr, len);

	return ioam6_do_fill(net, skb);
}

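/* lwtunnel output hook: apply the "k over n" sampling, insert the IOAM data
 * according to the configured mode, then either hand the packet back to the
 * original output path or re-route it when the destination address changed
 * (encap mode), using a dst cache to avoid a route lookup per packet.
 */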
static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct in6_addr orig_daddr;
	struct ioam6_lwt *ilwt;
	int err = -EINVAL;
	u32 pkt_cnt;

	if (skb->protocol != htons(ETH_P_IPV6))
		goto drop;

	ilwt = ioam6_lwt_state(dst->lwtstate);

	/* Check for insertion frequency (i.e., "k over n" insertions) */
	pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt);
	if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k)
		goto out;

	orig_daddr = ipv6_hdr(skb)->daddr;

	switch (ilwt->mode) {
	case IOAM6_IPTUNNEL_MODE_INLINE:
do_inline:
		/* Direct insertion - if there is no Hop-by-Hop yet */
		if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP)
			goto out;

		err = ioam6_do_inline(net, skb, &ilwt->tuninfo);
		if (unlikely(err))
			goto drop;

		break;
	case IOAM6_IPTUNNEL_MODE_ENCAP:
do_encap:
		/* Encapsulation (ip6ip6) */
		err = ioam6_do_encap(net, skb, &ilwt->tuninfo, &ilwt->tundst);
		if (unlikely(err))
			goto drop;

		break;
	case IOAM6_IPTUNNEL_MODE_AUTO:
		/* Automatic (RFC8200 compliant):
		 * - local packets -> INLINE mode
		 * - in-transit packets -> ENCAP mode
		 */
		if (!skb->dev)
			goto do_inline;

		goto do_encap;
	default:
		goto drop;
	}

	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
	if (unlikely(err))
		goto drop;

	if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) {
		preempt_disable();
		dst = dst_cache_get(&ilwt->cache);
		preempt_enable();

		if (unlikely(!dst)) {
			struct ipv6hdr *hdr = ipv6_hdr(skb);
			struct flowi6 fl6;

			memset(&fl6, 0, sizeof(fl6));
			fl6.daddr = hdr->daddr;
			fl6.saddr = hdr->saddr;
			fl6.flowlabel = ip6_flowinfo(hdr);
			fl6.flowi6_mark = skb->mark;
			fl6.flowi6_proto = hdr->nexthdr;

			dst = ip6_route_output(net, NULL, &fl6);
			if (dst->error) {
				err = dst->error;
				dst_release(dst);
				goto drop;
			}

			preempt_disable();
			dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr);
			preempt_enable();
		}

		skb_dst_drop(skb);
		skb_dst_set(skb, dst);

		return dst_output(net, sk, skb);
	}
out:
	return dst->lwtstate->orig_output(net, sk, skb);
drop:
	kfree_skb(skb);
	return err;
}

static void ioam6_destroy_state(struct lwtunnel_state *lwt)
{
	dst_cache_destroy(&ioam6_lwt_state(lwt)->cache);
}

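/* Dump the encap configuration (frequency, mode, tunnel destination when
 * relevant, and trace header) back to userspace.
 */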
static int ioam6_fill_encap_info(struct sk_buff *skb,
				 struct lwtunnel_state *lwtstate)
{
	struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
	int err;

	err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_K, ilwt->freq.k);
	if (err)
		goto ret;

	err = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_N, ilwt->freq.n);
	if (err)
		goto ret;

	err = nla_put_u8(skb, IOAM6_IPTUNNEL_MODE, ilwt->mode);
	if (err)
		goto ret;

	if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) {
		err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_DST, &ilwt->tundst);
		if (err)
			goto ret;
	}

	err = nla_put(skb, IOAM6_IPTUNNEL_TRACE, sizeof(ilwt->tuninfo.traceh),
		      &ilwt->tuninfo.traceh);
ret:
	return err;
}

static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
	int nlsize;

	nlsize = nla_total_size(sizeof(ilwt->freq.k)) +
		 nla_total_size(sizeof(ilwt->freq.n)) +
		 nla_total_size(sizeof(ilwt->mode)) +
		 nla_total_size(sizeof(ilwt->tuninfo.traceh));

	if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE)
		nlsize += nla_total_size(sizeof(ilwt->tundst));

	return nlsize;
}

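/* Used when comparing routes: returns 0 when both lwtunnel states carry an
 * equivalent IOAM configuration, non-zero otherwise.
 */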
static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct ioam6_trace_hdr *trace_a = ioam6_lwt_trace(a);
	struct ioam6_trace_hdr *trace_b = ioam6_lwt_trace(b);
	struct ioam6_lwt *ilwt_a = ioam6_lwt_state(a);
	struct ioam6_lwt *ilwt_b = ioam6_lwt_state(b);

	return (ilwt_a->freq.k != ilwt_b->freq.k ||
		ilwt_a->freq.n != ilwt_b->freq.n ||
		ilwt_a->mode != ilwt_b->mode ||
		(ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE &&
		 !ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) ||
		trace_a->namespace_id != trace_b->namespace_id);
}

static const struct lwtunnel_encap_ops ioam6_iptun_ops = {
	.build_state = ioam6_build_state,
	.destroy_state = ioam6_destroy_state,
	.output = ioam6_output,
	.fill_encap = ioam6_fill_encap_info,
	.get_encap_size = ioam6_encap_nlsize,
	.cmp_encap = ioam6_encap_cmp,
	.owner = THIS_MODULE,
};

int __init ioam6_iptunnel_init(void)
{
	return lwtunnel_encap_add_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
}

void ioam6_iptunnel_exit(void)
{
	lwtunnel_encap_del_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
}