1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3 * IPv6 IOAM Lightweight Tunnel implementation
4 *
5 * Author:
6 * Justin Iurman <justin.iurman@uliege.be>
7 */
8
9 #include <linux/kernel.h>
10 #include <linux/skbuff.h>
11 #include <linux/net.h>
12 #include <linux/in6.h>
13 #include <linux/ioam6.h>
14 #include <linux/ioam6_iptunnel.h>
15 #include <net/dst.h>
16 #include <net/sock.h>
17 #include <net/lwtunnel.h>
18 #include <net/ioam6.h>
19 #include <net/netlink.h>
20 #include <net/ipv6.h>
21 #include <net/dst_cache.h>
22 #include <net/ip6_route.h>
23 #include <net/addrconf.h>
24
/*
 * Trace-type bit masks, applied to the trace type once converted to CPU
 * byte order. Each bit set under the "short" mask adds one 4-octet unit to
 * the per-node data length, each bit set under the "wide" mask adds two.
 */
#define IOAM6_MASK_SHORT_FIELDS	0xff100000
#define IOAM6_MASK_WIDE_FIELDS	0x00e00000
27
28 struct ioam6_lwt_encap {
29 struct ipv6_hopopt_hdr eh;
30 u8 pad[2]; /* 2-octet padding for 4n-alignment */
31 struct ioam6_hdr ioamh;
32 struct ioam6_trace_hdr traceh;
33 } __packed;
34
35 struct ioam6_lwt {
36 struct dst_cache cache;
37 u8 mode;
38 struct in6_addr tundst;
39 struct ioam6_lwt_encap tuninfo;
40 };
41
ioam6_lwt_state(struct lwtunnel_state * lwt)42 static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt)
43 {
44 return (struct ioam6_lwt *)lwt->data;
45 }
46
ioam6_lwt_info(struct lwtunnel_state * lwt)47 static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt)
48 {
49 return &ioam6_lwt_state(lwt)->tuninfo;
50 }
51
ioam6_lwt_trace(struct lwtunnel_state * lwt)52 static struct ioam6_trace_hdr *ioam6_lwt_trace(struct lwtunnel_state *lwt)
53 {
54 return &(ioam6_lwt_state(lwt)->tuninfo.traceh);
55 }
56
57 static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = {
58 [IOAM6_IPTUNNEL_MODE] = NLA_POLICY_RANGE(NLA_U8,
59 IOAM6_IPTUNNEL_MODE_MIN,
60 IOAM6_IPTUNNEL_MODE_MAX),
61 [IOAM6_IPTUNNEL_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
62 [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN(sizeof(struct ioam6_trace_hdr)),
63 };
64
ioam6_validate_trace_hdr(struct ioam6_trace_hdr * trace)65 static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace)
66 {
67 u32 fields;
68
69 if (!trace->type_be32 || !trace->remlen ||
70 trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4 ||
71 trace->type.bit12 | trace->type.bit13 | trace->type.bit14 |
72 trace->type.bit15 | trace->type.bit16 | trace->type.bit17 |
73 trace->type.bit18 | trace->type.bit19 | trace->type.bit20 |
74 trace->type.bit21)
75 return false;
76
77 trace->nodelen = 0;
78 fields = be32_to_cpu(trace->type_be32);
79
80 trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS)
81 * (sizeof(__be32) / 4);
82 trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS)
83 * (sizeof(__be64) / 4);
84
85 return true;
86 }
87
ioam6_build_state(struct net * net,struct nlattr * nla,unsigned int family,const void * cfg,struct lwtunnel_state ** ts,struct netlink_ext_ack * extack)88 static int ioam6_build_state(struct net *net, struct nlattr *nla,
89 unsigned int family, const void *cfg,
90 struct lwtunnel_state **ts,
91 struct netlink_ext_ack *extack)
92 {
93 struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1];
94 struct ioam6_lwt_encap *tuninfo;
95 struct ioam6_trace_hdr *trace;
96 struct lwtunnel_state *lwt;
97 struct ioam6_lwt *ilwt;
98 int len_aligned, err;
99 u8 mode;
100
101 if (family != AF_INET6)
102 return -EINVAL;
103
104 err = nla_parse_nested(tb, IOAM6_IPTUNNEL_MAX, nla,
105 ioam6_iptunnel_policy, extack);
106 if (err < 0)
107 return err;
108
109 if (!tb[IOAM6_IPTUNNEL_MODE])
110 mode = IOAM6_IPTUNNEL_MODE_INLINE;
111 else
112 mode = nla_get_u8(tb[IOAM6_IPTUNNEL_MODE]);
113
114 if (!tb[IOAM6_IPTUNNEL_DST] && mode != IOAM6_IPTUNNEL_MODE_INLINE) {
115 NL_SET_ERR_MSG(extack, "this mode needs a tunnel destination");
116 return -EINVAL;
117 }
118
119 if (!tb[IOAM6_IPTUNNEL_TRACE]) {
120 NL_SET_ERR_MSG(extack, "missing trace");
121 return -EINVAL;
122 }
123
124 trace = nla_data(tb[IOAM6_IPTUNNEL_TRACE]);
125 if (!ioam6_validate_trace_hdr(trace)) {
126 NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_TRACE],
127 "invalid trace validation");
128 return -EINVAL;
129 }
130
131 len_aligned = ALIGN(trace->remlen * 4, 8);
132 lwt = lwtunnel_state_alloc(sizeof(*ilwt) + len_aligned);
133 if (!lwt)
134 return -ENOMEM;
135
136 ilwt = ioam6_lwt_state(lwt);
137 err = dst_cache_init(&ilwt->cache, GFP_ATOMIC);
138 if (err) {
139 kfree(lwt);
140 return err;
141 }
142
143 ilwt->mode = mode;
144 if (tb[IOAM6_IPTUNNEL_DST])
145 ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]);
146
147 tuninfo = ioam6_lwt_info(lwt);
148 tuninfo->eh.hdrlen = ((sizeof(*tuninfo) + len_aligned) >> 3) - 1;
149 tuninfo->pad[0] = IPV6_TLV_PADN;
150 tuninfo->ioamh.type = IOAM6_TYPE_PREALLOC;
151 tuninfo->ioamh.opt_type = IPV6_TLV_IOAM;
152 tuninfo->ioamh.opt_len = sizeof(tuninfo->ioamh) - 2 + sizeof(*trace)
153 + trace->remlen * 4;
154
155 memcpy(&tuninfo->traceh, trace, sizeof(*trace));
156
157 if (len_aligned - trace->remlen * 4) {
158 tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN;
159 tuninfo->traceh.data[trace->remlen * 4 + 1] = 2;
160 }
161
162 lwt->type = LWTUNNEL_ENCAP_IOAM6;
163 lwt->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
164
165 *ts = lwt;
166
167 return 0;
168 }
169
ioam6_do_fill(struct net * net,struct sk_buff * skb)170 static int ioam6_do_fill(struct net *net, struct sk_buff *skb)
171 {
172 struct ioam6_trace_hdr *trace;
173 struct ioam6_namespace *ns;
174
175 trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb)
176 + sizeof(struct ipv6_hopopt_hdr) + 2
177 + sizeof(struct ioam6_hdr));
178
179 ns = ioam6_namespace(net, trace->namespace_id);
180 if (ns)
181 ioam6_fill_trace_data(skb, ns, trace, false);
182
183 return 0;
184 }
185
ioam6_do_inline(struct net * net,struct sk_buff * skb,struct ioam6_lwt_encap * tuninfo)186 static int ioam6_do_inline(struct net *net, struct sk_buff *skb,
187 struct ioam6_lwt_encap *tuninfo)
188 {
189 struct ipv6hdr *oldhdr, *hdr;
190 int hdrlen, err;
191
192 hdrlen = (tuninfo->eh.hdrlen + 1) << 3;
193
194 err = skb_cow_head(skb, hdrlen + skb->mac_len);
195 if (unlikely(err))
196 return err;
197
198 oldhdr = ipv6_hdr(skb);
199 skb_pull(skb, sizeof(*oldhdr));
200 skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(*oldhdr));
201
202 skb_push(skb, sizeof(*oldhdr) + hdrlen);
203 skb_reset_network_header(skb);
204 skb_mac_header_rebuild(skb);
205
206 hdr = ipv6_hdr(skb);
207 memmove(hdr, oldhdr, sizeof(*oldhdr));
208 tuninfo->eh.nexthdr = hdr->nexthdr;
209
210 skb_set_transport_header(skb, sizeof(*hdr));
211 skb_postpush_rcsum(skb, hdr, sizeof(*hdr) + hdrlen);
212
213 memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);
214
215 hdr->nexthdr = NEXTHDR_HOP;
216 hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));
217
218 return ioam6_do_fill(net, skb);
219 }
220
ioam6_do_encap(struct net * net,struct sk_buff * skb,struct ioam6_lwt_encap * tuninfo,struct in6_addr * tundst)221 static int ioam6_do_encap(struct net *net, struct sk_buff *skb,
222 struct ioam6_lwt_encap *tuninfo,
223 struct in6_addr *tundst)
224 {
225 struct dst_entry *dst = skb_dst(skb);
226 struct ipv6hdr *hdr, *inner_hdr;
227 int hdrlen, len, err;
228
229 hdrlen = (tuninfo->eh.hdrlen + 1) << 3;
230 len = sizeof(*hdr) + hdrlen;
231
232 err = skb_cow_head(skb, len + skb->mac_len);
233 if (unlikely(err))
234 return err;
235
236 inner_hdr = ipv6_hdr(skb);
237
238 skb_push(skb, len);
239 skb_reset_network_header(skb);
240 skb_mac_header_rebuild(skb);
241 skb_set_transport_header(skb, sizeof(*hdr));
242
243 tuninfo->eh.nexthdr = NEXTHDR_IPV6;
244 memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen);
245
246 hdr = ipv6_hdr(skb);
247 memcpy(hdr, inner_hdr, sizeof(*hdr));
248
249 hdr->nexthdr = NEXTHDR_HOP;
250 hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));
251 hdr->daddr = *tundst;
252 ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr,
253 IPV6_PREFER_SRC_PUBLIC, &hdr->saddr);
254
255 skb_postpush_rcsum(skb, hdr, len);
256
257 return ioam6_do_fill(net, skb);
258 }
259
ioam6_output(struct net * net,struct sock * sk,struct sk_buff * skb)260 static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
261 {
262 struct dst_entry *dst = skb_dst(skb);
263 struct in6_addr orig_daddr;
264 struct ioam6_lwt *ilwt;
265 int err = -EINVAL;
266
267 if (skb->protocol != htons(ETH_P_IPV6))
268 goto drop;
269
270 ilwt = ioam6_lwt_state(dst->lwtstate);
271 orig_daddr = ipv6_hdr(skb)->daddr;
272
273 switch (ilwt->mode) {
274 case IOAM6_IPTUNNEL_MODE_INLINE:
275 do_inline:
276 /* Direct insertion - if there is no Hop-by-Hop yet */
277 if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP)
278 goto out;
279
280 err = ioam6_do_inline(net, skb, &ilwt->tuninfo);
281 if (unlikely(err))
282 goto drop;
283
284 break;
285 case IOAM6_IPTUNNEL_MODE_ENCAP:
286 do_encap:
287 /* Encapsulation (ip6ip6) */
288 err = ioam6_do_encap(net, skb, &ilwt->tuninfo, &ilwt->tundst);
289 if (unlikely(err))
290 goto drop;
291
292 break;
293 case IOAM6_IPTUNNEL_MODE_AUTO:
294 /* Automatic (RFC8200 compliant):
295 * - local packets -> INLINE mode
296 * - in-transit packets -> ENCAP mode
297 */
298 if (!skb->dev)
299 goto do_inline;
300
301 goto do_encap;
302 default:
303 goto drop;
304 }
305
306 err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
307 if (unlikely(err))
308 goto drop;
309
310 if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) {
311 preempt_disable();
312 dst = dst_cache_get(&ilwt->cache);
313 preempt_enable();
314
315 if (unlikely(!dst)) {
316 struct ipv6hdr *hdr = ipv6_hdr(skb);
317 struct flowi6 fl6;
318
319 memset(&fl6, 0, sizeof(fl6));
320 fl6.daddr = hdr->daddr;
321 fl6.saddr = hdr->saddr;
322 fl6.flowlabel = ip6_flowinfo(hdr);
323 fl6.flowi6_mark = skb->mark;
324 fl6.flowi6_proto = hdr->nexthdr;
325
326 dst = ip6_route_output(net, NULL, &fl6);
327 if (dst->error) {
328 err = dst->error;
329 dst_release(dst);
330 goto drop;
331 }
332
333 preempt_disable();
334 dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr);
335 preempt_enable();
336 }
337
338 skb_dst_drop(skb);
339 skb_dst_set(skb, dst);
340
341 return dst_output(net, sk, skb);
342 }
343 out:
344 return dst->lwtstate->orig_output(net, sk, skb);
345 drop:
346 kfree_skb(skb);
347 return err;
348 }
349
ioam6_destroy_state(struct lwtunnel_state * lwt)350 static void ioam6_destroy_state(struct lwtunnel_state *lwt)
351 {
352 dst_cache_destroy(&ioam6_lwt_state(lwt)->cache);
353 }
354
ioam6_fill_encap_info(struct sk_buff * skb,struct lwtunnel_state * lwtstate)355 static int ioam6_fill_encap_info(struct sk_buff *skb,
356 struct lwtunnel_state *lwtstate)
357 {
358 struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
359 int err;
360
361 err = nla_put_u8(skb, IOAM6_IPTUNNEL_MODE, ilwt->mode);
362 if (err)
363 goto ret;
364
365 if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) {
366 err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_DST, &ilwt->tundst);
367 if (err)
368 goto ret;
369 }
370
371 err = nla_put(skb, IOAM6_IPTUNNEL_TRACE, sizeof(ilwt->tuninfo.traceh),
372 &ilwt->tuninfo.traceh);
373 ret:
374 return err;
375 }
376
ioam6_encap_nlsize(struct lwtunnel_state * lwtstate)377 static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate)
378 {
379 struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
380 int nlsize;
381
382 nlsize = nla_total_size(sizeof(ilwt->mode)) +
383 nla_total_size(sizeof(ilwt->tuninfo.traceh));
384
385 if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE)
386 nlsize += nla_total_size(sizeof(ilwt->tundst));
387
388 return nlsize;
389 }
390
ioam6_encap_cmp(struct lwtunnel_state * a,struct lwtunnel_state * b)391 static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
392 {
393 struct ioam6_trace_hdr *trace_a = ioam6_lwt_trace(a);
394 struct ioam6_trace_hdr *trace_b = ioam6_lwt_trace(b);
395 struct ioam6_lwt *ilwt_a = ioam6_lwt_state(a);
396 struct ioam6_lwt *ilwt_b = ioam6_lwt_state(b);
397
398 return (ilwt_a->mode != ilwt_b->mode ||
399 (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE &&
400 !ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) ||
401 trace_a->namespace_id != trace_b->namespace_id);
402 }
403
404 static const struct lwtunnel_encap_ops ioam6_iptun_ops = {
405 .build_state = ioam6_build_state,
406 .destroy_state = ioam6_destroy_state,
407 .output = ioam6_output,
408 .fill_encap = ioam6_fill_encap_info,
409 .get_encap_size = ioam6_encap_nlsize,
410 .cmp_encap = ioam6_encap_cmp,
411 .owner = THIS_MODULE,
412 };
413
ioam6_iptunnel_init(void)414 int __init ioam6_iptunnel_init(void)
415 {
416 return lwtunnel_encap_add_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
417 }
418
ioam6_iptunnel_exit(void)419 void ioam6_iptunnel_exit(void)
420 {
421 lwtunnel_encap_del_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
422 }
423