// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2016-2020 Authors of Cilium */

#include <bpf/ctx/skb.h>
#include <bpf/api.h>

#include <node_config.h>
#include <netdev_config.h>

#define IS_BPF_OVERLAY 1

/* Controls the inclusion of the CILIUM_CALL_HANDLE_ICMP6_NS section in the
 * bpf_lxc object file.
 */
#define SKIP_ICMPV6_NS_HANDLING

/* Controls the inclusion of the CILIUM_CALL_SEND_ICMP6_ECHO_REPLY section in
 * the bpf_lxc object file.
 */
#define SKIP_ICMPV6_ECHO_HANDLING

#include "lib/tailcall.h"
#include "lib/common.h"
#include "lib/edt.h"
#include "lib/maps.h"
#include "lib/ipv6.h"
#include "lib/eth.h"
#include "lib/dbg.h"
#include "lib/trace.h"
#include "lib/l3.h"
#include "lib/drop.h"
#include "lib/identity.h"
#include "lib/nodeport.h"
#ifdef ENABLE_IPV6
static __always_inline int handle_ipv6(struct __ctx_buff *ctx,
				       __u32 *identity)
{
	int ret, l3_off = ETH_HLEN, hdrlen;
	void *data_end, *data;
	struct ipv6hdr *ip6;
	struct bpf_tunnel_key key = {};
	struct endpoint_info *ep;
	bool decrypted;

	/* verifier workaround (dereference of modified ctx ptr) */
	if (!revalidate_data_pull(ctx, &data, &data_end, &ip6))
		return DROP_INVALID;
#ifdef ENABLE_NODEPORT
	if (!bpf_skip_nodeport(ctx)) {
		ret = nodeport_lb6(ctx, *identity);
		if (ret < 0)
			return ret;
	}
#endif
	ret = encap_remap_v6_host_address(ctx, false);
	if (unlikely(ret < 0))
		return ret;

	if (!revalidate_data(ctx, &data, &data_end, &ip6))
		return DROP_INVALID;
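	/* If IPsec did not already decrypt this packet, the source security
	 * identity travels in the tunnel key (the VXLAN/Geneve VNI); for
	 * decrypted packets it was instead stored in the skb mark by the
	 * decryption path.
	 */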
	decrypted = ((ctx->mark & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_DECRYPT);
	if (decrypted) {
		*identity = key.tunnel_id = get_identity(ctx);
	} else {
		if (unlikely(ctx_get_tunnel_key(ctx, &key, sizeof(key), 0) < 0))
			return DROP_NO_TUNNEL_KEY;
		*identity = key.tunnel_id;

		/* Any node encapsulating will map any HOST_ID source to be
		 * presented as REMOTE_NODE_ID, therefore any attempt to signal
		 * HOST_ID as source from a remote node can be dropped.
		 */
		if (*identity == HOST_ID)
			return DROP_INVALID_IDENTITY;
	}

	cilium_dbg(ctx, DBG_DECAP, key.tunnel_id, key.tunnel_label);

#ifdef ENABLE_IPSEC
	if (!decrypted) {
		/* IPsec is not currently enforced (feature coming soon),
		 * so for now just handle normally.
		 */
		if (ip6->nexthdr != IPPROTO_ESP) {
			update_metrics(ctx_full_len(ctx), METRIC_INGRESS,
				       REASON_PLAINTEXT);
			goto not_esp;
		}

		/* The decryption key is determined by the ESP SPI. */
		ctx->mark = MARK_MAGIC_DECRYPT;
		set_identity_mark(ctx, *identity);

		/* We are going to pass this up the stack to the IPsec code on
		 * cilium_vxlan, but eth_type_trans has already labeled it as
		 * an OTHERHOST type packet. To avoid it being dropped by the
		 * IP stack before IPsec can process it, mark it as a HOST
		 * packet.
		 */
		ctx_change_type(ctx, PACKET_HOST);
		return CTX_ACT_OK;
	}
	ctx->mark = 0;
not_esp:
#endif

	/* Lookup IPv6 address in list of local endpoints */
	ep = lookup_ip6_endpoint(ip6);
	if (ep) {
		__u8 nexthdr;

		/* Let through packets to the node-ip so they are processed by
		 * the local ip stack.
		 */
		if (ep->flags & ENDPOINT_F_HOST)
			goto to_host;

		nexthdr = ip6->nexthdr;
		hdrlen = ipv6_hdrlen(ctx, l3_off, &nexthdr);
		if (hdrlen < 0)
			return hdrlen;

		return ipv6_local_delivery(ctx, l3_off, *identity, ep,
					   METRIC_INGRESS, false);
	}

	/* A packet entering the node from the tunnel and not going to a local
	 * endpoint has to be going to the local host.
	 */
to_host:
#ifdef HOST_IFINDEX
	if (1) {
		union macaddr host_mac = HOST_IFINDEX_MAC;
		union macaddr router_mac = NODE_MAC;

		ret = ipv6_l3(ctx, ETH_HLEN, (__u8 *)&router_mac.addr,
			      (__u8 *)&host_mac.addr, METRIC_INGRESS);
		if (ret != CTX_ACT_OK)
			return ret;

		cilium_dbg_capture(ctx, DBG_CAPTURE_DELIVERY, HOST_IFINDEX);
		return redirect(HOST_IFINDEX, 0);
	}
#else
	return CTX_ACT_OK;
#endif
}
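/* Entered via tail call: from_overlay() below dispatches here through
 * ep_tail_call(ctx, CILIUM_CALL_IPV6_FROM_LXC) once the EtherType is known.
 */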
__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV6_FROM_LXC)
int tail_handle_ipv6(struct __ctx_buff *ctx)
{
	__u32 src_identity = 0;
	int ret = handle_ipv6(ctx, &src_identity);

	if (IS_ERR(ret))
		return send_drop_notify_error(ctx, src_identity, ret,
					      CTX_ACT_DROP, METRIC_INGRESS);
	return ret;
}
#endif /* ENABLE_IPV6 */
#ifdef ENABLE_IPV4
static __always_inline int handle_ipv4(struct __ctx_buff *ctx, __u32 *identity)
{
	void *data_end, *data;
	struct iphdr *ip4;
	struct endpoint_info *ep;
	struct bpf_tunnel_key key = {};
	bool decrypted;

	/* verifier workaround (dereference of modified ctx ptr) */
	if (!revalidate_data_pull(ctx, &data, &data_end, &ip4))
		return DROP_INVALID;

	/* If IPv4 fragmentation is disabled AND an IPv4 fragmented packet is
	 * received, then drop the packet.
	 */
#ifndef ENABLE_IPV4_FRAGMENTS
	if (ipv4_is_fragment(ip4))
		return DROP_FRAG_NOSUPPORT;
#endif

#ifdef ENABLE_NODEPORT
	if (!bpf_skip_nodeport(ctx)) {
		int ret = nodeport_lb4(ctx, *identity);

		if (ret < 0)
			return ret;
	}
#endif
	if (!revalidate_data(ctx, &data, &data_end, &ip4))
		return DROP_INVALID;

	decrypted = ((ctx->mark & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_DECRYPT);
	/* If packets are decrypted the key has already been pushed into metadata. */
	if (decrypted) {
		*identity = key.tunnel_id = get_identity(ctx);
	} else {
		if (unlikely(ctx_get_tunnel_key(ctx, &key, sizeof(key), 0) < 0))
			return DROP_NO_TUNNEL_KEY;
		*identity = key.tunnel_id;
		if (*identity == HOST_ID)
			return DROP_INVALID_IDENTITY;
	}

	cilium_dbg(ctx, DBG_DECAP, key.tunnel_id, key.tunnel_label);

#ifdef ENABLE_IPSEC
	if (!decrypted) {
		/* IPsec is not currently enforced (feature coming soon),
		 * so for now just handle normally.
		 */
		if (ip4->protocol != IPPROTO_ESP) {
			update_metrics(ctx_full_len(ctx), METRIC_INGRESS,
				       REASON_PLAINTEXT);
			goto not_esp;
		}

		ctx->mark = MARK_MAGIC_DECRYPT;
		set_identity_mark(ctx, *identity);

		/* We are going to pass this up the stack to the IPsec code on
		 * cilium_vxlan, but eth_type_trans has already labeled it as
		 * an OTHERHOST type packet. To avoid it being dropped by the
		 * IP stack before IPsec can process it, mark it as a HOST
		 * packet.
		 */
		ctx_change_type(ctx, PACKET_HOST);
		return CTX_ACT_OK;
	}
	ctx->mark = 0;
not_esp:
#endif

	/* Lookup IPv4 address in list of local endpoints */
	ep = lookup_ip4_endpoint(ip4);
	if (ep) {
		/* Let through packets to the node-ip so they are processed by
		 * the local ip stack.
		 */
		if (ep->flags & ENDPOINT_F_HOST)
			goto to_host;

		return ipv4_local_delivery(ctx, ETH_HLEN, *identity, ip4, ep,
					   METRIC_INGRESS, false);
	}

	/* A packet entering the node from the tunnel and not going to a local
	 * endpoint has to be going to the local host.
	 */
to_host:
#ifdef HOST_IFINDEX
	if (1) {
		union macaddr host_mac = HOST_IFINDEX_MAC;
		union macaddr router_mac = NODE_MAC;
		int ret;

		ret = ipv4_l3(ctx, ETH_HLEN, (__u8 *)&router_mac.addr,
			      (__u8 *)&host_mac.addr, ip4);
		if (ret != CTX_ACT_OK)
			return ret;

		cilium_dbg_capture(ctx, DBG_CAPTURE_DELIVERY, HOST_IFINDEX);
		return redirect(HOST_IFINDEX, 0);
	}
#else
	return CTX_ACT_OK;
#endif
}
__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_FROM_LXC)
int tail_handle_ipv4(struct __ctx_buff *ctx)
{
	__u32 src_identity = 0;
	int ret = handle_ipv4(ctx, &src_identity);

	if (IS_ERR(ret))
		return send_drop_notify_error(ctx, src_identity, ret,
					      CTX_ACT_DROP, METRIC_INGRESS);
	return ret;
}
#endif /* ENABLE_IPV4 */
/* Attached to the ingress of cilium_vxlan/cilium_geneve to execute on packets
* entering the node via the tunnel.
*/
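/* Illustrative only: in practice the Cilium agent loads and attaches this
 * program itself, but a manual attachment would look roughly like:
 *
 *   tc qdisc add dev cilium_vxlan clsact
 *   tc filter replace dev cilium_vxlan ingress bpf da obj bpf_overlay.o \
 *      sec from-overlay
 */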
__section("from-overlay")
int from_overlay(struct __ctx_buff *ctx)
{
__u16 proto;
int ret;
bpf_clear_meta(ctx);
bpf_skip_nodeport_clear(ctx);
if (!validate_ethertype(ctx, &proto)) {
/* Pass unknown traffic to the stack */
ret = CTX_ACT_OK;
goto out;
}
#ifdef ENABLE_IPSEC
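	/* A MARK_MAGIC_DECRYPT skb mark means the packet already made a pass
	 * through the IPsec (XFRM) stack and carries its source security
	 * identity in the mark, so trace it as encrypted traffic.
	 */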
	if ((ctx->mark & MARK_MAGIC_HOST_MASK) == MARK_MAGIC_DECRYPT) {
		send_trace_notify(ctx, TRACE_FROM_OVERLAY, get_identity(ctx), 0, 0,
				  ctx->ingress_ifindex,
				  TRACE_REASON_ENCRYPTED, TRACE_PAYLOAD_LEN);
	} else
#endif
	{
		send_trace_notify(ctx, TRACE_FROM_OVERLAY, 0, 0, 0,
				  ctx->ingress_ifindex, 0, TRACE_PAYLOAD_LEN);
	}

	switch (proto) {
	case bpf_htons(ETH_P_IPV6):
#ifdef ENABLE_IPV6
		ep_tail_call(ctx, CILIUM_CALL_IPV6_FROM_LXC);
		ret = DROP_MISSED_TAIL_CALL;
#else
		ret = DROP_UNKNOWN_L3;
#endif
		break;

	case bpf_htons(ETH_P_IP):
#ifdef ENABLE_IPV4
		ep_tail_call(ctx, CILIUM_CALL_IPV4_FROM_LXC);
		ret = DROP_MISSED_TAIL_CALL;
#else
		ret = DROP_UNKNOWN_L3;
#endif
		break;

	default:
		/* Pass unknown traffic to the stack */
		ret = CTX_ACT_OK;
	}

out:
	if (IS_ERR(ret))
		return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_INGRESS);
	return ret;
}
/* Attached to the egress of cilium_vxlan/cilium_geneve to execute on packets
* leaving the node via the tunnel.
*/
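/* Illustrative counterpart to the ingress hook above; a manual attachment
 * would look roughly like:
 *
 *   tc filter replace dev cilium_vxlan egress bpf da obj bpf_overlay.o \
 *      sec to-overlay
 */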
__section("to-overlay")
int to_overlay(struct __ctx_buff *ctx)
{
int ret;
ret = encap_remap_v6_host_address(ctx, true);
if (unlikely(ret < 0))
goto out;
#ifdef ENABLE_BANDWIDTH_MANAGER
/* In tunneling mode, we should do this as close as possible to the
* phys dev where FQ runs, but the issue is that the aggregate state
* (in queue_mapping) is overridden on tunnel xmit. Hence set the
* timestamp already here. The tunnel dev has noqueue qdisc, so as
* tradeoff it's close enough.
*/
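	/* edt_sched_departure() is expected to stamp the packet with its
	 * earliest departure time so the FQ qdisc on the physical device can
	 * pace it, and to return CTX_ACT_DROP once the packet would land
	 * beyond the configured horizon (hence DROP_EDT_HORIZON below).
	 */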
	ret = edt_sched_departure(ctx);
	/* No send_drop_notify_error() here given we're rate-limiting. */
	if (ret == CTX_ACT_DROP) {
		update_metrics(ctx_full_len(ctx), METRIC_EGRESS,
			       -DROP_EDT_HORIZON);
		return CTX_ACT_DROP;
	}
#endif

#ifdef ENABLE_NODEPORT
	if ((ctx->mark & MARK_MAGIC_SNAT_DONE) == MARK_MAGIC_SNAT_DONE) {
		ret = CTX_ACT_OK;
		goto out;
	}
	ret = handle_nat_fwd(ctx);
#endif
out:
	if (IS_ERR(ret))
		return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_EGRESS);
	return ret;
}
BPF_LICENSE("GPL");