piggyback/piggyback.bpf.c
2026-04-04 18:03:43 +02:00

286 lines
9.2 KiB
C

// SPDX-License-Identifier: GPL-2.0
//
// piggyback.bpf.c — TC ingress + sk_lookup eBPF programs
//
// TC ingress: inspects TCP packets on configured ports. When a packet completes
// the MAGIC byte sequence (possibly continuing a partial match left by an
// earlier segment), steals it (TC_ACT_STOLEN) and notifies userspace. Every
// other packet, including one that only extends a partial match, passes
// through (TC_ACT_OK).
//
// sk_lookup: steers new connections flagged by TC into the daemon's
// SO_REUSEPORT socket instead of the application socket.
//
// Supports IPv4 + IPv6. Handles magic split across TCP segments via
// per-connection LRU state map.
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
// AF_ constants — cannot include glibc headers in BPF programs
#define AF_INET 2
#define AF_INET6 10
// ── Configuration ─────────────────────────────────────────────────────────────
// TCP destination ports inspected by the TC program, in host byte order
// (port_watched() converts the packet's port with bpf_ntohs before comparing).
// PORTS_N is the number of entries in PORTS and must be kept in sync with it.
#define PORTS { 80, 8080 }
#define PORTS_N 2
// Byte sequence the TC program searches for at the start of the TCP payload.
// MAGIC_LEN is its length in bytes, not counting the string literal's
// terminating NUL; the two must be kept in sync.
#define MAGIC "\xDE\xAD\xC0\xDE\xCA\xFE"
#define MAGIC_LEN 6
// Signed header: 16 bytes fields + 64 bytes Ed25519 sig = 80 total
// eBPF only checks structural validity; full Ed25519 verify is in userspace.
// HEADER_LEN also sizes the `header` array in struct event.
#define HEADER_LEN 80
// ── Shared types ──────────────────────────────────────────────────────────────
// Key type of the conn_state and pending maps: identifies a connection by its
// remote (client) endpoint only. Both programs zero-initialize the key before
// filling it, so unused address bytes and `pad` are always zero and keys built
// by the TC program compare equal to keys built by the sk_lookup program.
struct conn_key {
__u8 src_ip[16]; // IPv4: first 4 bytes; IPv6: all 16
// Remote TCP port exactly as read from the TCP header / sk_lookup context
// (network byte order; no conversion is applied on either side).
__u16 src_port;
// 1 when src_ip holds an IPv6 address, 0 for IPv4.
__u8 is_ipv6;
// Explicit padding; never written after zero-initialization.
__u8 pad;
};
// Record published through the `events` ring buffer each time the TC program
// sees a complete magic match. Multi-byte fields are copied verbatim from the
// packet, i.e. they are in network byte order.
struct event {
// Client address; IPv4 occupies the first 4 bytes, the rest is zero.
__u8 src_ip[16];
// Client TCP source port.
__u16 src_port;
// 1 for IPv6, 0 for IPv4.
__u8 is_ipv6;
// Byte 13 of the TCP header (the TCP flag bits).
__u8 flags;
// TCP sequence and acknowledgement numbers of the matching segment.
__u32 seq;
__u32 ack_seq;
// Signed-header bytes captured from the packet after the magic.
// Contents are only meaningful when header_valid is 1.
__u8 header[HEADER_LEN];
// 1 if `header` was filled from the packet, 0 otherwise.
__u8 header_valid;
};
// ── Maps ──────────────────────────────────────────────────────────────────────
// Ring buffer that carries `struct event` records from the TC program to
// userspace. For a ring buffer map, max_entries is the buffer size in bytes
// (1 << 24 = 16 MiB).
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 1 << 24);
} events SEC(".maps");
// Per-connection magic match state: number of MAGIC bytes matched so far.
// The TC program only stores values in 1..MAGIC_LEN-1; an absent entry means
// "nothing matched yet". LRU eviction bounds memory if connections never
// complete or reset their match.
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__uint(max_entries, 4096);
__type(key, struct conn_key);
__type(value, __u8);
} conn_state SEC(".maps");
// Connections awaiting sk_lookup steering. The TC program inserts the client
// endpoint (value 1) when it sees a complete magic match; the sk_lookup
// program removes the entry after it has successfully assigned the daemon
// socket. Only presence of the key matters — the value is never inspected.
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__uint(max_entries, 4096);
__type(key, struct conn_key);
__type(value, __u8);
} pending SEC(".maps");
// Daemon's SO_REUSEPORT socket — populated by userspace after bind.
// Single-slot map: the sk_lookup program always looks up key 0 and passes the
// lookup through unchanged when the slot is empty.
struct {
__uint(type, BPF_MAP_TYPE_SOCKMAP);
__uint(max_entries, 1);
__type(key, __u32);
__type(value, __u32);
} daemon_sock SEC(".maps");
// Packet counters, one __u64 per STAT_* index below; incremented atomically
// by inc_stat(). max_entries must cover the highest STAT_* index.
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 4);
__type(key, __u32);
__type(value, __u64);
} stats SEC(".maps");
// Every packet handed to the TC program (counted before any parsing).
#define STAT_TOTAL 0
// Packets that completed the magic match and were stolen.
#define STAT_MAGIC 1
// Watched-port packets that were examined and then passed on to the stack.
#define STAT_PASSED 2
// Packets that left a partial magic match stored in conn_state.
#define STAT_PARTIAL 3
// Atomically adds one to the counter stored at slot `idx` of the stats map.
// An index with no backing slot is ignored.
static __always_inline void inc_stat(__u32 idx) {
    __u64 *counter = bpf_map_lookup_elem(&stats, &idx);

    if (!counter) {
        return;
    }
    __sync_fetch_and_add(counter, 1);
}
// ── Helpers ───────────────────────────────────────────────────────────────────
// Reports whether a TCP port is one of the configured PORTS.
//
// port_be: port in network byte order, as read from the packet.
// Returns 1 when the port is watched, 0 otherwise.
//
// All PORTS_N entries are checked (previously anything past the fourth entry
// was silently ignored). The loop is additionally bounded by the real array
// length so a PORTS_N that is larger than PORTS can never read past the array.
static __always_inline int port_watched(__u16 port_be) {
    const __u16 ports[] = PORTS;
    const __u16 port = bpf_ntohs(port_be);

    // Constant trip count: the compiler can fully unroll this and fold the
    // table into immediate comparisons.
#pragma unroll
    for (unsigned int i = 0;
         i < PORTS_N && i < sizeof(ports) / sizeof(ports[0]); i++) {
        if (port == ports[i]) {
            return 1;
        }
    }
    return 0;
}
// Returns byte `idx` of MAGIC, or 0 when `idx` lies at or beyond MAGIC_LEN.
static __always_inline __u8 magic_at(__u8 idx) {
    const __u8 pattern[] = MAGIC;

    return (idx >= MAGIC_LEN) ? 0 : pattern[idx];
}
// ── TC ingress ────────────────────────────────────────────────────────────────
SEC("tc")
int piggyback_ingress(struct __sk_buff *skb) {
void *data = (void *)(long)skb->data;
void *data_end = (void *)(long)skb->data_end;
inc_stat(STAT_TOTAL);
struct ethhdr *eth = data;
if ((void *)(eth + 1) > data_end) return TC_ACT_OK;
__u8 is_ipv6 = 0;
__u8 src_ip[16] = {};
void *transport;
if (eth->h_proto == bpf_htons(ETH_P_IP)) {
struct iphdr *ip = (void *)(eth + 1);
if ((void *)(ip + 1) > data_end) return TC_ACT_OK;
if (ip->protocol != IPPROTO_TCP) return TC_ACT_OK;
__builtin_memcpy(src_ip, &ip->saddr, 4);
transport = (void *)ip + ip->ihl * 4;
} else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) {
struct ipv6hdr *ip6 = (void *)(eth + 1);
if ((void *)(ip6 + 1) > data_end) return TC_ACT_OK;
if (ip6->nexthdr != IPPROTO_TCP) return TC_ACT_OK;
__builtin_memcpy(src_ip, &ip6->saddr, 16);
is_ipv6 = 1;
transport = (void *)(ip6 + 1);
} else {
return TC_ACT_OK;
}
struct tcphdr *tcp = transport;
if ((void *)(tcp + 1) > data_end) return TC_ACT_OK;
if (!port_watched(tcp->dest)) return TC_ACT_OK;
__u32 tcp_hlen = tcp->doff * 4;
__u8 *payload = (__u8 *)tcp + tcp_hlen;
if ((void *)payload > data_end) return TC_ACT_OK;
__u32 payload_len = (__u8 *)data_end - payload;
struct conn_key ck = {};
__builtin_memcpy(ck.src_ip, src_ip, 16);
ck.src_port = tcp->source;
ck.is_ipv6 = is_ipv6;
// ── Multi-packet state machine ────────────────────────────────────────────
__u8 *state_p = bpf_map_lookup_elem(&conn_state, &ck);
__u8 matched = state_p ? *state_p : 0;
// Unrolled byte walk — verifier requires bounded, known iterations
#define TRY_BYTE(i) \
if ((void *)(payload + (i) + 1) <= data_end) { \
__u8 b = payload[(i)]; \
if (b == magic_at(matched)) { \
matched++; \
} else { \
matched = (b == magic_at(0)) ? 1 : 0; \
} \
}
TRY_BYTE(0)
TRY_BYTE(1)
TRY_BYTE(2)
TRY_BYTE(3)
TRY_BYTE(4)
TRY_BYTE(5)
#undef TRY_BYTE
if (matched < MAGIC_LEN) {
if (matched > 0) {
bpf_map_update_elem(&conn_state, &ck, &matched, BPF_ANY);
inc_stat(STAT_PARTIAL);
} else {
bpf_map_delete_elem(&conn_state, &ck);
}
goto pass;
}
// ── Magic matched ─────────────────────────────────────────────────────────
bpf_map_delete_elem(&conn_state, &ck);
{
struct event *e = bpf_ringbuf_reserve(&events, sizeof(*e), 0);
if (!e) return TC_ACT_OK; // ring buffer full — pass, don't drop silently
__builtin_memcpy(e->src_ip, src_ip, 16);
e->src_port = tcp->source;
e->is_ipv6 = is_ipv6;
e->flags = ((__u8 *)tcp)[13];
e->seq = tcp->seq;
e->ack_seq = tcp->ack_seq;
e->header_valid = 0;
// Capture signed header bytes after magic for userspace Ed25519 verify
if (payload_len >= MAGIC_LEN + HEADER_LEN) {
__u32 hdr_offset = (__u8 *)payload - (__u8 *)data + MAGIC_LEN;
if (bpf_skb_load_bytes(skb, hdr_offset, e->header, HEADER_LEN) == 0)
e->header_valid = 1;
}
// Mark connection pending for sk_lookup steering
__u8 one = 1;
bpf_map_update_elem(&pending, &ck, &one, BPF_ANY);
bpf_ringbuf_submit(e, 0);
inc_stat(STAT_MAGIC);
return TC_ACT_STOLEN;
}
pass:
inc_stat(STAT_PASSED);
return TC_ACT_OK;
}
// ── sk_lookup ─────────────────────────────────────────────────────────────────
//
// Runs when kernel looks up a socket for an incoming connection.
// If the connection is in pending map (flagged by TC), redirect to daemon socket.
// Socket-lookup hook: steers a flagged client to the daemon's socket.
//
// Only TCP lookups whose local port is one of PORTS are considered, because
// those are the only connections the TC program can flag. Without this filter
// a pending client endpoint would also be redirected when it connects to an
// unrelated service on the same host.
//
// Always returns SK_PASS: either the daemon socket has been assigned, or the
// regular socket lookup proceeds unchanged.
SEC("sk_lookup")
int piggyback_lookup(struct bpf_sk_lookup *ctx) {
    if (ctx->protocol != IPPROTO_TCP) return SK_PASS;
    // local_port is in host byte order; port_watched() takes network order.
    if (!port_watched(bpf_htons((__u16)ctx->local_port))) return SK_PASS;

    struct conn_key ck = {};

    if (ctx->family == AF_INET) {
        __builtin_memcpy(ck.src_ip, &ctx->remote_ip4, 4);
        ck.is_ipv6 = 0;
    } else if (ctx->family == AF_INET6) {
        __builtin_memcpy(ck.src_ip, ctx->remote_ip6, 16);
        ck.is_ipv6 = 1;
    } else {
        return SK_PASS;
    }
    // remote_port in bpf_sk_lookup is __be16 (network byte order), matching
    // the tcp->source value the TC program stored in the key.
    ck.src_port = ctx->remote_port;

    __u8 *p = bpf_map_lookup_elem(&pending, &ck);
    if (!p) return SK_PASS;

    __u32 key = 0;
    struct bpf_sock *sk = bpf_map_lookup_elem(&daemon_sock, &key);
    if (!sk) return SK_PASS;   // daemon not registered yet

    long ret = bpf_sk_assign(ctx, sk, 0);
    // The sockmap lookup took a reference that must be dropped on every path.
    bpf_sk_release(sk);

    // Steering is one-shot: clear the flag only once the assignment stuck, so
    // a failed attempt can be retried on a later lookup.
    if (ret == 0)
        bpf_map_delete_elem(&pending, &ck);

    // SK_PASS after bpf_sk_assign means "use the assigned socket"
    return SK_PASS;
}
// License string emitted into the object's "license" section.
char LICENSE[] SEC("license") = "GPL";