/* samples/bpf/xdp_redirect_cpu_kern.c - linux-mtk - Git at Google */

 /*  XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
  *
  *  GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
  */
 #include <uapi/linux/if_ether.h>
 #include <uapi/linux/if_packet.h>
 #include <uapi/linux/if_vlan.h>
 #include <uapi/linux/ip.h>
 #include <uapi/linux/ipv6.h>
 #include <uapi/linux/in.h>
 #include <uapi/linux/tcp.h>
 #include <uapi/linux/udp.h>

 #include <uapi/linux/bpf.h>
 #include "bpf_helpers.h"
 #include "hash_func01.h"

 #define MAX_CPUS 64 /* WARNING - sync with _user.c */

 /* Special map type that can XDP_REDIRECT frames to another CPU.
  * Every XDP program in this file hands a CPU number to
  * bpf_redirect_map() with this map as the target; the programs reject
  * CPU numbers >= MAX_CPUS before redirecting.
  *
  * NOTE(review): the userspace loader presumably refers to the maps in
  * this file by their definition order - confirm before reordering.
  */
 struct bpf_map_def SEC("maps") cpu_map = {
 	.type		= BPF_MAP_TYPE_CPUMAP,
 	.key_size	= sizeof(u32),
 	.value_size	= sizeof(u32),
 	.max_entries	= MAX_CPUS,
 };

 /* Common stats data record to keep userspace more simple.
  * The meaning of each counter depends on the map the record lives in;
  * e.g. `issue` counts out-of-range CPU numbers in rx_cnt, bulk enqueue
  * events in cpumap_enqueue_cnt and scheduler yields in
  * cpumap_kthread_cnt.
  */
 struct datarec {
 	__u64 processed;
 	__u64 dropped;
 	__u64 issue;
 };

 /* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
  * feedback.  Redirect TX errors can be caught via a tracepoint.
  * Single entry (key 0), one copy per CPU.
  */
 struct bpf_map_def SEC("maps") rx_cnt = {
 	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
 	.key_size	= sizeof(u32),
 	.value_size	= sizeof(struct datarec),
 	.max_entries	= 1,
 };

 /* Used by trace point: xdp_redirect_collect_stat() bumps `dropped` in
  * entry XDP_REDIRECT_SUCCESS (0) or XDP_REDIRECT_ERROR (1).
  */
 struct bpf_map_def SEC("maps") redirect_err_cnt = {
 	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
 	.key_size	= sizeof(u32),
 	.value_size	= sizeof(struct datarec),
 	.max_entries	= 2,
 	/* TODO: have entries for all possible errno's */
 };

 /* Used by trace point: trace_xdp_cpumap_enqueue() keys this map by the
  * destination CPU (`to_cpu`) of the enqueue event.
  */
 struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
 	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
 	.key_size	= sizeof(u32),
 	.value_size	= sizeof(struct datarec),
 	.max_entries	= MAX_CPUS,
 };

 /* Used by trace point: trace_xdp_cpumap_kthread() accumulates into the
  * single entry (key 0).
  */
 struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
 	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
 	.key_size	= sizeof(u32),
 	.value_size	= sizeof(struct datarec),
 	.max_entries	= 1,
 };

 /* Set of maps controlling available CPU, and for iterating through
  * selectable redirect CPUs.
  *
  * cpus_available: selection index -> CPU number handed to cpu_map.
  * cpus_count:     number of usable entries in cpus_available (wrap
  *                 point for round-robin, modulus for hashing).
  * cpus_iterator:  per-CPU round-robin cursor.
  *
  * NOTE(review): none of the programs in this file write cpus_available
  * or cpus_count; presumably userspace fills them in - confirm.
  */
 struct bpf_map_def SEC("maps") cpus_available = {
 	.type		= BPF_MAP_TYPE_ARRAY,
 	.key_size	= sizeof(u32),
 	.value_size	= sizeof(u32),
 	.max_entries	= MAX_CPUS,
 };
 struct bpf_map_def SEC("maps") cpus_count = {
 	.type		= BPF_MAP_TYPE_ARRAY,
 	.key_size	= sizeof(u32),
 	.value_size	= sizeof(u32),
 	.max_entries	= 1,
 };
 struct bpf_map_def SEC("maps") cpus_iterator = {
 	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
 	.key_size	= sizeof(u32),
 	.value_size	= sizeof(u32),
 	.max_entries	= 1,
 };

 /* Used by trace point: trace_xdp_exception() bumps `dropped` in the
  * single entry (key 0).
  */
 struct bpf_map_def SEC("maps") exception_cnt = {
 	.type		= BPF_MAP_TYPE_PERCPU_ARRAY,
 	.key_size	= sizeof(u32),
 	.value_size	= sizeof(struct datarec),
 	.max_entries	= 1,
 };

 /* Helper parse functions */

 /* Parse Ethernet layer 2, extract network layer 3 offset and protocol
  *
  * Returns false on error and non-supported ether-type
  */
 /* VLAN tag that parse_eth() expects directly after the Ethernet header
  * (or after a previous tag) when the ethertype is 802.1Q or 802.1ad.
  */
 struct vlan_hdr {
 	__be16 h_vlan_TCI;	/* not read anywhere in this file */
 	__be16 h_vlan_encapsulated_proto; /* ethertype following this tag */
 };

 /* parse_eth - walk the L2 header(s) of a frame and locate the L3 header.
  * @eth:       start of the Ethernet header
  * @data_end:  first byte past the packet data
  * @eth_proto: out, ethertype of the payload in host byte order
  * @l3_offset: out, byte offset from @eth to the L3 header
  *
  * At most two stacked VLAN tags (802.1Q or 802.1ad) are skipped.  The
  * out parameters are written only when true is returned.
  *
  * Returns false when a header would extend past @data_end or when the
  * outer ethertype is below ETH_P_802_3_MIN.
  */
 static __always_inline
 bool parse_eth(struct ethhdr *eth, void *data_end,
 	       u16 *eth_proto, u64 *l3_offset)
 {
 	u64 hdr_len = sizeof(*eth);
 	u16 proto_be;	/* ethertype, still in network byte order */
 	int depth;

 	if ((void *)eth + hdr_len > data_end)
 		return false;

 	proto_be = eth->h_proto;

 	/* Skip non 802.3 Ethertypes */
 	if (unlikely(ntohs(proto_be) < ETH_P_802_3_MIN))
 		return false;

 	/* Peel off a single or double VLAN tag; fixed trip count so the
 	 * compiler can unroll the loop completely.
 	 */
 #pragma unroll
 	for (depth = 0; depth < 2; depth++) {
 		struct vlan_hdr *tag;

 		if (proto_be != htons(ETH_P_8021Q) &&
 		    proto_be != htons(ETH_P_8021AD))
 			break;

 		tag = (void *)eth + hdr_len;
 		hdr_len += sizeof(*tag);
 		if ((void *)eth + hdr_len > data_end)
 			return false;
 		proto_be = tag->h_vlan_encapsulated_proto;
 	}

 	*eth_proto = ntohs(proto_be);
 	*l3_offset = hdr_len;
 	return true;
 }

 /* get_dest_port_ipv4_udp - UDP destination port of an IPv4 packet.
  * @ctx:    XDP context providing the packet bounds
  * @nh_off: byte offset of the IPv4 header from the start of packet data
  *
  * The UDP header is located via the IPv4 header-length field (ihl), so
  * packets carrying IP options are parsed correctly.
  *
  * Returns the destination port in host byte order, or 0 when the packet
  * is truncated, is not UDP, or advertises an ihl below the 20-byte
  * minimum.
  */
 static __always_inline
 u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
 {
 	void *data_end = (void *)(long)ctx->data_end;
 	void *data     = (void *)(long)ctx->data;
 	struct iphdr *iph = data + nh_off;
 	struct udphdr *udph;
 	u32 ip_hdr_len;

 	if (iph + 1 > data_end)
 		return 0;
 	if (iph->protocol != IPPROTO_UDP)
 		return 0;

 	/* ihl counts 32-bit words; less than 5 words is a malformed header */
 	ip_hdr_len = iph->ihl * 4;
 	if (ip_hdr_len < sizeof(*iph))
 		return 0;

 	udph = (void *)iph + ip_hdr_len;
 	if (udph + 1 > data_end)
 		return 0;

 	return ntohs(udph->dest);
 }

 /* get_proto_ipv4 - read the L4 protocol number of an IPv4 packet.
  * @ctx:    XDP context providing the packet bounds
  * @nh_off: byte offset of the IPv4 header from the start of packet data
  *
  * Returns iphdr->protocol, or 0 when the fixed-size IPv4 header does
  * not fit inside the packet.
  */
 static __always_inline
 int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
 {
 	void *pkt_end   = (void *)(long)ctx->data_end;
 	void *pkt_start = (void *)(long)ctx->data;
 	struct iphdr *ip4 = pkt_start + nh_off;

 	/* Only touch the header once it is known to be inside the packet */
 	if (ip4 + 1 <= pkt_end)
 		return ip4->protocol;

 	return 0;
 }

 /* get_proto_ipv6 - read the next-header value of an IPv6 packet.
  * @ctx:    XDP context providing the packet bounds
  * @nh_off: byte offset of the IPv6 header from the start of packet data
  *
  * Returns ipv6hdr->nexthdr as-is (extension headers are not followed),
  * or 0 when the fixed IPv6 header does not fit inside the packet.
  */
 static __always_inline
 int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
 {
 	void *pkt_end   = (void *)(long)ctx->data_end;
 	void *pkt_start = (void *)(long)ctx->data;
 	struct ipv6hdr *ip6 = pkt_start + nh_off;

 	/* Only touch the header once it is known to be inside the packet */
 	if (ip6 + 1 <= pkt_end)
 		return ip6->nexthdr;

 	return 0;
 }

 /* XDP prog 0: redirect every frame to the CPU stored in entry 0 of
  * cpus_available, without reading any packet data.
  *
  * Returns the bpf_redirect_map() result, or XDP_ABORTED when a map
  * lookup fails or the configured CPU number is >= MAX_CPUS (the latter
  * is also counted in rx_cnt.issue).
  */
 SEC("xdp_cpu_map0")
 int  xdp_prognum0_no_touch(struct xdp_md *ctx)
 {
 	struct datarec *stats;
 	u32 *first_cpu;
 	u32 target;
 	u32 idx0 = 0;

 	/* Only use first entry in cpus_available */
 	first_cpu = bpf_map_lookup_elem(&cpus_available, &idx0);
 	if (!first_cpu)
 		return XDP_ABORTED;
 	target = *first_cpu;

 	/* Count RX packet in map */
 	stats = bpf_map_lookup_elem(&rx_cnt, &idx0);
 	if (!stats)
 		return XDP_ABORTED;
 	stats->processed++;

 	if (target < MAX_CPUS)
 		return bpf_redirect_map(&cpu_map, target, 0);

 	stats->issue++;
 	return XDP_ABORTED;
 }

 /* XDP prog 1: like prog 0, but additionally reads the Ethernet header
  * and drops frames whose ethertype is below ETH_P_802_3_MIN.
  *
  * Returns the bpf_redirect_map() result, XDP_DROP for the dropped
  * ethertypes (counted in rx_cnt.dropped), or XDP_ABORTED when a map
  * lookup fails, the frame is shorter than an Ethernet header, or the
  * configured CPU number is >= MAX_CPUS.
  */
 SEC("xdp_cpu_map1_touch_data")
 int  xdp_prognum1_touch_data(struct xdp_md *ctx)
 {
 	void *pkt_end   = (void *)(long)ctx->data_end;
 	void *pkt_start = (void *)(long)ctx->data;
 	struct ethhdr *l2 = pkt_start;
 	struct datarec *stats;
 	u32 *first_cpu;
 	u32 target;
 	u32 idx0 = 0;

 	/* Only use first entry in cpus_available */
 	first_cpu = bpf_map_lookup_elem(&cpus_available, &idx0);
 	if (!first_cpu)
 		return XDP_ABORTED;
 	target = *first_cpu;

 	/* Runt frames are aborted before they are counted */
 	if (l2 + 1 > pkt_end)
 		return XDP_ABORTED;

 	/* Count RX packet in map */
 	stats = bpf_map_lookup_elem(&rx_cnt, &idx0);
 	if (!stats)
 		return XDP_ABORTED;
 	stats->processed++;

 	/* Read packet data, and use it (drop non 802.3 Ethertypes) */
 	if (ntohs(l2->h_proto) < ETH_P_802_3_MIN) {
 		stats->dropped++;
 		return XDP_DROP;
 	}

 	if (target < MAX_CPUS)
 		return bpf_redirect_map(&cpu_map, target, 0);

 	stats->issue++;
 	return XDP_ABORTED;
 }

 /* XDP prog 2: spread frames round-robin over the first cpus_count
  * entries of cpus_available, using a per-CPU cursor (cpus_iterator).
  *
  * The cursor is clamped with >= rather than ==, so a cursor that is
  * already at or beyond cpus_count (for instance after the count was
  * lowered) restarts at index 0 instead of running past the configured
  * set.
  *
  * Returns the bpf_redirect_map() result, or XDP_ABORTED when a map
  * lookup fails or the selected CPU number is >= MAX_CPUS.
  */
 SEC("xdp_cpu_map2_round_robin")
 int  xdp_prognum2_round_robin(struct xdp_md *ctx)
 {
 	struct datarec *rec;
 	u32 cpu_dest;
 	u32 key0 = 0;

 	u32 *cpu_selected;
 	u32 *cpu_iterator;
 	u32 *cpu_max;
 	u32 cpu_idx;

 	cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
 	if (!cpu_max)
 		return XDP_ABORTED;

 	cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
 	if (!cpu_iterator)
 		return XDP_ABORTED;

 	/* Use the current cursor for this frame, unless it is stale */
 	cpu_idx = *cpu_iterator;
 	if (cpu_idx >= *cpu_max)
 		cpu_idx = 0;

 	/* Advance the cursor for the next frame, wrapping at cpus_count */
 	*cpu_iterator = cpu_idx + 1;
 	if (*cpu_iterator >= *cpu_max)
 		*cpu_iterator = 0;

 	cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
 	if (!cpu_selected)
 		return XDP_ABORTED;
 	cpu_dest = *cpu_selected;

 	/* Count RX packet in map */
 	rec = bpf_map_lookup_elem(&rx_cnt, &key0);
 	if (!rec)
 		return XDP_ABORTED;
 	rec->processed++;

 	if (cpu_dest >= MAX_CPUS) {
 		rec->issue++;
 		return XDP_ABORTED;
 	}

 	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
 }

 /* XDP prog 3: pick the destination CPU from the L4 protocol.
  *
  *   cpus_available index 2: ICMP and ICMPv6
  *   cpus_available index 1: UDP
  *   cpus_available index 0: TCP, ARP and everything else
  *
  * ip_proto starts out as 0 so that non-IP frames fall through to the
  * default index 0.  It used to start as IPPROTO_UDP, which sent ARP and
  * other non-IP frames to the UDP CPU and made the "cpu_idx = 0"
  * assignments for them dead stores.
  *
  * Returns the bpf_redirect_map() result, XDP_PASS for frames
  * parse_eth() rejects, or XDP_ABORTED when a map lookup fails or the
  * selected CPU number is >= MAX_CPUS.
  */
 SEC("xdp_cpu_map3_proto_separate")
 int  xdp_prognum3_proto_separate(struct xdp_md *ctx)
 {
 	void *data_end = (void *)(long)ctx->data_end;
 	void *data     = (void *)(long)ctx->data;
 	struct ethhdr *eth = data;
 	u8 ip_proto = 0;
 	struct datarec *rec;
 	u16 eth_proto = 0;
 	u64 l3_offset = 0;
 	u32 cpu_dest = 0;
 	u32 cpu_idx = 0;
 	u32 *cpu_lookup;
 	u32 key = 0;

 	/* Count RX packet in map */
 	rec = bpf_map_lookup_elem(&rx_cnt, &key);
 	if (!rec)
 		return XDP_ABORTED;
 	rec->processed++;

 	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
 		return XDP_PASS; /* Just skip */

 	/* Extract L4 protocol */
 	switch (eth_proto) {
 	case ETH_P_IP:
 		ip_proto = get_proto_ipv4(ctx, l3_offset);
 		break;
 	case ETH_P_IPV6:
 		ip_proto = get_proto_ipv6(ctx, l3_offset);
 		break;
 	case ETH_P_ARP:
 	default:
 		/* No L4 protocol: ip_proto stays 0, so index 0 is chosen */
 		break;
 	}

 	/* Choose CPU based on L4 protocol */
 	switch (ip_proto) {
 	case IPPROTO_ICMP:
 	case IPPROTO_ICMPV6:
 		cpu_idx = 2;
 		break;
 	case IPPROTO_UDP:
 		cpu_idx = 1;
 		break;
 	case IPPROTO_TCP:
 	default:
 		cpu_idx = 0;
 	}

 	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
 	if (!cpu_lookup)
 		return XDP_ABORTED;
 	cpu_dest = *cpu_lookup;

 	if (cpu_dest >= MAX_CPUS) {
 		rec->issue++;
 		return XDP_ABORTED;
 	}

 	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
 }

 /* XDP prog 4: same CPU selection as prog 3, plus a filter that drops
  * IPv4 UDP packets destined for port 9 (pktgen).
  *
  *   cpus_available index 2: ICMP and ICMPv6
  *   cpus_available index 1: UDP
  *   cpus_available index 0: TCP, ARP and everything else
  *
  * The port filter is applied to IPv4 only, because
  * get_dest_port_ipv4_udp() interprets the L3 header as an IPv4 header.
  * ip_proto starts out as 0 so non-IP frames use index 0 and never
  * reach the UDP branch.
  *
  * Returns the bpf_redirect_map() result, XDP_DROP for filtered packets
  * (counted in rx_cnt.dropped), XDP_PASS for frames parse_eth() rejects,
  * or XDP_ABORTED when a map lookup fails or the selected CPU number is
  * >= MAX_CPUS.
  */
 SEC("xdp_cpu_map4_ddos_filter_pktgen")
 int  xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
 {
 	void *data_end = (void *)(long)ctx->data_end;
 	void *data     = (void *)(long)ctx->data;
 	struct ethhdr *eth = data;
 	u8 ip_proto = 0;
 	struct datarec *rec;
 	u16 eth_proto = 0;
 	u64 l3_offset = 0;
 	u32 cpu_dest = 0;
 	u32 cpu_idx = 0;
 	u32 *cpu_lookup;
 	u32 key = 0;

 	/* Count RX packet in map */
 	rec = bpf_map_lookup_elem(&rx_cnt, &key);
 	if (!rec)
 		return XDP_ABORTED;
 	rec->processed++;

 	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
 		return XDP_PASS; /* Just skip */

 	/* Extract L4 protocol */
 	switch (eth_proto) {
 	case ETH_P_IP:
 		ip_proto = get_proto_ipv4(ctx, l3_offset);
 		break;
 	case ETH_P_IPV6:
 		ip_proto = get_proto_ipv6(ctx, l3_offset);
 		break;
 	case ETH_P_ARP:
 	default:
 		/* No L4 protocol: ip_proto stays 0, so index 0 is chosen */
 		break;
 	}

 	/* Choose CPU based on L4 protocol */
 	switch (ip_proto) {
 	case IPPROTO_ICMP:
 	case IPPROTO_ICMPV6:
 		cpu_idx = 2;
 		break;
 	case IPPROTO_UDP:
 		cpu_idx = 1;
 		/* DDoS filter UDP port 9 (pktgen), IPv4 only */
 		if (eth_proto == ETH_P_IP &&
 		    get_dest_port_ipv4_udp(ctx, l3_offset) == 9) {
 			rec->dropped++;
 			return XDP_DROP;
 		}
 		break;
 	case IPPROTO_TCP:
 	default:
 		cpu_idx = 0;
 	}

 	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
 	if (!cpu_lookup)
 		return XDP_ABORTED;
 	cpu_dest = *cpu_lookup;

 	if (cpu_dest >= MAX_CPUS) {
 		rec->issue++;
 		return XDP_ABORTED;
 	}

 	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
 }

 /* Hashing initval */
 #define INITVAL 15485863

 /* get_ipv4_hash_ip_pair - hash of an IPv4 address pair.
  * @ctx:    XDP context providing the packet bounds
  * @nh_off: byte offset of the IPv4 header from the start of packet data
  *
  * Source and destination address are added before hashing, so swapping
  * them gives the same result.  The L4 protocol number is mixed into the
  * hash seed.
  *
  * Returns the hash, or 0 when the IPv4 header does not fit inside the
  * packet.
  */
 static __always_inline
 u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
 {
 	void *pkt_end   = (void *)(long)ctx->data_end;
 	void *pkt_start = (void *)(long)ctx->data;
 	struct iphdr *ip4 = pkt_start + nh_off;
 	u32 addr_sum;

 	if (ip4 + 1 > pkt_end)
 		return 0;

 	addr_sum = ip4->saddr + ip4->daddr;

 	return SuperFastHash((char *)&addr_sum, 4, INITVAL + ip4->protocol);
 }

 /* get_ipv6_hash_ip_pair - hash of an IPv6 address pair.
  * @ctx:    XDP context providing the packet bounds
  * @nh_off: byte offset of the IPv6 header from the start of packet data
  *
  * All eight 32-bit words of the two addresses are summed (with u32
  * wrap-around) before hashing, so swapping source and destination gives
  * the same result.  The next-header value is mixed into the hash seed.
  *
  * Returns the hash, or 0 when the IPv6 header does not fit inside the
  * packet.
  */
 static __always_inline
 u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
 {
 	void *pkt_end   = (void *)(long)ctx->data_end;
 	void *pkt_start = (void *)(long)ctx->data;
 	struct ipv6hdr *ip6 = pkt_start + nh_off;
 	u32 src_sum;
 	u32 dst_sum;
 	u32 addr_sum;

 	if (ip6 + 1 > pkt_end)
 		return 0;

 	src_sum = ip6->saddr.s6_addr32[0] + ip6->saddr.s6_addr32[1] +
 		  ip6->saddr.s6_addr32[2] + ip6->saddr.s6_addr32[3];
 	dst_sum = ip6->daddr.s6_addr32[0] + ip6->daddr.s6_addr32[1] +
 		  ip6->daddr.s6_addr32[2] + ip6->daddr.s6_addr32[3];
 	addr_sum = src_sum + dst_sum;

 	return SuperFastHash((char *)&addr_sum, 4, INITVAL + ip6->nexthdr);
 }

 /* Load-Balance traffic based on hashing IP-addrs + L4-proto.  The
  * hashing scheme is symmetric, meaning swapping IP src/dest still hit
  * same CPU.
  *
  * Non-IP frames (ARP included) use hash 0 and therefore index 0 of
  * cpus_available.  A cpus_count of zero is treated as a configuration
  * error (counted in rx_cnt.issue) instead of being used as a modulus.
  *
  * Returns the bpf_redirect_map() result, XDP_PASS for frames
  * parse_eth() rejects, or XDP_ABORTED when a map lookup fails,
  * cpus_count is zero, or the selected CPU number is >= MAX_CPUS.
  */
 SEC("xdp_cpu_map5_lb_hash_ip_pairs")
 int  xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx)
 {
 	void *data_end = (void *)(long)ctx->data_end;
 	void *data     = (void *)(long)ctx->data;
 	struct ethhdr *eth = data;
 	struct datarec *rec;
 	u16 eth_proto = 0;
 	u64 l3_offset = 0;
 	u32 cpu_dest = 0;
 	u32 cpu_idx = 0;
 	u32 *cpu_lookup;
 	u32 *cpu_max;
 	u32 cpu_count;
 	u32 cpu_hash;
 	u32 key = 0;

 	/* Count RX packet in map */
 	rec = bpf_map_lookup_elem(&rx_cnt, &key);
 	if (!rec)
 		return XDP_ABORTED;
 	rec->processed++;

 	cpu_max = bpf_map_lookup_elem(&cpus_count, &key);
 	if (!cpu_max)
 		return XDP_ABORTED;

 	if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
 		return XDP_PASS; /* Just skip */

 	/* Hash for IPv4 and IPv6 */
 	switch (eth_proto) {
 	case ETH_P_IP:
 		cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset);
 		break;
 	case ETH_P_IPV6:
 		cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset);
 		break;
 	case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */
 	default:
 		cpu_hash = 0;
 	}

 	/* Never use an empty CPU set as modulus */
 	cpu_count = *cpu_max;
 	if (cpu_count == 0) {
 		rec->issue++;
 		return XDP_ABORTED;
 	}

 	/* Choose CPU based on hash */
 	cpu_idx = cpu_hash % cpu_count;

 	cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
 	if (!cpu_lookup)
 		return XDP_ABORTED;
 	cpu_dest = *cpu_lookup;

 	if (cpu_dest >= MAX_CPUS) {
 		rec->issue++;
 		return XDP_ABORTED;
 	}

 	return bpf_redirect_map(&cpu_map, cpu_dest, 0);
 }

 /* License string, emitted into the "license" ELF section.
  * NOTE(review): presumably read by the BPF loader at load time - confirm.
  */
 char _license[] SEC("license") = "GPL";

 /*** Trace point code ***/

 /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
  * Code in:                kernel/include/trace/events/xdp.h
  *
  * Context handed to the xdp_redirect* tracepoint programs below.  Only
  * `err` is read in this file (by xdp_redirect_collect_stat()).
  */
 struct xdp_redirect_ctx {
 	u64 __pad;	// First 8 bytes are not accessible by bpf code
 	int prog_id;	//	offset:8;  size:4; signed:1;
 	u32 act;	//	offset:12; size:4; signed:0;
 	int ifindex;	//	offset:16; size:4; signed:1;
 	int err;	//	offset:20; size:4; signed:1;
 	int to_ifindex;	//	offset:24; size:4; signed:1;
 	u32 map_id;	//	offset:28; size:4; signed:0;
 	int map_index;	//	offset:32; size:4; signed:1;
 };			//	offset:36

 /* Keys into redirect_err_cnt, chosen by xdp_redirect_collect_stat()
  * from the tracepoint's err field (zero -> SUCCESS, non-zero -> ERROR).
  */
 enum {
 	XDP_REDIRECT_SUCCESS = 0,
 	XDP_REDIRECT_ERROR = 1
 };

 /* xdp_redirect_collect_stat - count one redirect tracepoint event.
  * @ctx: tracepoint context; only ctx->err is inspected
  *
  * Increments the `dropped` counter of the redirect_err_cnt entry
  * XDP_REDIRECT_ERROR when ctx->err is non-zero, otherwise of the entry
  * XDP_REDIRECT_SUCCESS.
  *
  * Always returns 0, which indicates the event was filtered (no further
  * processing).  Returning 1 would allow e.g. a perf-record tracepoint
  * to see and record these events, but it doesn't work well in practice
  * as stopping perf-record also unloads this bpf_prog.  Plus, there is
  * additional overhead of doing so.
  */
 static __always_inline
 int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
 {
 	struct datarec *counter;
 	u32 slot;

 	slot = ctx->err ? XDP_REDIRECT_ERROR : XDP_REDIRECT_SUCCESS;

 	counter = bpf_map_lookup_elem(&redirect_err_cnt, &slot);
 	if (counter)
 		counter->dropped += 1;

 	return 0;
 }

 /* Program for the xdp:xdp_redirect_err tracepoint section; all work is
  * delegated to xdp_redirect_collect_stat(), whose result is returned.
  */
 SEC("tracepoint/xdp/xdp_redirect_err")
 int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
 {
 	return xdp_redirect_collect_stat(ctx);
 }

 /* Program for the xdp:xdp_redirect_map_err tracepoint section; all work
  * is delegated to xdp_redirect_collect_stat(), whose result is returned.
  */
 SEC("tracepoint/xdp/xdp_redirect_map_err")
 int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
 {
 	return xdp_redirect_collect_stat(ctx);
 }

 /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
  * Code in:                kernel/include/trace/events/xdp.h
  *
  * Context handed to trace_xdp_exception(); none of the fields are read
  * in this file.
  */
 struct xdp_exception_ctx {
 	u64 __pad;	// First 8 bytes are not accessible by bpf code
 	int prog_id;	//	offset:8;  size:4; signed:1;
 	u32 act;	//	offset:12; size:4; signed:0;
 	int ifindex;	//	offset:16; size:4; signed:1;
 };

 /* Count every xdp:xdp_exception event in exception_cnt[0].dropped.
  *
  * Returns 0 after counting, or 1 when the map lookup fails.
  */
 SEC("tracepoint/xdp/xdp_exception")
 int trace_xdp_exception(struct xdp_exception_ctx *ctx)
 {
 	u32 slot = 0;
 	struct datarec *counter;

 	counter = bpf_map_lookup_elem(&exception_cnt, &slot);
 	if (counter) {
 		counter->dropped += 1;
 		return 0;
 	}

 	return 1;
 }

 /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
  * Code in:         kernel/include/trace/events/xdp.h
  *
  * Context handed to trace_xdp_cpumap_enqueue(), which reads `to_cpu`,
  * `processed` and `drops`.
  */
 struct cpumap_enqueue_ctx {
 	u64 __pad;		// First 8 bytes are not accessible by bpf code
 	int map_id;		//	offset:8;  size:4; signed:1;
 	u32 act;		//	offset:12; size:4; signed:0;
 	int cpu;		//	offset:16; size:4; signed:1;
 	unsigned int drops;	//	offset:20; size:4; signed:0;
 	unsigned int processed;	//	offset:24; size:4; signed:0;
 	int to_cpu;		//	offset:28; size:4; signed:1;
 };

 /* Accumulate cpumap enqueue statistics per destination CPU.
  *
  * Adds the event's processed/drops counts to cpumap_enqueue_cnt[to_cpu]
  * and bumps `issue` once for every event that processed at least one
  * frame, so userspace can calculate the average bulk size.
  *
  * Inception: It's possible to detect overload situations, via this
  * tracepoint.  This can be used for creating a feedback loop to XDP,
  * which can take appropriate actions to mitigate this overload
  * situation.
  *
  * Returns 1 when to_cpu is >= MAX_CPUS, otherwise 0.
  */
 SEC("tracepoint/xdp/xdp_cpumap_enqueue")
 int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
 {
 	struct datarec *counter;
 	u32 dest_cpu = ctx->to_cpu;

 	if (dest_cpu >= MAX_CPUS)
 		return 1;

 	counter = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &dest_cpu);
 	if (counter) {
 		counter->processed += ctx->processed;
 		counter->dropped   += ctx->drops;

 		/* One more bulk event that actually moved frames */
 		if (ctx->processed > 0)
 			counter->issue += 1;
 	}

 	return 0;
 }

 /* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
  * Code in:         kernel/include/trace/events/xdp.h
  *
  * Context handed to trace_xdp_cpumap_kthread(), which reads `processed`,
  * `drops` and `sched`.
  */
 struct cpumap_kthread_ctx {
 	u64 __pad;		// First 8 bytes are not accessible by bpf code
 	int map_id;		//	offset:8;  size:4; signed:1;
 	u32 act;		//	offset:12; size:4; signed:0;
 	int cpu;		//	offset:16; size:4; signed:1;
 	unsigned int drops;	//	offset:20; size:4; signed:0;
 	unsigned int processed;	//	offset:24; size:4; signed:0;
 	int sched;		//	offset:28; size:4; signed:1;
 };

 /* Accumulate cpumap kthread statistics in cpumap_kthread_cnt[0].
  *
  * Adds the event's processed/drops counts and bumps `issue` whenever
  * the event's sched field is non-zero (times the kthread yielded the
  * CPU via a schedule call).
  *
  * Always returns 0.
  */
 SEC("tracepoint/xdp/xdp_cpumap_kthread")
 int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
 {
 	u32 slot = 0;
 	struct datarec *counter;

 	counter = bpf_map_lookup_elem(&cpumap_kthread_cnt, &slot);
 	if (counter) {
 		counter->processed += ctx->processed;
 		counter->dropped   += ctx->drops;

 		if (ctx->sched)
 			counter->issue++;
 	}

 	return 0;
 }
	/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
	*
	* GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
	*/
	#include <uapi/linux/if_ether.h>
	#include <uapi/linux/if_packet.h>
	#include <uapi/linux/if_vlan.h>
	#include <uapi/linux/ip.h>
	#include <uapi/linux/ipv6.h>
	#include <uapi/linux/in.h>
	#include <uapi/linux/tcp.h>
	#include <uapi/linux/udp.h>

	#include <uapi/linux/bpf.h>
	#include "bpf_helpers.h"
	#include "hash_func01.h"

	#define MAX_CPUS 64 /* WARNING - sync with _user.c */

	/* Special map type that can XDP_REDIRECT frames to another CPU.
	 * Every XDP program in this file hands a CPU number to
	 * bpf_redirect_map() with this map as the target.
	 *
	 * NOTE(review): the userspace loader presumably refers to the maps in
	 * this file by their definition order - confirm before reordering.
	 */
	struct bpf_map_def SEC("maps") cpu_map = {
	.type = BPF_MAP_TYPE_CPUMAP,
	.key_size = sizeof(u32),
	.value_size = sizeof(u32),
	.max_entries = MAX_CPUS,
	};

	/* Common stats data record to keep userspace more simple.
	 * The meaning of each counter depends on the map the record lives in.
	 */
	struct datarec {
	__u64 processed;
	__u64 dropped;
	__u64 issue;
	};

	/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
	 * feedback. Redirect TX errors can be caught via a tracepoint.
	 * Single entry (key 0), one copy per CPU.
	 */
	struct bpf_map_def SEC("maps") rx_cnt = {
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(struct datarec),
	.max_entries = 1,
	};

	/* Used by trace point: xdp_redirect_collect_stat() bumps `dropped` in
	 * entry XDP_REDIRECT_SUCCESS (0) or XDP_REDIRECT_ERROR (1).
	 */
	struct bpf_map_def SEC("maps") redirect_err_cnt = {
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(struct datarec),
	.max_entries = 2,
	/* TODO: have entries for all possible errno's */
	};

	/* Used by trace point: trace_xdp_cpumap_enqueue() keys this map by the
	 * destination CPU (`to_cpu`) of the enqueue event.
	 */
	struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(struct datarec),
	.max_entries = MAX_CPUS,
	};

	/* Used by trace point: trace_xdp_cpumap_kthread() accumulates into the
	 * single entry (key 0).
	 */
	struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(struct datarec),
	.max_entries = 1,
	};

	/* Set of maps controlling available CPU, and for iterating through
	 * selectable redirect CPUs.
	 *
	 * cpus_available: selection index -> CPU number handed to cpu_map.
	 * cpus_count:     number of usable entries in cpus_available.
	 * cpus_iterator:  per-CPU round-robin cursor.
	 *
	 * NOTE(review): none of the programs in this file write cpus_available
	 * or cpus_count; presumably userspace fills them in - confirm.
	 */
	struct bpf_map_def SEC("maps") cpus_available = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(u32),
	.max_entries = MAX_CPUS,
	};
	struct bpf_map_def SEC("maps") cpus_count = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(u32),
	.max_entries = 1,
	};
	struct bpf_map_def SEC("maps") cpus_iterator = {
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(u32),
	.max_entries = 1,
	};

	/* Used by trace point: trace_xdp_exception() bumps `dropped` in the
	 * single entry (key 0).
	 */
	struct bpf_map_def SEC("maps") exception_cnt = {
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(struct datarec),
	.max_entries = 1,
	};

	/* Helper parse functions */

	/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
	*
	* Returns false on error and non-supported ether-type
	*/
	/* VLAN tag that parse_eth() expects directly after the Ethernet header
	 * (or after a previous tag) when the ethertype is 802.1Q or 802.1ad.
	 */
	struct vlan_hdr {
	__be16 h_vlan_TCI; /* not read anywhere in this file */
	__be16 h_vlan_encapsulated_proto; /* ethertype following this tag */
	};

	/* parse_eth - walk the L2 header(s) of a frame and locate the L3 header.
	 * @eth:       start of the Ethernet header
	 * @data_end:  first byte past the packet data
	 * @eth_proto: out, ethertype of the payload in host byte order
	 * @l3_offset: out, byte offset from @eth to the L3 header
	 *
	 * At most two stacked VLAN tags (802.1Q or 802.1ad) are skipped.  The
	 * out parameters are written only when true is returned.
	 *
	 * Returns false when a header would extend past @data_end or when the
	 * outer ethertype is below ETH_P_802_3_MIN.
	 */
	static __always_inline
	bool parse_eth(struct ethhdr *eth, void *data_end,
		       u16 *eth_proto, u64 *l3_offset)
	{
		u16 eth_type;
		u64 offset;

		offset = sizeof(*eth);
		if ((void *)eth + offset > data_end)
			return false;

		eth_type = eth->h_proto;

		/* Skip non 802.3 Ethertypes */
		if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
			return false;

		/* Handle VLAN tagged packet */
		if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
			struct vlan_hdr *vlan_hdr;

			vlan_hdr = (void *)eth + offset;
			offset += sizeof(*vlan_hdr);
			if ((void *)eth + offset > data_end)
				return false;
			eth_type = vlan_hdr->h_vlan_encapsulated_proto;
		}
		/* Handle double VLAN tagged packet */
		if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
			struct vlan_hdr *vlan_hdr;

			vlan_hdr = (void *)eth + offset;
			offset += sizeof(*vlan_hdr);
			if ((void *)eth + offset > data_end)
				return false;
			eth_type = vlan_hdr->h_vlan_encapsulated_proto;
		}

		*eth_proto = ntohs(eth_type);
		*l3_offset = offset;
		return true;
	}

	/* get_dest_port_ipv4_udp - UDP destination port of an IPv4 packet.
	 * @ctx:    XDP context providing the packet bounds
	 * @nh_off: byte offset of the IPv4 header from the start of packet data
	 *
	 * The UDP header is located via the IPv4 header-length field (ihl), so
	 * packets carrying IP options are parsed correctly.
	 *
	 * Returns the destination port in host byte order, or 0 when the packet
	 * is truncated, is not UDP, or advertises an ihl below the 20-byte
	 * minimum.
	 */
	static __always_inline
	u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
	{
		void *data_end = (void *)(long)ctx->data_end;
		void *data     = (void *)(long)ctx->data;
		struct iphdr *iph = data + nh_off;
		struct udphdr *udph;
		u32 ip_hdr_len;

		if (iph + 1 > data_end)
			return 0;
		if (iph->protocol != IPPROTO_UDP)
			return 0;

		/* ihl counts 32-bit words; less than 5 words is a malformed header */
		ip_hdr_len = iph->ihl * 4;
		if (ip_hdr_len < sizeof(*iph))
			return 0;

		udph = (void *)iph + ip_hdr_len;
		if (udph + 1 > data_end)
			return 0;

		return ntohs(udph->dest);
	}

	/* get_proto_ipv4 - read the L4 protocol number of an IPv4 packet.
	 * @ctx:    XDP context providing the packet bounds
	 * @nh_off: byte offset of the IPv4 header from the start of packet data
	 *
	 * Returns iphdr->protocol, or 0 when the fixed-size IPv4 header does
	 * not fit inside the packet.
	 */
	static __always_inline
	int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
	{
		void *data_end = (void *)(long)ctx->data_end;
		void *data     = (void *)(long)ctx->data;
		struct iphdr *iph = data + nh_off;

		if (iph + 1 > data_end)
			return 0;
		return iph->protocol;
	}

	/* get_proto_ipv6 - read the next-header value of an IPv6 packet.
	 * @ctx:    XDP context providing the packet bounds
	 * @nh_off: byte offset of the IPv6 header from the start of packet data
	 *
	 * Returns ipv6hdr->nexthdr as-is (extension headers are not followed),
	 * or 0 when the fixed IPv6 header does not fit inside the packet.
	 */
	static __always_inline
	int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
	{
		void *data_end = (void *)(long)ctx->data_end;
		void *data     = (void *)(long)ctx->data;
		struct ipv6hdr *ip6h = data + nh_off;

		if (ip6h + 1 > data_end)
			return 0;
		return ip6h->nexthdr;
	}

	/* XDP prog 0: redirect every frame to the CPU stored in entry 0 of
	 * cpus_available, without reading any packet data.
	 *
	 * Returns the bpf_redirect_map() result, or XDP_ABORTED when a map
	 * lookup fails or the configured CPU number is >= MAX_CPUS (the latter
	 * is also counted in rx_cnt.issue).
	 */
	SEC("xdp_cpu_map0")
	int xdp_prognum0_no_touch(struct xdp_md *ctx)
	{
		struct datarec *rec;
		u32 *cpu_selected;
		u32 cpu_dest;
		u32 key = 0;

		/* Only use first entry in cpus_available */
		cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
		if (!cpu_selected)
			return XDP_ABORTED;
		cpu_dest = *cpu_selected;

		/* Count RX packet in map */
		rec = bpf_map_lookup_elem(&rx_cnt, &key);
		if (!rec)
			return XDP_ABORTED;
		rec->processed++;

		if (cpu_dest >= MAX_CPUS) {
			rec->issue++;
			return XDP_ABORTED;
		}

		return bpf_redirect_map(&cpu_map, cpu_dest, 0);
	}

	/* XDP prog 1: like prog 0, but additionally reads the Ethernet header
	 * and drops frames whose ethertype is below ETH_P_802_3_MIN.
	 *
	 * Returns the bpf_redirect_map() result, XDP_DROP for the dropped
	 * ethertypes (counted in rx_cnt.dropped), or XDP_ABORTED when a map
	 * lookup fails, the frame is shorter than an Ethernet header, or the
	 * configured CPU number is >= MAX_CPUS.
	 */
	SEC("xdp_cpu_map1_touch_data")
	int xdp_prognum1_touch_data(struct xdp_md *ctx)
	{
		void *data_end = (void *)(long)ctx->data_end;
		void *data     = (void *)(long)ctx->data;
		struct ethhdr *eth = data;
		struct datarec *rec;
		u32 *cpu_selected;
		u32 cpu_dest;
		u16 eth_type;
		u32 key = 0;

		/* Only use first entry in cpus_available */
		cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
		if (!cpu_selected)
			return XDP_ABORTED;
		cpu_dest = *cpu_selected;

		/* Validate packet length is minimum Eth header size */
		if (eth + 1 > data_end)
			return XDP_ABORTED;

		/* Count RX packet in map */
		rec = bpf_map_lookup_elem(&rx_cnt, &key);
		if (!rec)
			return XDP_ABORTED;
		rec->processed++;

		/* Read packet data, and use it (drop non 802.3 Ethertypes) */
		eth_type = eth->h_proto;
		if (ntohs(eth_type) < ETH_P_802_3_MIN) {
			rec->dropped++;
			return XDP_DROP;
		}

		if (cpu_dest >= MAX_CPUS) {
			rec->issue++;
			return XDP_ABORTED;
		}

		return bpf_redirect_map(&cpu_map, cpu_dest, 0);
	}

	/* XDP prog 2: spread frames round-robin over the first cpus_count
	 * entries of cpus_available, using a per-CPU cursor (cpus_iterator).
	 *
	 * The cursor VALUE is compared against the cpus_count VALUE (not the
	 * two map pointers), and with >= rather than ==, so a cursor that is
	 * already at or beyond cpus_count restarts at index 0.
	 *
	 * Returns the bpf_redirect_map() result, or XDP_ABORTED when a map
	 * lookup fails or the selected CPU number is >= MAX_CPUS.
	 */
	SEC("xdp_cpu_map2_round_robin")
	int xdp_prognum2_round_robin(struct xdp_md *ctx)
	{
		struct datarec *rec;
		u32 cpu_dest;
		u32 key0 = 0;

		u32 *cpu_selected;
		u32 *cpu_iterator;
		u32 *cpu_max;
		u32 cpu_idx;

		cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
		if (!cpu_max)
			return XDP_ABORTED;

		cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
		if (!cpu_iterator)
			return XDP_ABORTED;

		/* Use the current cursor for this frame, unless it is stale */
		cpu_idx = *cpu_iterator;
		if (cpu_idx >= *cpu_max)
			cpu_idx = 0;

		/* Advance the cursor for the next frame, wrapping at cpus_count */
		*cpu_iterator = cpu_idx + 1;
		if (*cpu_iterator >= *cpu_max)
			*cpu_iterator = 0;

		cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
		if (!cpu_selected)
			return XDP_ABORTED;
		cpu_dest = *cpu_selected;

		/* Count RX packet in map */
		rec = bpf_map_lookup_elem(&rx_cnt, &key0);
		if (!rec)
			return XDP_ABORTED;
		rec->processed++;

		if (cpu_dest >= MAX_CPUS) {
			rec->issue++;
			return XDP_ABORTED;
		}

		return bpf_redirect_map(&cpu_map, cpu_dest, 0);
	}

	/* XDP prog 3: pick the destination CPU from the L4 protocol.
	 *
	 *   cpus_available index 2: ICMP and ICMPv6
	 *   cpus_available index 1: UDP
	 *   cpus_available index 0: TCP, ARP and everything else
	 *
	 * ip_proto starts out as 0 so that non-IP frames fall through to the
	 * default index 0.  It used to start as IPPROTO_UDP, which sent ARP and
	 * other non-IP frames to the UDP CPU and made the "cpu_idx = 0"
	 * assignments for them dead stores.
	 *
	 * Returns the bpf_redirect_map() result, XDP_PASS for frames
	 * parse_eth() rejects, or XDP_ABORTED when a map lookup fails or the
	 * selected CPU number is >= MAX_CPUS.
	 */
	SEC("xdp_cpu_map3_proto_separate")
	int xdp_prognum3_proto_separate(struct xdp_md *ctx)
	{
		void *data_end = (void *)(long)ctx->data_end;
		void *data     = (void *)(long)ctx->data;
		struct ethhdr *eth = data;
		u8 ip_proto = 0;
		struct datarec *rec;
		u16 eth_proto = 0;
		u64 l3_offset = 0;
		u32 cpu_dest = 0;
		u32 cpu_idx = 0;
		u32 *cpu_lookup;
		u32 key = 0;

		/* Count RX packet in map */
		rec = bpf_map_lookup_elem(&rx_cnt, &key);
		if (!rec)
			return XDP_ABORTED;
		rec->processed++;

		if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
			return XDP_PASS; /* Just skip */

		/* Extract L4 protocol */
		switch (eth_proto) {
		case ETH_P_IP:
			ip_proto = get_proto_ipv4(ctx, l3_offset);
			break;
		case ETH_P_IPV6:
			ip_proto = get_proto_ipv6(ctx, l3_offset);
			break;
		case ETH_P_ARP:
		default:
			/* No L4 protocol: ip_proto stays 0, so index 0 is chosen */
			break;
		}

		/* Choose CPU based on L4 protocol */
		switch (ip_proto) {
		case IPPROTO_ICMP:
		case IPPROTO_ICMPV6:
			cpu_idx = 2;
			break;
		case IPPROTO_UDP:
			cpu_idx = 1;
			break;
		case IPPROTO_TCP:
		default:
			cpu_idx = 0;
		}

		cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
		if (!cpu_lookup)
			return XDP_ABORTED;
		cpu_dest = *cpu_lookup;

		if (cpu_dest >= MAX_CPUS) {
			rec->issue++;
			return XDP_ABORTED;
		}

		return bpf_redirect_map(&cpu_map, cpu_dest, 0);
	}

	/* XDP prog 4: same CPU selection as prog 3, plus a filter that drops
	 * IPv4 UDP packets destined for port 9 (pktgen).
	 *
	 *   cpus_available index 2: ICMP and ICMPv6
	 *   cpus_available index 1: UDP
	 *   cpus_available index 0: TCP, ARP and everything else
	 *
	 * The port filter is applied to IPv4 only, because
	 * get_dest_port_ipv4_udp() interprets the L3 header as an IPv4 header.
	 * ip_proto starts out as 0 so non-IP frames use index 0 and never
	 * reach the UDP branch.
	 *
	 * Returns the bpf_redirect_map() result, XDP_DROP for filtered packets
	 * (counted in rx_cnt.dropped), XDP_PASS for frames parse_eth() rejects,
	 * or XDP_ABORTED when a map lookup fails or the selected CPU number is
	 * >= MAX_CPUS.
	 */
	SEC("xdp_cpu_map4_ddos_filter_pktgen")
	int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
	{
		void *data_end = (void *)(long)ctx->data_end;
		void *data     = (void *)(long)ctx->data;
		struct ethhdr *eth = data;
		u8 ip_proto = 0;
		struct datarec *rec;
		u16 eth_proto = 0;
		u64 l3_offset = 0;
		u32 cpu_dest = 0;
		u32 cpu_idx = 0;
		u32 *cpu_lookup;
		u32 key = 0;

		/* Count RX packet in map */
		rec = bpf_map_lookup_elem(&rx_cnt, &key);
		if (!rec)
			return XDP_ABORTED;
		rec->processed++;

		if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
			return XDP_PASS; /* Just skip */

		/* Extract L4 protocol */
		switch (eth_proto) {
		case ETH_P_IP:
			ip_proto = get_proto_ipv4(ctx, l3_offset);
			break;
		case ETH_P_IPV6:
			ip_proto = get_proto_ipv6(ctx, l3_offset);
			break;
		case ETH_P_ARP:
		default:
			/* No L4 protocol: ip_proto stays 0, so index 0 is chosen */
			break;
		}

		/* Choose CPU based on L4 protocol */
		switch (ip_proto) {
		case IPPROTO_ICMP:
		case IPPROTO_ICMPV6:
			cpu_idx = 2;
			break;
		case IPPROTO_UDP:
			cpu_idx = 1;
			/* DDoS filter UDP port 9 (pktgen), IPv4 only */
			if (eth_proto == ETH_P_IP &&
			    get_dest_port_ipv4_udp(ctx, l3_offset) == 9) {
				rec->dropped++;
				return XDP_DROP;
			}
			break;
		case IPPROTO_TCP:
		default:
			cpu_idx = 0;
		}

		cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
		if (!cpu_lookup)
			return XDP_ABORTED;
		cpu_dest = *cpu_lookup;

		if (cpu_dest >= MAX_CPUS) {
			rec->issue++;
			return XDP_ABORTED;
		}

		return bpf_redirect_map(&cpu_map, cpu_dest, 0);
	}

	/* Hashing initval */
	#define INITVAL 15485863

	/* get_ipv4_hash_ip_pair - hash of an IPv4 address pair.
	 * @ctx:    XDP context providing the packet bounds
	 * @nh_off: byte offset of the IPv4 header from the start of packet data
	 *
	 * Source and destination address are added before hashing, so swapping
	 * them gives the same result.  The L4 protocol number is mixed into the
	 * hash seed.
	 *
	 * Returns the hash, or 0 when the IPv4 header does not fit inside the
	 * packet.
	 */
	static __always_inline
	u32 get_ipv4_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
	{
		void *data_end = (void *)(long)ctx->data_end;
		void *data     = (void *)(long)ctx->data;
		struct iphdr *iph = data + nh_off;
		u32 cpu_hash;

		if (iph + 1 > data_end)
			return 0;

		cpu_hash = iph->saddr + iph->daddr;
		cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol);

		return cpu_hash;
	}

	/* get_ipv6_hash_ip_pair - hash of an IPv6 address pair.
	 * @ctx:    XDP context providing the packet bounds
	 * @nh_off: byte offset of the IPv6 header from the start of packet data
	 *
	 * All eight 32-bit words of the two addresses are summed (with u32
	 * wrap-around) before hashing, so swapping source and destination gives
	 * the same result.  The next-header value is mixed into the hash seed.
	 *
	 * Returns the hash, or 0 when the IPv6 header does not fit inside the
	 * packet.
	 */
	static __always_inline
	u32 get_ipv6_hash_ip_pair(struct xdp_md *ctx, u64 nh_off)
	{
		void *data_end = (void *)(long)ctx->data_end;
		void *data     = (void *)(long)ctx->data;
		struct ipv6hdr *ip6h = data + nh_off;
		u32 cpu_hash;

		if (ip6h + 1 > data_end)
			return 0;

		cpu_hash  = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0];
		cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1];
		cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2];
		cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3];
		cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + ip6h->nexthdr);

		return cpu_hash;
	}

	/* Load-Balance traffic based on hashing IP-addrs + L4-proto. The
	 * hashing scheme is symmetric, meaning swapping IP src/dest still hit
	 * same CPU.
	 *
	 * Non-IP frames (ARP included) use hash 0 and therefore index 0 of
	 * cpus_available.  A cpus_count of zero is treated as a configuration
	 * error (counted in rx_cnt.issue) instead of being used as a modulus.
	 *
	 * Returns the bpf_redirect_map() result, XDP_PASS for frames
	 * parse_eth() rejects, or XDP_ABORTED when a map lookup fails,
	 * cpus_count is zero, or the selected CPU number is >= MAX_CPUS.
	 */
	SEC("xdp_cpu_map5_lb_hash_ip_pairs")
	int xdp_prognum5_lb_hash_ip_pairs(struct xdp_md *ctx)
	{
		void *data_end = (void *)(long)ctx->data_end;
		void *data     = (void *)(long)ctx->data;
		struct ethhdr *eth = data;
		struct datarec *rec;
		u16 eth_proto = 0;
		u64 l3_offset = 0;
		u32 cpu_dest = 0;
		u32 cpu_idx = 0;
		u32 *cpu_lookup;
		u32 *cpu_max;
		u32 cpu_count;
		u32 cpu_hash;
		u32 key = 0;

		/* Count RX packet in map */
		rec = bpf_map_lookup_elem(&rx_cnt, &key);
		if (!rec)
			return XDP_ABORTED;
		rec->processed++;

		cpu_max = bpf_map_lookup_elem(&cpus_count, &key);
		if (!cpu_max)
			return XDP_ABORTED;

		if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
			return XDP_PASS; /* Just skip */

		/* Hash for IPv4 and IPv6 */
		switch (eth_proto) {
		case ETH_P_IP:
			cpu_hash = get_ipv4_hash_ip_pair(ctx, l3_offset);
			break;
		case ETH_P_IPV6:
			cpu_hash = get_ipv6_hash_ip_pair(ctx, l3_offset);
			break;
		case ETH_P_ARP: /* ARP packet handled on CPU idx 0 */
		default:
			cpu_hash = 0;
		}

		/* Never use an empty CPU set as modulus */
		cpu_count = *cpu_max;
		if (cpu_count == 0) {
			rec->issue++;
			return XDP_ABORTED;
		}

		/* Choose CPU based on hash */
		cpu_idx = cpu_hash % cpu_count;

		cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
		if (!cpu_lookup)
			return XDP_ABORTED;
		cpu_dest = *cpu_lookup;

		if (cpu_dest >= MAX_CPUS) {
			rec->issue++;
			return XDP_ABORTED;
		}

		return bpf_redirect_map(&cpu_map, cpu_dest, 0);
	}

	/* License string, emitted into the "license" ELF section.
	 * NOTE(review): presumably read by the BPF loader at load time - confirm.
	 */
	char _license[] SEC("license") = "GPL";

	/* Trace point code */

	/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
	 * Code in: kernel/include/trace/events/xdp.h
	 *
	 * Context handed to the xdp_redirect* tracepoint programs below.  Only
	 * `err` is read in this file (by xdp_redirect_collect_stat()).
	 */
	struct xdp_redirect_ctx {
	u64 __pad; // First 8 bytes are not accessible by bpf code
	int prog_id; // offset:8; size:4; signed:1;
	u32 act; // offset:12; size:4; signed:0;
	int ifindex; // offset:16; size:4; signed:1;
	int err; // offset:20; size:4; signed:1;
	int to_ifindex; // offset:24; size:4; signed:1;
	u32 map_id; // offset:28; size:4; signed:0;
	int map_index; // offset:32; size:4; signed:1;
	}; // offset:36

	/* Keys into redirect_err_cnt, chosen by xdp_redirect_collect_stat()
	 * from the tracepoint's err field (zero -> SUCCESS, non-zero -> ERROR).
	 */
	enum {
	XDP_REDIRECT_SUCCESS = 0,
	XDP_REDIRECT_ERROR = 1
	};

	/* xdp_redirect_collect_stat - count one redirect tracepoint event.
	 * @ctx: tracepoint context; only ctx->err is inspected
	 *
	 * Increments the `dropped` counter of the redirect_err_cnt entry
	 * XDP_REDIRECT_ERROR when ctx->err is non-zero, otherwise of the entry
	 * XDP_REDIRECT_SUCCESS.
	 *
	 * Always returns 0, which indicates the event was filtered (no further
	 * processing).  Returning 1 would allow e.g. a perf-record tracepoint
	 * to see and record these events, but it doesn't work well in practice
	 * as stopping perf-record also unloads this bpf_prog.  Plus, there is
	 * additional overhead of doing so.
	 */
	static __always_inline
	int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
	{
		struct datarec *counter;
		u32 slot;

		slot = ctx->err ? XDP_REDIRECT_ERROR : XDP_REDIRECT_SUCCESS;

		counter = bpf_map_lookup_elem(&redirect_err_cnt, &slot);
		if (counter)
			counter->dropped += 1;

		return 0;
	}

	/* Program for the xdp:xdp_redirect_err tracepoint section; all work is
	 * delegated to xdp_redirect_collect_stat(), whose result is returned.
	 */
	SEC("tracepoint/xdp/xdp_redirect_err")
	int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
	{
	return xdp_redirect_collect_stat(ctx);
	}

	/* Program for the xdp:xdp_redirect_map_err tracepoint section; all work
	 * is delegated to xdp_redirect_collect_stat(), whose result is returned.
	 */
	SEC("tracepoint/xdp/xdp_redirect_map_err")
	int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
	{
	return xdp_redirect_collect_stat(ctx);
	}

	/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
	 * Code in: kernel/include/trace/events/xdp.h
	 *
	 * Context handed to trace_xdp_exception(); none of the fields are read
	 * in this file.
	 */
	struct xdp_exception_ctx {
	u64 __pad; // First 8 bytes are not accessible by bpf code
	int prog_id; // offset:8; size:4; signed:1;
	u32 act; // offset:12; size:4; signed:0;
	int ifindex; // offset:16; size:4; signed:1;
	};

	/* Count every xdp:xdp_exception event in exception_cnt[0].dropped.
	 *
	 * Returns 0 after counting, or 1 when the map lookup fails.
	 */
	SEC("tracepoint/xdp/xdp_exception")
	int trace_xdp_exception(struct xdp_exception_ctx *ctx)
	{
		u32 slot = 0;
		struct datarec *counter;

		counter = bpf_map_lookup_elem(&exception_cnt, &slot);
		if (counter) {
			counter->dropped += 1;
			return 0;
		}

		return 1;
	}

	/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
	 * Code in: kernel/include/trace/events/xdp.h
	 *
	 * Context handed to trace_xdp_cpumap_enqueue(), which reads `to_cpu`,
	 * `processed` and `drops`.
	 */
	struct cpumap_enqueue_ctx {
	u64 __pad; // First 8 bytes are not accessible by bpf code
	int map_id; // offset:8; size:4; signed:1;
	u32 act; // offset:12; size:4; signed:0;
	int cpu; // offset:16; size:4; signed:1;
	unsigned int drops; // offset:20; size:4; signed:0;
	unsigned int processed; // offset:24; size:4; signed:0;
	int to_cpu; // offset:28; size:4; signed:1;
	};

	/* Accumulate cpumap enqueue statistics per destination CPU.
	 *
	 * Adds the event's processed/drops counts to cpumap_enqueue_cnt[to_cpu]
	 * and bumps `issue` once for every event that processed at least one
	 * frame, so userspace can calculate the average bulk size.
	 *
	 * Inception: It's possible to detect overload situations, via this
	 * tracepoint. This can be used for creating a feedback loop to XDP,
	 * which can take appropriate actions to mitigate this overload
	 * situation.
	 *
	 * Returns 1 when to_cpu is >= MAX_CPUS, otherwise 0.
	 */
	SEC("tracepoint/xdp/xdp_cpumap_enqueue")
	int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
	{
		struct datarec *counter;
		u32 dest_cpu = ctx->to_cpu;

		if (dest_cpu >= MAX_CPUS)
			return 1;

		counter = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &dest_cpu);
		if (counter) {
			counter->processed += ctx->processed;
			counter->dropped += ctx->drops;

			/* One more bulk event that actually moved frames */
			if (ctx->processed > 0)
				counter->issue += 1;
		}

		return 0;
	}

	/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
	 * Code in: kernel/include/trace/events/xdp.h
	 *
	 * Context handed to trace_xdp_cpumap_kthread(), which reads `processed`,
	 * `drops` and `sched`.
	 */
	struct cpumap_kthread_ctx {
	u64 __pad; // First 8 bytes are not accessible by bpf code
	int map_id; // offset:8; size:4; signed:1;
	u32 act; // offset:12; size:4; signed:0;
	int cpu; // offset:16; size:4; signed:1;
	unsigned int drops; // offset:20; size:4; signed:0;
	unsigned int processed; // offset:24; size:4; signed:0;
	int sched; // offset:28; size:4; signed:1;
	};

	/* Accumulate cpumap kthread statistics in cpumap_kthread_cnt[0].
	 *
	 * Adds the event's processed/drops counts and bumps `issue` whenever
	 * the event's sched field is non-zero (times the kthread yielded the
	 * CPU via a schedule call).
	 *
	 * Always returns 0.
	 */
	SEC("tracepoint/xdp/xdp_cpumap_kthread")
	int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
	{
		u32 slot = 0;
		struct datarec *counter;

		counter = bpf_map_lookup_elem(&cpumap_kthread_cnt, &slot);
		if (counter) {
			counter->processed += ctx->processed;
			counter->dropped += ctx->drops;

			if (ctx->sched)
				counter->issue++;
		}

		return 0;
	}