blob: 01a201d34453e14c4a247a2a5f274fa541fdbde6 [file] [log] [blame] [edit]
--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
local ffi = require('ffi')
local BPF = ffi.typeof('struct bpf')
-- C declarations for the types this module dissects:
--  * `struct sk_buff`: a flat record of 32-bit fields, exported below as the
--    `M.skb` context type (NOTE(review): presumably mirrors the kernel's
--    `struct __sk_buff`; confirm the field order against the kernel headers).
--  * packed wire-format headers (`eth_t`, `dot1q_t`, `arp_t`, `ip_t`, `icmp_t`,
--    `ip6_t`, `ip6_opt_t`, `icmp6_t`, `udp_t`, `tcp_t`, `vxlan_t`) and the
--    one-byte `net_off_t` marker, each used as a dissector type further down.
-- The text between the long brackets is handed to the FFI C parser verbatim,
-- so it is program data: edit it as C, not as Lua.
ffi.cdef [[
struct sk_buff {
uint32_t len;
uint32_t pkt_type;
uint32_t mark;
uint32_t queue_mapping;
uint32_t protocol;
uint32_t vlan_present;
uint32_t vlan_tci;
uint32_t vlan_proto;
uint32_t priority;
uint32_t ingress_ifindex;
uint32_t ifindex;
uint32_t tc_index;
uint32_t cb[5];
uint32_t hash;
uint32_t tc_classid;
uint32_t data;
uint32_t data_end;
uint32_t napi_id;
/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
uint32_t family;
uint32_t remote_ip4; /* Stored in network byte order */
uint32_t local_ip4; /* Stored in network byte order */
uint32_t remote_ip6[4]; /* Stored in network byte order */
uint32_t local_ip6[4]; /* Stored in network byte order */
uint32_t remote_port; /* Stored in network byte order */
uint32_t local_port; /* stored in host byte order */
/* ... here. */
uint32_t data_meta;
};
struct net_off_t {
uint8_t ver:4;
} __attribute__((packed));
struct eth_t {
uint8_t dst[6];
uint8_t src[6];
uint16_t type;
} __attribute__((packed));
struct dot1q_t {
uint16_t pri:3;
uint16_t cfi:1;
uint16_t vlanid:12;
uint16_t type;
} __attribute__((packed));
struct arp_t {
uint16_t htype;
uint16_t ptype;
uint8_t hlen;
uint8_t plen;
uint16_t oper;
uint8_t sha[6];
uint32_t spa;
uint8_t tha[6];
uint32_t tpa;
} __attribute__((packed));
struct ip_t {
uint8_t ver:4;
uint8_t hlen:4;
uint8_t tos;
uint16_t tlen;
uint16_t identification;
uint16_t ffo_unused:1;
uint16_t df:1;
uint16_t mf:1;
uint16_t foffset:13;
uint8_t ttl;
uint8_t proto;
uint16_t hchecksum;
uint32_t src;
uint32_t dst;
} __attribute__((packed));
struct icmp_t {
uint8_t type;
uint8_t code;
uint16_t checksum;
} __attribute__((packed));
struct ip6_t {
uint32_t ver:4;
uint32_t priority:8;
uint32_t flow_label:20;
uint16_t payload_len;
uint8_t next_header;
uint8_t hop_limit;
uint64_t src_hi;
uint64_t src_lo;
uint64_t dst_hi;
uint64_t dst_lo;
} __attribute__((packed));
struct ip6_opt_t {
uint8_t next_header;
uint8_t ext_len;
uint8_t pad[6];
} __attribute__((packed));
struct icmp6_t {
uint8_t type;
uint8_t code;
uint16_t checksum;
} __attribute__((packed));
struct udp_t {
uint16_t src_port;
uint16_t dst_port;
uint16_t length;
uint16_t crc;
} __attribute__((packed));
struct tcp_t {
uint16_t src_port;
uint16_t dst_port;
uint32_t seq_num;
uint32_t ack_num;
uint8_t offset:4;
uint8_t reserved:4;
uint8_t flag_cwr:1;
uint8_t flag_ece:1;
uint8_t flag_urg:1;
uint8_t flag_ack:1;
uint8_t flag_psh:1;
uint8_t flag_rst:1;
uint8_t flag_syn:1;
uint8_t flag_fin:1;
uint16_t rcv_wnd;
uint16_t cksum;
uint16_t urg_ptr;
} __attribute__((packed));
struct vxlan_t {
uint32_t rsv1:4;
uint32_t iflag:1;
uint32_t rsv2:3;
uint32_t rsv3:24;
uint32_t key:24;
uint32_t rsv4:8;
} __attribute__((packed));
]]
-- Architecture-specific ptrace register layout
-- Declares `struct pt_regs` for the running architecture and fills
-- `parm_to_reg`, the table that translates symbolic names (`parm1`..`parm5`,
-- `ret`, `fp`) into field names of that struct.
local S = require('syscall')
local arch = S.abi.arch
-- Stays empty on architectures without a layout below, in which case every
-- symbolic lookup through the `struct pt_regs` metatype fails.
local parm_to_reg = {}
if arch == 'x64' then
ffi.cdef [[
struct pt_regs {
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long bp;
unsigned long bx;
unsigned long r11;
unsigned long r10;
unsigned long r9;
unsigned long r8;
unsigned long ax;
unsigned long cx;
unsigned long dx;
unsigned long si;
unsigned long di;
unsigned long orig_ax;
unsigned long ip;
unsigned long cs;
unsigned long flags;
unsigned long sp;
unsigned long ss;
};]]
-- Five parameter slots are mapped (di, si, dx, cx, r8); `ret` resolves to
-- `sp` and `fp` to `bp`.
-- NOTE(review): `ret = 'sp'` presumably relies on the return address being at
-- the stack pointer on function entry -- confirm against the probe code.
parm_to_reg = {parm1='di', parm2='si', parm3='dx', parm4='cx', parm5='r8', ret='sp', fp='bp'}
else
-- Unknown architecture: declare an empty struct so the type still exists.
ffi.cdef 'struct pt_regs {};'
end
-- Map symbolic registers to architecture ABI
-- The `__index` handler translates a symbolic name (`parm1`, `ret`, `fp`, ...)
-- into the register name recorded in `parm_to_reg`, and raises
-- 'no such register: <name>' when the current architecture has no mapping.
ffi.metatype('struct pt_regs', {
  __index = function (_ --[[t]], k)
    local reg = parm_to_reg[k]
    if reg then
      return reg
    end
    -- Build the message only on the failure path (the hot path no longer
    -- allocates a string per lookup); tostring() keeps the diagnostic intact
    -- for keys that cannot be concatenated directly. Level 0 reproduces the
    -- position-free message that assert() used to raise.
    error('no such register: ' .. tostring(k), 0)
  end,
})
local M = {}
-- Dissector interface
-- Derives variable `dst` from `src` and advances it to the header named by
-- `field`.
--   type   value stored as the new variable's `__dissector` once it is advanced
--   e      code emitter (uses `e.V` and `e.vcopy`)
--   dst    name of the variable to create
--   src    name of the variable to derive from
--   field  key of the transition handler looked up in the source's `__dissector`
local function dissector(type, e, dst, src, field)
  -- Remember the source descriptor before the copy: its handlers drive the step
  local origin = e.V[src].const
  e.vcopy(dst, src)
  -- Give the new variable a private descriptor so that advancing it
  -- leaves the source variable untouched
  local inherited = e.V[src].const
  e.V[dst].const = {
    off = inherited.off,
    __dissector = inherited.__dissector,
  }
  -- Let the source type's handler move `dst` forward to the next header
  local advance = origin.__dissector[field]
  advance(e, dst)
  -- From here on the variable is interpreted as the requested type
  e.V[dst].const.__dissector = type
end
M.dissector = dissector
-- Get current effective offset, load field value at an offset relative to it and
-- add its value to compute next effective offset (e.g. udp_off = ip_off + pkt[ip_off].hlen)
--   e      code emitter (uses e.V, e.vreg, e.emit, e.const_width and e.tmpvar)
--   var    name of the dissector variable to advance
--   type   ctype of the length field; its size selects the load width
--   off    offset of the length field relative to the current header (default 0)
--   mask   optional bit mask applied to the loaded value
--   shift  optional shift count: positive shifts left, negative shifts right
local function next_offset(e, var, type, off, mask, shift)
  local d = e.V[var].const
  -- `off` is optional. Resolve the default once: the former inline expression
  -- `d.off + off or 0` parsed as `(d.off + off) or 0` and raised an arithmetic
  -- error for a nil `off` instead of falling back to 0.
  local field_off = off or 0
  -- Materialize relative offset value in R0
  local dst_reg, tmp_reg
  if d.off then
    -- Base offset is a compile-time constant: load from an absolute position
    dst_reg = e.vreg(var, 0, true)
    tmp_reg = dst_reg -- Use target register to avoid copy
    e.emit(BPF.LD + BPF.ABS + e.const_width[ffi.sizeof(type)], tmp_reg, 0, 0, d.off + field_off)
  else
    -- Base offset lives in a register: load indirectly through it
    tmp_reg = e.vreg(e.tmpvar, 0, true, type) -- Reserve R0 for temporary relative offset
    dst_reg = e.vreg(var) -- Must rematerialize (if it was spilled by tmp var)
    e.emit(BPF.LD + BPF.IND + e.const_width[ffi.sizeof(type)], tmp_reg, dst_reg, 0, field_off)
  end
  -- Finalize relative offset
  if mask then
    e.emit(BPF.ALU + BPF.AND + BPF.K, tmp_reg, 0, 0, mask)
  end
  if shift and shift ~= 0 then
    local op = BPF.LSH
    if shift < 0 then
      op = BPF.RSH
      shift = -shift
    end
    e.emit(BPF.ALU + op + BPF.K, tmp_reg, 0, 0, shift)
  end
  -- Add to base offset to turn it into effective address
  if dst_reg ~= tmp_reg then
    e.emit(BPF.ALU + BPF.ADD + BPF.X, dst_reg, tmp_reg, 0, 0)
  else
    -- Same register holds the loaded length, so add the constant base to it
    e.emit(BPF.ALU + BPF.ADD + BPF.K, dst_reg, 0, 0, d.off)
  end
  -- Discard temporary allocations
  -- (the offset is no longer a compile-time constant from here on)
  d.off = nil
  e.V[e.tmpvar].reg = nil
end
-- Advance dissector variable `var` by a fixed number of bytes `off`.
-- A compile-time offset is folded in place; otherwise an add-immediate
-- instruction is emitted against the register holding the variable.
local function next_skip(e, var, off)
  local const = e.V[var].const
  if const.off then
    -- Offset still known statically: no code needs to be generated
    const.off = const.off + off
    return
  end
  local reg = e.vreg(var)
  e.emit(BPF.ALU64 + BPF.ADD + BPF.K, reg, 0, 0, off)
end
-- Step over the Ethernet header of dissector variable `dst`.
-- The header has a fixed size, so only the constant offset is bumped.
local function skip_eth(e, dst)
  local const = e.V[dst].const
  local eth_len = ffi.sizeof('struct eth_t')
  const.off = const.off + eth_len
end
-- Export types
-- Build a type descriptor for the C type named by `typestr`.
-- `t` is an optional table of extra attributes; it is updated in place (a new
-- table is used when omitted) with `__dissector` set to the resolved ctype,
-- and returned.
function M.type(typestr, t)
  local descriptor = t or {}
  descriptor.__dissector = ffi.typeof(typestr)
  return descriptor
end
-- Context and packet root types
M.skb = M.type('struct sk_buff', {source='ptr_to_ctx'})
M.pt_regs = M.type('struct pt_regs', {source='ptr_to_probe'})
M.pkt = M.type('struct eth_t', {off=0, source='ptr_to_pkt'}) -- skb needs special accessors

-- Returns a constructor that forwards its arguments to `dissector`,
-- resolving the named C type on each call.
local function dissect_as(ctype_name)
  return function (...)
    return dissector(ffi.typeof(ctype_name), ...)
  end
end

-- M.eth = function (...) return dissector(ffi.typeof('struct eth_t'), ...) end
M.dot1q = dissect_as('struct dot1q_t')
M.arp = dissect_as('struct arp_t')
M.icmp = dissect_as('struct icmp_t')
M.ip = dissect_as('struct ip_t')
M.icmp6 = dissect_as('struct icmp6_t')
M.ip6 = dissect_as('struct ip6_t')
M.ip6_opt = dissect_as('struct ip6_opt_t')
M.udp = dissect_as('struct udp_t')
M.tcp = dissect_as('struct tcp_t')
M.vxlan = dissect_as('struct vxlan_t')
M.data = dissect_as('uint8_t')
M.net_off = dissect_as('struct net_off_t')
-- Metatables
-- Transitions available from an Ethernet header: both IP versions follow the
-- fixed-size header directly, `net_off` skips by the `BPF.NET_OFF` constant.
local eth_next = {}
eth_next.ip = skip_eth
eth_next.ip6 = skip_eth
function eth_next.net_off(e, dst)
  next_skip(e, dst, BPF.NET_OFF)
end
ffi.metatype(ffi.typeof('struct eth_t'), {__index = eth_next})
-- Transitions from the network-offset marker: nothing is skipped for either
-- IP version, so both handlers are no-ops.
local function stay_in_place() end
ffi.metatype(ffi.typeof('struct net_off_t'), {
  __index = {
    ip = stay_in_place,
    ip6 = stay_in_place,
  }
})
-- Skip IP header length (stored as number of words)
-- e.g. hlen = 5, Header Length = 5 x sizeof(u32) = 20 octets
-- The length is read from the first header byte: keep its low nibble (0x0f)
-- and shift left by 2 (multiplication by 4)
local function skip_ip_header(e, dst)
  next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2)
end
-- Every supported upper-layer protocol starts right after the IPv4 header
ffi.metatype(ffi.typeof('struct ip_t'), {
  __index = {
    icmp = skip_ip_header,
    udp = skip_ip_header,
    tcp = skip_ip_header,
  }
})
-- Skip fixed IPv6 header length (40 bytes)
-- The caller must check the value of `next_header` to skip any extension headers
local function skip_ip6_header(e, dst)
  next_skip(e, dst, ffi.sizeof('struct ip6_t'))
end
ffi.metatype(ffi.typeof('struct ip6_t'), {
  __index = {
    icmp6 = skip_ip6_header,
    udp = skip_ip6_header,
    tcp = skip_ip6_header,
    ip6_opt = skip_ip6_header,
  }
})
local ip6_opt_ext_len_off = ffi.offsetof('struct ip6_opt_t', 'ext_len')
-- Skip one IPv6 extension header (length taken from field `ext_len`).
-- `ext_len` counts 8-octet units and does not include the first 8 octets of
-- the header, so the header occupies (ext_len << 3) + 8 bytes. Adding the raw
-- field value, as was done before, left the offset inside the same header
-- (an 8-byte header with ext_len = 0 was not skipped at all).
-- NOTE(review): this matches the generic extension header layout (hop-by-hop,
-- routing, destination options); the Fragment and Authentication headers
-- encode their length differently and still need handling by the caller.
local function skip_ip6_opt(e, dst)
  -- Variable part: load `ext_len` and scale it to bytes (shift left by 3)
  next_offset(e, dst, ffi.typeof('uint8_t'), ip6_opt_ext_len_off, nil, 3)
  -- Fixed part: the first 8 octets that `ext_len` does not account for
  next_skip(e, dst, ffi.sizeof('struct ip6_opt_t'))
end
ffi.metatype(ffi.typeof('struct ip6_opt_t'), {
  __index = {
    icmp6 = skip_ip6_opt,
    udp = skip_ip6_opt,
    tcp = skip_ip6_opt,
    ip6_opt = skip_ip6_opt,
  }
})
-- Skip TCP header length (stored as number of words)
-- e.g. hlen = 5, Header Length = 5 x sizeof(u32) = 20 octets
-- The length is taken from the byte holding the `offset` field: keep its
-- high nibble (0xf0) and shift right by 2, i.e. (x >> 4) * 4
local function skip_tcp_header(e, dst)
  -- Only the byte offset is kept from the values ffi.offsetof returns
  local len_byte_off = ffi.offsetof('struct tcp_t', 'offset')
  next_offset(e, dst, ffi.typeof('uint8_t'), len_byte_off, 0xf0, -2)
end
ffi.metatype(ffi.typeof('struct tcp_t'), {
  __index = {
    data = skip_tcp_header,
  }
})
-- Skip UDP header length (8 octets)
local function skip_udp_header(e, dst)
  next_skip(e, dst, ffi.sizeof('struct udp_t'))
end
ffi.metatype(ffi.typeof('struct udp_t'), {
  __index = {
    data = skip_udp_header,
  }
})
-- Constants
-- Exported as `M.c`, grouped by layer: `M.c.eth` holds EtherType values and
-- `M.c.ip` holds IP addresses, TOS bits, fragmentation flags, TTLs and
-- protocol numbers. All values are plain Lua numbers.
-- Some names intentionally share a value (tos_lowcost/tos_ect,
-- proto_ip/proto_hopopts, proto_raw/proto_reserved).
-- NOTE(review): the protocol list has no entry for number 45 and nothing
-- between 134 and 255.
M.c = {
eth = { -- Constants http://standards.ieee.org/regauth/ethertype
ip = 0x0800, -- IP (v4) protocol
ip6 = 0x86dd, -- IP (v6) protocol
arp = 0x0806, -- Address resolution protocol
revarp = 0x8035, -- Reverse addr resolution protocol
vlan = 0x8100, -- IEEE 802.1Q VLAN tagging
},
ip = {
-- Reserved Addresses
addr_any = 0x00000000, -- 0.0.0.0
addr_broadcast = 0xffffffff, -- 255.255.255.255
addr_loopback = 0x7f000001, -- 127.0.0.1
addr_mcast_all = 0xe0000001, -- 224.0.0.1
addr_mcast_local = 0xe00000ff, -- 224.0.0.255
-- Type of service (ip_tos), RFC 1349 ("obsoleted by RFC 2474")
tos_default = 0x00, -- default
tos_lowdelay = 0x10, -- low delay
tos_throughput = 0x08, -- high throughput
tos_reliability = 0x04, -- high reliability
tos_lowcost = 0x02, -- low monetary cost - XXX
tos_ect = 0x02, -- ECN-capable transport
tos_ce = 0x01, -- congestion experienced
-- Fragmentation flags (ip_off)
rf = 0x8000, -- reserved
df = 0x4000, -- don't fragment
mf = 0x2000, -- more fragments (not last frag)
offmask = 0x1fff, -- mask for fragment offset
-- Time-to-live (ip_ttl), seconds
ttl_default = 64, -- default ttl, RFC 1122, RFC 1340
ttl_max = 255, -- maximum ttl
-- Protocol (ip_p) - http://www.iana.org/assignments/protocol-numbers
proto_ip = 0, -- dummy for IP
proto_hopopts = 0, -- IPv6 hop-by-hop options
proto_icmp = 1, -- ICMP
proto_igmp = 2, -- IGMP
proto_ggp = 3, -- gateway-gateway protocol
proto_ipip = 4, -- IP in IP
proto_st = 5, -- ST datagram mode
proto_tcp = 6, -- TCP
proto_cbt = 7, -- CBT
proto_egp = 8, -- exterior gateway protocol
proto_igp = 9, -- interior gateway protocol
proto_bbnrcc = 10, -- BBN RCC monitoring
proto_nvp = 11, -- Network Voice Protocol
proto_pup = 12, -- PARC universal packet
proto_argus = 13, -- ARGUS
proto_emcon = 14, -- EMCON
proto_xnet = 15, -- Cross Net Debugger
proto_chaos = 16, -- Chaos
proto_udp = 17, -- UDP
proto_mux = 18, -- multiplexing
proto_dcnmeas = 19, -- DCN measurement
proto_hmp = 20, -- Host Monitoring Protocol
proto_prm = 21, -- Packet Radio Measurement
proto_idp = 22, -- Xerox NS IDP
proto_trunk1 = 23, -- Trunk-1
proto_trunk2 = 24, -- Trunk-2
proto_leaf1 = 25, -- Leaf-1
proto_leaf2 = 26, -- Leaf-2
proto_rdp = 27, -- "Reliable Datagram" proto
proto_irtp = 28, -- Inet Reliable Transaction
proto_tp = 29, -- ISO TP class 4
proto_netblt = 30, -- Bulk Data Transfer
proto_mfpnsp = 31, -- MFE Network Services
proto_meritinp= 32, -- Merit Internodal Protocol
proto_sep = 33, -- Sequential Exchange proto
proto_3pc = 34, -- Third Party Connect proto
proto_idpr = 35, -- Interdomain Policy Route
proto_xtp = 36, -- Xpress Transfer Protocol
proto_ddp = 37, -- Datagram Delivery Proto
proto_cmtp = 38, -- IDPR Ctrl Message Trans
proto_tppp = 39, -- TP++ Transport Protocol
proto_il = 40, -- IL Transport Protocol
proto_ip6 = 41, -- IPv6
proto_sdrp = 42, -- Source Demand Routing
proto_routing = 43, -- IPv6 routing header
proto_fragment= 44, -- IPv6 fragmentation header
proto_rsvp = 46, -- Reservation protocol
proto_gre = 47, -- General Routing Encap
proto_mhrp = 48, -- Mobile Host Routing
proto_ena = 49, -- ENA
proto_esp = 50, -- Encap Security Payload
proto_ah = 51, -- Authentication Header
proto_inlsp = 52, -- Integrated Net Layer Sec
proto_swipe = 53, -- SWIPE
proto_narp = 54, -- NBMA Address Resolution
proto_mobile = 55, -- Mobile IP, RFC 2004
proto_tlsp = 56, -- Transport Layer Security
proto_skip = 57, -- SKIP
proto_icmp6 = 58, -- ICMP for IPv6
proto_none = 59, -- IPv6 no next header
proto_dstopts = 60, -- IPv6 destination options
proto_anyhost = 61, -- any host internal proto
proto_cftp = 62, -- CFTP
proto_anynet = 63, -- any local network
proto_expak = 64, -- SATNET and Backroom EXPAK
proto_kryptolan = 65, -- Kryptolan
proto_rvd = 66, -- MIT Remote Virtual Disk
proto_ippc = 67, -- Inet Pluribus Packet Core
proto_distfs = 68, -- any distributed fs
proto_satmon = 69, -- SATNET Monitoring
proto_visa = 70, -- VISA Protocol
proto_ipcv = 71, -- Inet Packet Core Utility
proto_cpnx = 72, -- Comp Proto Net Executive
proto_cphb = 73, -- Comp Protocol Heart Beat
proto_wsn = 74, -- Wang Span Network
proto_pvp = 75, -- Packet Video Protocol
proto_brsatmon= 76, -- Backroom SATNET Monitor
proto_sunnd = 77, -- SUN ND Protocol
proto_wbmon = 78, -- WIDEBAND Monitoring
proto_wbexpak = 79, -- WIDEBAND EXPAK
proto_eon = 80, -- ISO CNLP
proto_vmtp = 81, -- Versatile Msg Transport
proto_svmtp = 82, -- Secure VMTP
proto_vines = 83, -- VINES
proto_ttp = 84, -- TTP
proto_nsfigp = 85, -- NSFNET-IGP
proto_dgp = 86, -- Dissimilar Gateway Proto
proto_tcf = 87, -- TCF
proto_eigrp = 88, -- EIGRP
proto_ospf = 89, -- Open Shortest Path First
proto_spriterpc= 90, -- Sprite RPC Protocol
proto_larp = 91, -- Locus Address Resolution
proto_mtp = 92, -- Multicast Transport Proto
proto_ax25 = 93, -- AX.25 Frames
proto_ipipencap= 94, -- yet-another IP encap
proto_micp = 95, -- Mobile Internet Ctrl
proto_sccsp = 96, -- Semaphore Comm Sec Proto
proto_etherip = 97, -- Ethernet in IPv4
proto_encap = 98, -- encapsulation header
proto_anyenc = 99, -- private encryption scheme
proto_gmtp = 100, -- GMTP
proto_ifmp = 101, -- Ipsilon Flow Mgmt Proto
proto_pnni = 102, -- PNNI over IP
proto_pim = 103, -- Protocol Indep Multicast
proto_aris = 104, -- ARIS
proto_scps = 105, -- SCPS
proto_qnx = 106, -- QNX
proto_an = 107, -- Active Networks
proto_ipcomp = 108, -- IP Payload Compression
proto_snp = 109, -- Sitara Networks Protocol
proto_compaqpeer= 110, -- Compaq Peer Protocol
proto_ipxip = 111, -- IPX in IP
proto_vrrp = 112, -- Virtual Router Redundancy
proto_pgm = 113, -- PGM Reliable Transport
proto_any0hop = 114, -- 0-hop protocol
proto_l2tp = 115, -- Layer 2 Tunneling Proto
proto_ddx = 116, -- D-II Data Exchange (DDX)
proto_iatp = 117, -- Interactive Agent Xfer
proto_stp = 118, -- Schedule Transfer Proto
proto_srp = 119, -- SpectraLink Radio Proto
proto_uti = 120, -- UTI
proto_smp = 121, -- Simple Message Protocol
proto_sm = 122, -- SM
proto_ptp = 123, -- Performance Transparency
proto_isis = 124, -- ISIS over IPv4
proto_fire = 125, -- FIRE
proto_crtp = 126, -- Combat Radio Transport
proto_crudp = 127, -- Combat Radio UDP
proto_sscopmce= 128, -- SSCOPMCE
proto_iplt = 129, -- IPLT
proto_sps = 130, -- Secure Packet Shield
proto_pipe = 131, -- Private IP Encap in IP
proto_sctp = 132, -- Stream Ctrl Transmission
proto_fc = 133, -- Fibre Channel
proto_rsvpign = 134, -- RSVP-E2E-IGNORE
proto_raw = 255, -- Raw IP packets
proto_reserved= 255, -- Reserved
},
}
return M