src/lua/bpf/bpf.lua - platform/external/bcc - Git at Google

 --[[
 Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ]]
 -- LuaJIT to BPF bytecode compiler.
 --
 -- The code generation phase is currently one-pass and produces:
 -- * Compiled code in BPF bytecode format (https://www.kernel.org/doc/Documentation/networking/filter.txt)
 -- * Variables with liveness analysis and other meta (spill information, compile-time value)
 --
 -- The code generator optimises as much as possible in single pass:
 -- * Fold compile-time expressions and constant propagation
 -- * Basic control flow analysis with dead code elimination (based on compile-time expressions)
 -- * Single-pass optimistic register allocation
 --
 -- The first pass doesn't have variable lifetime visibility yet, so it relies on rewriter for further
 -- optimisations such as:
 -- * Dead store elimination (first-pass doesn't know if/when the variable is going to be used)
 -- * Common sub-expression elimination (relies on DCE and liveness analysis)
 -- * Orphan JMP elimination (removing this in first pass would break previous JMP targets)
 -- * Better register allocation (needs to be recomputed after optimisations)

 local ffi = require('ffi')
 local bit = require('bit')
 local S = require('syscall')
 local bytecode = require('bpf.ljbytecode')
 local cdef = require('bpf.cdef')
 local proto = require('bpf.proto')
 local builtins = require('bpf.builtins')

 -- Constants
 local ALWAYS, NEVER = -1, -2
 local BPF = ffi.typeof('struct bpf')
 local HELPER = ffi.typeof('struct bpf_func_id')

 -- Symbolic table of constant expressions over numbers
 -- Keys are BPF operation mnemonics. ALU entries return the folded value,
 -- Jxx entries return a boolean telling whether the jump condition holds.
 -- The table is indexed with the same `op` string that is used to index BPF
 -- when code has to be emitted instead of folded.
 local const_expr = {
 	ADD = function (a, b) return a + b end,
 	SUB = function (a, b) return a - b end,
 	-- MUL must be present: MULVN/MULNV/MULVV fold through this table whenever
 	-- the operands are compile-time constants
 	MUL = function (a, b) return a * b end,
 	DIV = function (a, b) return a / b end,
 	MOD = function (a, b) return a % b end,
 	JEQ = function (a, b) return a == b end,
 	JNE = function (a, b) return a ~= b end,
 	JGE = function (a, b) return a >= b end,
 	JGT = function (a, b) return a > b end,
 }

 -- Maps an operand width in bytes (1/2/4/8) to the matching BPF size modifier.
 -- Any other width yields nil, which callers use to detect unsupported sizes.
 local const_width = {
 	[1] = BPF.B, [2] = BPF.H, [4] = BPF.W, [8] = BPF.DW,
 }

 -- Built-ins that are strict only (never compile-time expandable)
 -- Keyed by the function value itself; CALL consults this set before it
 -- attempts to expand a call with all-constant arguments at compile time.
 local builtins_strict = {
 	[ffi.new] = true,
 	[print]   = true,
 }

 -- Deep copy a table
 -- @param t  table to clone; nested tables are cloned recursively, every other
 --           value (including keys) is carried over as-is
 -- @return a new table that shares no nested table with `t`
 local function table_copy(t)
 	local result = {}
 	for key, value in pairs(t) do
 		if type(value) == 'table' then
 			result[key] = table_copy(value)
 		else
 			result[key] = value
 		end
 	end
 	return result
 end

 -- Return true if the constant part is a proxy
 -- @param x  constant part of a variable (any Lua value)
 -- @return false for non-tables; otherwise the first truthy value among the
 --         __dissector, __map and __base fields (nil/false when none is set)
 local function is_proxy(x)
 	if type(x) ~= 'table' then
 		return false
 	end
 	return x.__dissector or x.__map or x.__base
 end

 -- Create compiler closure
 local function create_emitter(env, stackslots, params, param_types)

 -- Per-slot variable records, indexed by variable slot number. The helpers
 -- below keep these fields on each record: reg, const, type, spill, shadow,
 -- source, live_from and live_to.
 local V = {}   -- Variable tracking / register allocator
 local code = { -- Generated code
 	pc = 0, bc_pc = 0, -- pc: index of the next BPF instruction written by emit(); bc_pc: presumably the current LuaJIT bytecode position - TODO confirm
 	insn = ffi.new('struct bpf_insn[4096]'), -- Fixed-size buffer holding the emitted instructions
 	fixup = {},
 	reachable = true,
 	seen_cmp = nil, -- Set by CMP_* to the pc of the last emitted compare, or to ALWAYS/NEVER when folded
 }
 local Vstate = {} -- Track variable layout at basic block exits

 -- Anything below this stack offset is free to use by caller
 -- @note: There is no tracking memory allocator, so the caller may
 -- lower it for persistent objects, but such memory will never
 -- be reclaimed and the caller is responsible for resetting stack
 -- top whenever the memory below is free to be reused
 -- The initial value skips (stackslots + 1) 8-byte slots, which is the area
 -- reg_spill() indexes with (var + 1) * 8.
 local stack_top = (stackslots + 1) * ffi.sizeof('uint64_t')

 -- Append one BPF instruction to the program buffer and advance code.pc.
 -- @param op   instruction opcode (sum of BPF class/mode/size constants)
 -- @param dst  destination register number
 -- @param src  source register number
 -- @param off  16-bit offset field
 -- @param imm  32-bit immediate field
 local function emit(op, dst, src, off, imm)
 	-- The buffer is a fixed-size FFI array and LuaJIT does not bounds-check
 	-- cdata indexing, so writing past the end would silently corrupt memory
 	local capacity = ffi.sizeof(code.insn) / ffi.sizeof('struct bpf_insn')
 	assert(code.pc < capacity, 'program too large: exceeded '..capacity..' BPF instructions')
 	local ins = code.insn[code.pc]
 	ins.code = op
 	ins.dst_reg = dst
 	ins.src_reg = src
 	ins.off = off
 	ins.imm = imm
 	code.pc = code.pc + 1
 end

 -- Store the register holding VAR into the variable's own stack slot and
 -- release the register.
 -- @param var  variable slot number; must currently have a register
 local function reg_spill(var)
 	local info = V[var]
 	assert(info.reg, 'attempt to spill VAR that doesn\'t have an allocated register')
 	-- Index by (variable number) * (register width)
 	local slot = (var + 1) * ffi.sizeof('uint64_t')
 	info.spill = slot
 	emit(BPF.MEM + BPF.STX + BPF.DW, 10, info.reg, -slot, 0)
 	info.reg = nil
 end

 -- Reload a spilled variable from its stack slot into the given register.
 -- @param var  variable slot number; must currently be spilled
 -- @param reg  register number that receives the value
 local function reg_fill(var, reg)
 	local info = V[var]
 	assert(reg, 'attempt to fill variable to register but not register is allocated')
 	assert(info.spill, 'attempt to fill register with a VAR that isn\'t spilled')
 	local slot = info.spill
 	emit(BPF.MEM + BPF.LDX + BPF.DW, reg, 10, -slot, 0)
 	-- The variable now lives in the register only
 	info.reg, info.spill = reg, nil
 end

 -- Allocate a register (lazy simple allocator)
 -- Note that this function only picks (and frees) a register; it does not
 -- record the assignment in V[var], that is left to the caller.
 -- @param var  variable slot the register is wanted for (only used in the
 --             failure message here)
 -- @param reg  optional register number; when given, the first non-shadow
 --             variable found holding it is spilled and `reg` is returned
 -- @return register number
 local function reg_alloc(var, reg)
 	-- Specific register requested, must spill/move existing variable
 	if reg then
 		for k,v in pairs(V) do -- Spill any variable that has this register
 			if v.reg == reg and not v.shadow then
 				reg_spill(k)
 				break
 			end
 		end
 		return reg
 	end
 	-- Find free or least recently used slot
 	-- `used` is a bitmask with bit N set when some variable holds register N;
 	-- `last` is the spill candidate (no live_to recorded, or the smallest one)
 	local last, last_seen, used = nil, 0xffff, 0
 	for k,v in pairs(V) do
 		if v.reg then
 			if not v.live_to or v.live_to < last_seen then
 				last, last_seen = k, v.live_to or last_seen
 			end
 			used = bit.bor(used, bit.lshift(1, v.reg))
 		end
 	end
 	-- Attempt to select a free register from R7-R9 (callee saved)
 	local free = bit.bnot(used)
 	if     bit.band(free, 0x80) ~= 0 then reg = 7
 	elseif bit.band(free,0x100) ~= 0 then reg = 8
 	elseif bit.band(free,0x200) ~= 0 then reg = 9
 	end
 	-- Select another variable to be spilled
 	if not reg then
 		assert(last)
 		-- Take over the victim's register; reg_spill() clears V[last].reg,
 		-- so it has to be read first
 		reg = V[last].reg
 		reg_spill(last)
 	end
 	assert(reg, 'VAR '..var..'fill/spill failed')
 	return reg
 end

 -- Set new variable
 -- Replaces the record of slot `var` with a fresh one.
 -- @param var    variable slot number
 -- @param reg    register holding the value, or nil
 -- @param const  compile-time value (number, string, cdata, proxy table), or nil
 -- @param vtype  optional ctype; derived from `const` when it is cdata
 local function vset(var, reg, const, vtype)
 	-- The slot is about to be overwritten, so every variable that still shadows
 	-- it has to take over the register and stop being a shadow.
 	-- Shadowing variable MUST share the same type and attributes,
 	-- but the register assignment may have changed
 	local previous = V[var]
 	if previous and previous.reg then
 		for _, other in pairs(V) do
 			if other.shadow == var then
 				other.reg = previous.reg
 				other.shadow = nil
 			end
 		end
 	end
 	-- Get precise type for CDATA
 	if type(const) == 'cdata' and not vtype then
 		vtype = ffi.typeof(const)
 	end
 	local entry = {reg=reg, const=const, type=vtype}
 	-- Track variable source (carried by table constants)
 	if type(const) == 'table' then
 		entry.source = const.source
 	end
 	V[var] = entry
 end

 -- Materialize (or register) a variable in a register
 -- If the register is nil, then a new register is assigned (if not already assigned)
 -- @param var      variable slot number; must already be registered in V
 -- @param reg      optional specific register to place the variable in
 -- @param reserve  when truthy, only the register is claimed and no load is emitted
 -- @param vtype    optional ctype recorded for the variable afterwards
 -- @return register number now assigned to the variable
 local function vreg(var, reg, reserve, vtype)
 	local vinfo = V[var]
 	assert(vinfo, 'VAR '..var..' not registered')
 	vinfo.live_to = code.pc-1
 	-- Fast path: already in a register of its own and no specific one requested
 	if (vinfo.reg and not reg) and not vinfo.shadow then return vinfo.reg end
 	reg = reg_alloc(var, reg)
 	-- Materialize variable shadow copy
 	-- Follow the shadow chain to the record that actually holds the value
 	local src = vinfo
 	while src.shadow do src = V[src.shadow] end
 	if reserve then -- luacheck: ignore
 		-- No load to register occurs
 	elseif src.reg then
 		emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, src.reg, 0, 0)
 	elseif src.spill then
 		vinfo.spill = src.spill
 		reg_fill(var, reg)
 	elseif src.const then
 		vtype = vtype or src.type
 		if type(src.const) == 'table' and src.const.__base then
 			-- Load pointer type
 			-- reg = R10 - __base, i.e. an address relative to the frame pointer
 			emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, 10, 0, 0)
 			emit(BPF.ALU64 + BPF.ADD + BPF.K, reg, 0, 0, -src.const.__base)
 		elseif type(src.const) == 'table' and src.const.__dissector then
 			-- Load dissector offset (imm32), but keep the constant part (dissector proxy)
 			emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const.off or 0)
 		elseif vtype and ffi.sizeof(vtype) == 8 then
 			-- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32))
 			emit(BPF.LD + BPF.DW, reg, 0, 0, ffi.cast('uint32_t', src.const))
 			-- High dword: shift right by 32 in two steps of 16
 			emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.rshift(bit.rshift(src.const, 16), 16)))
 			vinfo.const = nil -- The variable is live
 		else
 			emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const)
 			vinfo.const = nil -- The variable is live
 		end
 	else assert(false, 'VAR '..var..' has neither register nor constant value') end
 	-- The variable owns the register from here on and no longer shadows anything
 	vinfo.reg = reg
 	vinfo.shadow = nil
 	vinfo.live_from = code.pc-1
 	vinfo.type = vtype or vinfo.type
 	return reg
 end

 -- Copy variable
 -- Makes `dst` a shadow of `src`: the new record shares register, constant,
 -- source and type and remembers `src` in its `shadow` field. No code is emitted.
 -- @param dst  destination variable slot
 -- @param src  source variable slot; copying a slot onto itself is a no-op
 local function vcopy(dst, src)
 	if dst ~= src then
 		local from = V[src]
 		V[dst] = {
 			reg = from.reg,
 			const = from.const,
 			shadow = src,
 			source = from.source,
 			type = from.type,
 		}
 	end
 end

 -- Dereference variable of pointer type
 -- Emits a load of the pointed-to value, guarded by a NULL check unless the
 -- variable's source says it cannot be NULL.
 -- @param dst_reg  register receiving the loaded value
 -- @param src_reg  register holding the pointer
 -- @param vinfo    variable record; its constant must be a proxy with __dissector
 local function vderef(dst_reg, src_reg, vinfo)
 	-- Dereference map pointers for primitive types
 	-- BPF doesn't allow pointer arithmetics, so use the entry value
 	local proxy = vinfo.const
 	assert(type(proxy) == 'table' and proxy.__dissector, 'cannot dereference a non-pointer variable')
 	local vtype = proxy.__dissector
 	local width = const_width[ffi.sizeof(vtype)]
 	assert(width, 'NYI: sizeof('..tostring(vtype)..') not 1/2/4/8 bytes')
 	if dst_reg ~= src_reg then
 		emit(BPF.ALU64 + BPF.MOV + BPF.X, dst_reg, src_reg, 0, 0)    -- dst = src
 	end
 	-- Optimize the NULL check away if provably not NULL
 	local maybe_null = not vinfo.source or vinfo.source:find('_or_null', 1, true)
 	if maybe_null then
 		emit(BPF.JMP + BPF.JEQ + BPF.K, src_reg, 0, 1, 0)            -- if (src != NULL)
 	end
 	emit(BPF.MEM + BPF.LDX + width, dst_reg, src_reg, 0, 0)          --     dst = *src;
 end

 -- Allocate a space for variable
 -- Grows the tracked stack area by `size` bytes (rounded up to 8) and
 -- optionally emits code that initialises the new area.
 -- @param size   number of bytes to allocate
 -- @param blank  nil/false: leave memory untouched; true: zero-fill;
 --               string: initialise with the string's bytes (zero padded)
 -- @return the new stack top; the allocated area starts at [FP - returned value]
 local function valloc(size, blank)
 	local base = stack_top
 	-- Align to 8 byte boundary
 	local new_top = math.ceil((stack_top + size)/8)*8
 	-- A BPF program has 512 bytes of stack, anything below FP-512 is rejected
 	-- by the kernel verifier, so fail here with a meaningful message instead.
 	-- The check is done on the aligned value as that is what gets addressed.
 	assert(new_top <= 512, 'exceeded maximum stack size of 512 bytes')
 	stack_top = new_top
 	-- Current kernel version doesn't support ARG_PTR_TO_RAW_STACK
 	-- so we always need to have memory initialized, remove this when supported
 	if blank then
 		if type(blank) == 'string' then
 			local sp = 0
 			while sp < size do
 				-- TODO: no BPF_ST + BPF_DW instruction yet
 				-- Copy up to 4 bytes of the string into a zeroed u32 and store it
 				local as_u32 = ffi.new('uint32_t [1]')
 				local sub = blank:sub(sp+1, sp+ffi.sizeof(as_u32))
 				ffi.copy(as_u32, sub, #sub)
 				emit(BPF.MEM + BPF.ST + BPF.W, 10, 0, -(stack_top-sp), as_u32[0])
 				sp = sp + ffi.sizeof(as_u32)
 			end
 		elseif type(blank) == 'boolean' then
 			-- Zero R0 and store it over every 8 byte slot of the new area
 			reg_alloc(stackslots, 0)
 			emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0)
 			for sp = base+8,stack_top,8 do
 				emit(BPF.MEM + BPF.STX + BPF.DW, 10, 0, -sp, 0)
 			end
 		else error('NYI: will with unknown type '..type(blank)) end
 	end
 	return stack_top
 end

 -- Turn variable into scalar in register (or constant)
 -- @param a  variable slot number
 -- @param w  width of the scalar in bytes (1/2/4/8)
 -- @return register holding the scalar and nil, or nil and the immediate value
 --         when the variable is a numeric constant narrower than 8 bytes
 local function vscalar(a, w)
 	assert(const_width[w], 'sizeof(scalar variable) must be 1/2/4/8')
 	local src_reg
 	local const = V[a].const
 	-- If source is a pointer, we must dereference it first
 	if cdef.isptr(V[a].type) then
 		src_reg = vreg(a)
 		local tmp_reg = reg_alloc(stackslots, 1) -- Clone variable in tmp register
 		emit(BPF.ALU64 + BPF.MOV + BPF.X, tmp_reg, src_reg, 0, 0)
 		vderef(tmp_reg, tmp_reg, V[a])
 		src_reg = tmp_reg -- Materialize and dereference it
 	-- Source is a value on stack, we must load it first
 	-- Table constants without __base (dissector or map proxies) must not reach
 	-- the comparison with nil, treat a missing __base as "not on stack"
 	elseif type(const) == 'table' and (const.__base or 0) > 0 then
 		src_reg = vreg(a)
 		emit(BPF.MEM + BPF.LDX + const_width[w], src_reg, 10, -V[a].const.__base, 0)
 		V[a].type = V[a].const.__dissector
 		V[a].const = nil -- Value is dereferenced
 	-- If source is an imm32 number, avoid register load
 	elseif type(const) == 'number' and w < 8 then
 		return nil, const
 	-- Load variable from any other source
 	else
 		src_reg = vreg(a)
 	end

 	return src_reg, nil
 end

 -- Emit compensation code at the end of basic block to unify variable set layout on all block exits
 -- 1. we need to free registers by spilling
 -- 2. fill registers to match other exits from this BB
 -- @param Vcomp  variable layout to match, indexed by variable slot like V;
 --               only its `spill`, `reg` and `source` fields are read here
 local function bb_end(Vcomp)
 	-- Pass 1: spill everything the other layout keeps on the stack
 	for i,v in pairs(V) do
 		if Vcomp[i] and Vcomp[i].spill and not v.spill then
 			-- Materialize constant or shadowing variable to be able to spill
 			if not v.reg and (v.shadow or cdef.isimmconst(v)) then
 				vreg(i)
 			end
 			reg_spill(i)
 		end
 	end
 	-- Pass 2: load variables the other layout keeps in a register, into that
 	-- very register
 	for i,v in pairs(V) do
 		if Vcomp[i] and Vcomp[i].reg and not v.reg then
 			vreg(i, Vcomp[i].reg)
 		end
 		-- Compensate variable metadata change
 		if Vcomp[i] and Vcomp[i].source then
 			V[i].source = Vcomp[i].source
 		end
 	end
 end

 -- Emit a comparison of stack memory referenced by VAR(a) against the
 -- compile-time string `b`, ending with a conditional jump placeholder.
 -- @param a   variable slot whose constant carries the stack offset (__base)
 -- @param b   Lua string to compare against
 -- @param op  'JEQ' or 'JNE'
 local function CMP_STR(a, b, op)
 	assert(op == 'JEQ' or op == 'JNE', 'NYI: only equivallence stack/string only supports == or ~=')
 	-- I have no better idea how to implement it than unrolled XOR loop, as we can fixup only one JMP
 	-- So: X(a,b) = a[0] ^ b[0] | a[1] ^ b[1] | ...
 	--     EQ(a,b) <=> X == 0
 	-- This could be optimised by placing early exits by rewriter in second phase for long strings
 	-- Only the shorter of the string and the variable's type is compared
 	local base, size = V[a].const.__base, math.min(#b, ffi.sizeof(V[a].type))
 	-- R0 accumulates the differences, R1 holds the chunk being compared
 	local acc, tmp = reg_alloc(stackslots, 0), reg_alloc(stackslots+1, 1)
 	local sp = 0
 	emit(BPF.ALU64 + BPF.MOV + BPF.K, acc, 0, 0, 0)
 	while sp < size do
 		-- Load string chunk as imm32
 		local as_u32 = ffi.new('uint32_t [1]')
 		local sub = b:sub(sp+1, sp+ffi.sizeof(as_u32))
 		ffi.copy(as_u32, sub, #sub)
 		-- TODO: make this faster by interleaved load/compare steps with DW length
 		emit(BPF.MEM + BPF.LDX + BPF.W, tmp, 10, -(base-sp), 0)
 		emit(BPF.ALU64 + BPF.XOR + BPF.K, tmp, 0, 0, as_u32[0])
 		emit(BPF.ALU64 + BPF.OR + BPF.X, acc, tmp, 0, 0)
 		sp = sp + ffi.sizeof(as_u32)
 	end
 	-- 0xffff is a placeholder jump offset; the compare position is remembered
 	-- in code.seen_cmp
 	emit(BPF.JMP + BPF[op] + BPF.K, acc, 0, 0xffff, 0)
 	code.seen_cmp = code.pc-1
 end

 -- Compare VAR(a) with VAR(b). The outcome is recorded in code.seen_cmp: either
 -- ALWAYS/NEVER when folded, or the pc of the emitted conditional jump.
 -- @param a   left-hand variable slot
 -- @param b   right-hand variable slot
 -- @param op  jump mnemonic ('JEQ', 'JNE', 'JGE', 'JGT')
 local function CMP_REG(a, b, op)
 	local lhs, rhs = V[a].const, V[b].const
 	-- Fold compile-time expressions
 	if lhs and rhs and not (is_proxy(lhs) or is_proxy(rhs)) then
 		code.seen_cmp = const_expr[op](lhs, rhs) and ALWAYS or NEVER
 		return
 	end
 	-- Comparison against compile-time string or stack memory
 	if rhs and type(rhs) == 'string' then
 		return CMP_STR(a, rhs, op)
 	end
 	-- The 0xFFFF target here has no significance, it's just a placeholder for
 	-- compiler to replace it's absolute offset to LJ bytecode insn with a relative
 	-- offset in BPF program code, verifier will accept only programs with valid JMP targets
 	local a_reg, b_reg = vreg(a), vreg(b)
 	emit(BPF.JMP + BPF[op] + BPF.X, a_reg, b_reg, 0xffff, 0)
 	code.seen_cmp = code.pc-1
 end

 -- Compare VAR(a) with the immediate `b`. The outcome is recorded in
 -- code.seen_cmp: ALWAYS/NEVER when folded, else the pc of the emitted jump.
 -- @param a   variable slot
 -- @param b   immediate operand: number, or a string that is converted first
 -- @param op  jump mnemonic ('JEQ' or 'JNE' for all callers visible here)
 local function CMP_IMM(a, b, op)
 	local c = V[a].const
 	-- Fold compile-time expressions
 	if c and not is_proxy(c) then
 		code.seen_cmp = const_expr[op](c, b) and ALWAYS or NEVER
 		return
 	end
 	-- Convert imm32 to number
 	if type(b) == 'string' then
 		local len = #b
 		if len == 1 then
 			b = b:byte()
 		elseif cdef.isptr(V[a].type) then
 			-- String comparison between stack/constant string
 			return CMP_STR(a, b, op)
 		elseif len <= 4 then
 			-- Convert to u32 with network byte order
 			local imm = ffi.new('uint32_t[1]')
 			ffi.copy(imm, b, len)
 			b = builtins.hton(imm[0])
 		else
 			error('NYI: compare register with string, where #string > sizeof(u32)')
 		end
 	end
 	-- The 0xFFFF target here has no significance, it's just a placeholder for
 	-- compiler to replace it's absolute offset to LJ bytecode insn with a relative
 	-- offset in BPF program code, verifier will accept only programs with valid JMP targets
 	local reg = vreg(a)
 	emit(BPF.JMP + BPF[op] + BPF.K, reg, 0, 0xffff, b)
 	code.seen_cmp = code.pc-1
 	-- Remember NULL pointer checks as BPF prohibits pointer comparisons
 	-- and repeated checks wouldn't pass the verifier, only comparisons
 	-- against constants are checked.
 	local is_eq_or_ne = (op == 'JEQ' or op == 'JNE')
 	if is_eq_or_ne and tonumber(b) == 0 and V[a].source then
 		if V[a].source:find('_or_null', 1, true) then
 			code.seen_null_guard = a
 			-- Inverse NULL pointer check (if a ~= nil)
 			if op == 'JNE' then
 				code.seen_null_guard_inverse = true
 			end
 		end
 	end
 end

 -- Compute DST = VAR(a) op IMM(b), folding when VAR(a) is a plain constant.
 -- @param dst  destination variable slot
 -- @param a    left-hand variable slot
 -- @param b    immediate right-hand operand
 -- @param op   ALU mnemonic used to index both const_expr and BPF
 local function ALU_IMM(dst, a, b, op)
 	local lhs = V[a].const
 	-- Fold compile-time expressions
 	if lhs and not is_proxy(lhs) then
 		assert(cdef.isimmconst(V[a]), 'VAR '..a..' must be numeric')
 		vset(dst, nil, const_expr[op](lhs, b))
 		return
 	end
 	-- Now we need to materialize dissected value at DST, and add it
 	vcopy(dst, a)
 	local dst_reg = vreg(dst)
 	local result_type
 	if cdef.isptr(V[a].type) then
 		-- Operate on the pointed-to value, the result has the dissector's type
 		vderef(dst_reg, dst_reg, V[a])
 		result_type = V[a].const.__dissector
 	else
 		result_type = V[a].type
 	end
 	V[dst].type = result_type
 	emit(BPF.ALU64 + BPF[op] + BPF.K, dst_reg, 0, 0, b)
 end

 -- Compute DST = VAR(a) op VAR(b), folding when both operands are plain
 -- compile-time constants and emitting an ALU64 instruction otherwise.
 -- @param dst  destination variable slot
 -- @param a    left-hand variable slot
 -- @param b    right-hand variable slot, or nil/false for unary operations
 -- @param op   ALU mnemonic (indexes const_expr and BPF); when folding, a
 --             function taking both constants is accepted as well
 local function ALU_REG(dst, a, b, op)
 	-- Fold compile-time expressions. Folding needs BOTH operands to be known:
 	-- a constant LHS with a register RHS (e.g. IMM op VAR from ALU_IMM_NV) or
 	-- a missing RHS (unary operation) must take the code generation path.
 	local a_const = V[a].const
 	local b_const = b and V[b].const
 	if a_const and b_const and not (is_proxy(a_const) or is_proxy(b_const)) then
 		assert(cdef.isimmconst(V[a]), 'VAR '..a..' must be numeric')
 		assert(cdef.isimmconst(V[b]), 'VAR '..b..' must be numeric')
 		if type(op) == 'string' then op = const_expr[op] end
 		vcopy(dst, a)
 		V[dst].const = op(V[a].const, V[b].const)
 	else
 		local src_reg = b and vreg(b) or 0 -- SRC is optional for unary operations
 		if b and cdef.isptr(V[b].type) then
 			-- We have to allocate a temporary register for dereferencing to preserve
 			-- pointer in source variable that MUST NOT be altered
 			reg_alloc(stackslots, 2)
 			vderef(2, src_reg, V[b])
 			src_reg = 2
 		end
 		vcopy(dst, a) -- DST may alias B, so copy must occur after we materialize B
 		local dst_reg = vreg(dst)
 		if cdef.isptr(V[a].type) then
 			vderef(dst_reg, dst_reg, V[a])
 			V[dst].type = V[a].const.__dissector
 		end
 		emit(BPF.ALU64 + BPF[op] + BPF.X, dst_reg, src_reg, 0, 0)
 		V[stackslots].reg = nil  -- Free temporary registers
 	end
 end

 -- Compute DST = IMM(a) op VAR(b) for non-commutative operations.
 -- @param dst  destination variable slot
 -- @param a    immediate left-hand operand
 -- @param b    right-hand variable slot
 -- @param op   ALU mnemonic passed through to ALU_REG
 local function ALU_IMM_NV(dst, a, b, op)
 	-- Do DST = IMM(a) op VAR(b) where we can't invert because
 	-- the registers are u64 but immediates are u32, so complement
 	-- arithmetics wouldn't work
 	-- The immediate is parked in the temporary slot (stackslots+1) as a constant
 	vset(stackslots+1, nil, a)
 	ALU_REG(dst, stackslots+1, b, op)
 end

 -- Emit an absolute load (BPF_LD + BPF_ABS) of `w` bytes at offset `off`
 -- into VAR(dst), which ends up in R0.
 -- @param dst  destination variable slot
 -- @param w    width in bytes (1/2/4/8)
 -- @param off  absolute offset, required
 local function LD_ABS(dst, w, off)
 	assert(off, 'LD_ABS called without offset')
 	if w < 8 then
 		local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0
 		emit(BPF.LD + BPF.ABS + const_width[w], dst_reg, 0, 0, off)
 		if w > 1 and ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse
 			emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, w * 8)
 		end
 	elseif w == 8 then
 		-- LD_ABS|IND prohibits DW, we need to do two W loads and combine them
 		-- First load: the word at off + 4 goes to the temporary slot and is
 		-- shifted left by 32
 		local tmp_reg = vreg(stackslots, 0, true, builtins.width_type(w)) -- Reserve R0
 		emit(BPF.LD + BPF.ABS + const_width[4], tmp_reg, 0, 0, off + 4)
 		if ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse
 			emit(BPF.ALU + BPF.END + BPF.TO_BE, tmp_reg, 0, 0, 32)
 		end
 		ALU_IMM(stackslots, stackslots, 32, 'LSH')
 		-- Second load: the word at off goes to DST and is OR-ed with the first
 		local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0, spill tmp variable
 		emit(BPF.LD + BPF.ABS + const_width[4], dst_reg, 0, 0, off)
 		if ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse
 			emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, 32)
 		end
 		ALU_REG(dst, dst, stackslots, 'OR')
 		V[stackslots].reg = nil -- Free temporary registers
 	else
 		-- Only reached for w > 8, so this assertion always fails with the message
 		assert(w < 8, 'NYI: only LD_ABS of 1/2/4/8 is supported')
 	end
 end

 -- Emit an indirect load (BPF_LD + BPF_IND) of `w` bytes, addressed by the
 -- value of VAR(src) plus `off`, into VAR(dst) which is placed in R0.
 -- @param dst  destination variable slot
 -- @param src  variable slot providing the index register
 -- @param w    width in bytes
 -- @param off  optional immediate offset, defaults to 0
 local function LD_IND(dst, src, w, off)
 	-- Must materialize first in case dst == src
 	local index_reg = vreg(src)
 	-- Reserve R0
 	local out_reg = vreg(dst, 0, true, builtins.width_type(w))
 	emit(BPF.LD + BPF.IND + const_width[w], out_reg, index_reg, 0, off or 0)
 	-- LD_ABS has htonl() semantics, reverse
 	local needs_swap = w > 1 and ffi.abi('le')
 	if needs_swap then
 		emit(BPF.ALU + BPF.END + BPF.TO_BE, out_reg, 0, 0, w * 8)
 	end
 end

 -- Emit a direct memory load (BPF_MEM + BPF_LDX): VAR(dst) = *(VAR(src) + off).
 -- @param dst  destination variable slot
 -- @param src  variable slot holding the base address
 -- @param w    width in bytes
 -- @param off  optional offset, defaults to 0
 local function LD_MEM(dst, src, w, off)
 	local src_reg = vreg(src) -- Must materialize first in case dst == src
 	local dst_reg = vreg(dst, nil, true, builtins.width_type(w)) -- No specific register requested; claim one without emitting a load
 	emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, src_reg, off or 0, 0)
 end

 -- @note: This is specific now as it expects registers reserved
 -- Emit a load of an immediate into a register that is already reserved.
 -- @param dst_reg   destination register number
 -- @param src_type  value for the src_reg field (e.g. BPF.PSEUDO_MAP_FD)
 -- @param imm       immediate value
 -- @param w         width in bytes; 8 selects the two-instruction IMM64 form
 local function LD_IMM_X(dst_reg, src_type, imm, w)
 	if w == 8 then -- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32))
 		emit(BPF.LD + const_width[w], dst_reg, src_type, 0, ffi.cast('uint32_t', imm))
 		-- The second instruction carries the HIGH 32 bits, i.e. imm >> 32.
 		-- Must shift in two steps as bit.rshift supports [0..31]
 		emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.rshift(bit.rshift(imm, 16), 16)))
 	else
 		emit(BPF.LD + const_width[w], dst_reg, src_type, 0, imm)
 	end
 end

 -- Invoke a builtin code generator, handing it the compiler primitives as its
 -- first argument followed by the caller's arguments.
 -- @param func  builtin implementation, called as func(export, ...)
 -- @param ...   arguments forwarded to the builtin
 local function BUILTIN(func, ...)
 	local export = {
 		-- Compiler primitives (work with variable slots, emit instructions)
 		V = V,
 		vreg = vreg,
 		vset = vset,
 		vcopy = vcopy,
 		vderef = vderef,
 		valloc = valloc,
 		emit = emit,
 		reg_alloc = reg_alloc,
 		reg_spill = reg_spill,
 		tmpvar = stackslots,
 		const_width = const_width,
 		-- Extensions and helpers (use with care)
 		LD_IMM_X = LD_IMM_X,
 	}
 	func(export, ...)
 end

 -- Load a value through a pointer-like variable, picking the access method
 -- from the variable's `source` tag.
 -- @param dst    destination variable slot
 -- @param src    source variable slot; needs a dissector proxy and a source
 -- @param off    offset of the value
 -- @param vtype  optional ctype to load; defaults to the dissector's type
 local function LOAD(dst, src, off, vtype)
 	local base = V[src].const
 	assert(base and base.__dissector, 'NYI: load() on variable that doesn\'t have dissector')
 	assert(V[src].source, 'NYI: load() on variable with unknown source')
 	-- Cast to different type if requested
 	vtype = vtype or base.__dissector
 	local w = ffi.sizeof(vtype)
 	assert(const_width[w], 'NYI: load() supports 1/2/4/8 bytes at a time only, wanted ' .. tostring(w))
 	local origin = V[src].source
 	local function comes_from(tag)
 		return origin:find(tag, 1, true)
 	end
 	if comes_from('ptr_to_pkt') then
 		-- Packet access with a dissector (use BPF_LD)
 		if base.off then -- Absolute address to payload
 			LD_ABS(dst, w, off + base.off)
 		else -- Indirect address to payload
 			LD_IND(dst, src, w, off)
 		end
 	elseif comes_from('ptr_to_ctx') or comes_from('ptr_to_skb') or comes_from('ptr_to_map_value') then
 		-- Direct access (use BPF_MEM): first argument (skb fields, pt regs, ...),
 		-- skb with a dissector, or map-backed memory
 		LD_MEM(dst, src, w, off)
 	elseif comes_from('ptr_to_probe') then
 		-- Indirect read using probe (uprobe or kprobe, uses helper)
 		BUILTIN(builtins[builtins.probe_read], nil, dst, src, vtype, off)
 		V[dst].source = V[src].source -- Builtin handles everything
 	else
 		error('NYI: load() on variable from ' .. V[src].source)
 	end
 	V[dst].type = vtype
 	V[dst].const = nil -- Dissected value is not constant anymore
 end

 -- Translate a call: VAR(a) holds the callee and the slots after it hold the
 -- arguments. The call is emitted through a builtin or ALU helper, turned into
 -- a dissector slice load, or evaluated at compile time.
 -- @param a  slot of the callee; also receives the result
 -- @param b  b-1 is the number of return values (at most one is supported)
 -- @param d  d-1 is the number of arguments
 local function CALL(a, b, d)
 	assert(b-1 <= 1, 'NYI: CALL with >1 return values')
 	-- Perform either compile-time, helper, or builtin
 	local func = V[a].const
 	-- Gather all arguments and check if they're constant
 	local args, const, nargs = {}, true, d - 1
 	for i = a+1, a+d-1 do
 		table.insert(args, V[i].const)
 		if not V[i].const or is_proxy(V[i].const) then const = false end
 	end
 	local builtin = builtins[func]
 	if not const or nargs == 0 then
 		if builtin and type(builtin) == 'function' then
 			-- Builtins receive variable slot numbers, not values
 			args = {a}
 			for i = a+1, a+nargs do table.insert(args, i) end
 			BUILTIN(builtin, unpack(args))
 		elseif V[a+2] and V[a+2].const then -- var OP imm
 			ALU_IMM(a, a+1, V[a+2].const, builtin)
 		elseif nargs <= 2 then              -- var OP var
 			ALU_REG(a, a+1, V[a+2] and a+2, builtin)
 		else
 			error('NYI: CALL non-builtin with 3 or more arguments')
 		end
 	-- Call on dissector implies slice retrieval
 	elseif type(func) == 'table' and func.__dissector then
 		assert(nargs >= 2, 'NYI: <dissector>.slice(a, b) must have at least two arguments')
 		assert(V[a+1].const and V[a+2].const, 'NYI: slice() arguments must be constant')
 		local off = V[a+1].const
 		-- The slice length (second argument minus first) selects the load type
 		local vtype = builtins.width_type(V[a+2].const - off)
 		-- Access to packet via packet (use BPF_LD)
 		if V[a].source and V[a].source:find('ptr_to_', 1, true) then
 			LOAD(a, a, off, vtype)
 		else
 			error('NYI: <dissector>.slice(a, b) on non-pointer memory ' .. (V[a].source or 'unknown'))
 		end
 	-- Strict builtins cannot be expanded on compile-time
 	elseif builtins_strict[func] and builtin then
 		args = {a}
 		for i = a+1, a+nargs do table.insert(args, i) end
 		BUILTIN(builtin, unpack(args))
 	-- Attempt compile-time call expansion (expects all argument compile-time known)
 	else
 		assert(const, 'NYI: CALL attempted on constant arguments, but at least one argument is not constant')
 		-- The result of the real function call becomes the slot's constant
 		V[a].const = func(unpack(args))
 	end
 end

 -- Set up the common arguments of the map helpers: R1 = map (loaded from its
 -- fd) and R2 = pointer to the key on the stack.
 -- @param map_var  variable slot whose constant is the map object
 -- @param key      variable slot of the key, or nil when `imm` is given
 -- @param imm      immediate key value, or nil to take the key from VAR(key)
 local function MAP_INIT(map_var, key, imm)
 	local map = V[map_var].const
 	vreg(map_var, 1, true, ffi.typeof('uint64_t'))
 	-- Reserve R1 and load ptr for process-local map fd
 	LD_IMM_X(1, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof(V[map_var].type))
 	V[map_var].reg = nil -- R1 will be invalidated after CALL, forget register allocation
 	-- Reserve R2 and load R2 = key pointer
 	local key_size = ffi.sizeof(map.key_type)
 	local w = const_width[key_size] or BPF.DW
 	-- nil when the key is not 1/2/4/8 bytes wide, i.e. cannot be moved with a single store
 	local pod_type = const_width[key_size]
 	local sp = stack_top + key_size -- Must use stack below spill slots
 	-- Store immediate value on stack
 	reg_alloc(stackslots, 2) -- Spill anything in R2 (unnamed tmp variable)
 	local key_base = key and V[key].const
 	imm = imm or key_base
 	if imm and (not key or not is_proxy(key_base)) then
 		assert(pod_type, 'NYI: map[const K], K width must be 1/2/4/8')
 		emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, imm)
 	-- Key is in register, spill it
 	elseif V[key].reg and pod_type then
 		if cdef.isptr(V[key].type) then
 			-- There is already pointer in register, dereference before spilling
 			emit(BPF.MEM + BPF.LDX + w, 2, V[key].reg, 0, 0)
 			emit(BPF.MEM + BPF.STX + w, 10, 2, -sp, 0)
 		else -- Variable in register is POD, spill it on the stack
 			emit(BPF.MEM + BPF.STX + w, 10, V[key].reg, -sp, 0)
 		end
 	-- Key is spilled from register to stack
 	elseif V[key].spill then
 		sp = V[key].spill
 	-- Key is already on stack, write to base-relative address
 	elseif key_base.__base then
 		assert(key_size == ffi.sizeof(V[key].type), 'VAR '..key..' type incompatible with BPF map key type')
 		sp = key_base.__base
 	else
 		error('VAR '..key..' is neither const-expr/register/stack/spilled')
 	end
 	-- If [FP+K] addressing, emit it
 	-- R2 = R10 - sp (sp is always set on the paths that reach this point)
 	if sp then
 		emit(BPF.ALU64 + BPF.MOV + BPF.X, 2, 10, 0, 0)
 		emit(BPF.ALU64 + BPF.ADD + BPF.K, 2, 0, 0, -sp)
 	end
 end

 -- Emit a map lookup; VAR(dst) becomes a possibly-NULL pointer to the map
 -- value, held in R0.
 -- @param dst      destination variable slot
 -- @param map_var  variable slot whose constant is the map object
 -- @param key      variable slot of the key, or nil when `imm` is given
 -- @param imm      immediate key value, or nil
 local function MAP_GET(dst, map_var, key, imm)
 	local map = V[map_var].const
 	MAP_INIT(map_var, key, imm)
 	-- Flag as pointer type and associate dissector for map value type
 	vreg(dst, 0, true, ffi.typeof('uint8_t *'))
 	V[dst].const = {__dissector=map.val_type}
 	-- The '_or_null' suffix is what vderef() and CMP_IMM() look for
 	V[dst].source = 'ptr_to_map_value_or_null'
 	emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_lookup_elem)
 	V[stackslots].reg = nil -- Free temporary registers
 end

 -- Emit a map element deletion.
 -- @param map_var  variable slot whose constant is the map object
 -- @param key      variable slot of the key, or nil when `key_imm` is given
 -- @param key_imm  immediate key value, or nil
 local function MAP_DEL(map_var, key, key_imm)
 	-- Set R0, R1 (map fd, preempt R0)
 	reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable)
 	MAP_INIT(map_var, key, key_imm)
 	emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_delete_elem)
 	V[stackslots].reg = nil -- Free temporary registers
 end

 -- Emit a map element update (or a deletion when the value is nil).
 -- Arguments are set up as R1 = map, R2 = key pointer, R3 = value pointer,
 -- R4 = flags (0, BPF_ANY).
 -- @param map_var  variable slot whose constant is the map object
 -- @param key      variable slot of the key, or nil when `key_imm` is given
 -- @param key_imm  immediate key value, or nil
 -- @param src      variable slot of the value to store
 local function MAP_SET(map_var, key, key_imm, src)
 	local map = V[map_var].const
 	-- Delete when setting nil
 	if V[src].type == ffi.typeof('void') then
 		return MAP_DEL(map_var, key, key_imm)
 	end
 	-- Set R0, R1 (map fd, preempt R0)
 	reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable)
 	MAP_INIT(map_var, key, key_imm)
 	reg_alloc(stackslots, 4) -- Spill anything in R4 (unnamed tmp variable)
 	emit(BPF.ALU64 + BPF.MOV + BPF.K, 4, 0, 0, 0) -- BPF_ANY, create new element or update existing
 	-- Reserve R3 for value pointer
 	reg_alloc(stackslots, 3) -- Spill anything in R3 (unnamed tmp variable)
 	local val_size = ffi.sizeof(map.val_type)
 	local w = const_width[val_size] or BPF.DW
 	local pod_type = const_width[val_size]
 	-- Stack pointer must be aligned to both key/value size and have enough headroom for (key, value)
 	local sp = stack_top + ffi.sizeof(map.key_type) + val_size
 	-- Round up to the next multiple of val_size. Adding (sp % val_size) is not
 	-- enough, e.g. a 1 byte key with a 4 byte value would end up at 8k+6.
 	sp = sp + (val_size - sp % val_size) % val_size
 	local base = V[src].const
 	if base and not is_proxy(base) then
 		assert(pod_type, 'NYI: MAP[K] = imm V; V width must be 1/2/4/8')
 		emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, base)
 	-- Value is in register, spill it
 	elseif V[src].reg and pod_type then
 		-- Value is a pointer, dereference it and spill it
 		if cdef.isptr(V[src].type) then
 			vderef(3, V[src].reg, V[src])
 			emit(BPF.MEM + BPF.STX + w, 10, 3, -sp, 0)
 		else
 			emit(BPF.MEM + BPF.STX + w, 10, V[src].reg, -sp, 0)
 		end
 	-- We get a pointer to spilled register on stack
 	elseif V[src].spill then
 		-- If variable is a pointer, we can load it to R3 directly (save "LEA")
 		if cdef.isptr(V[src].type) then
 			reg_fill(src, 3)
 			-- If variable is a stack pointer, we don't have to check it
 			if base.__base then
 				emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem)
 				return
 			end
 			vderef(3, V[src].reg, V[src])
 			emit(BPF.MEM + BPF.STX + w, 10, 3, -sp, 0)
 		else
 			sp = V[src].spill
 		end
 	-- Value is already on stack, write to base-relative address
 	elseif base.__base then
 		if val_size ~= ffi.sizeof(V[src].type) then
 			local err = string.format('VAR %d type (%s) incompatible with BPF map value type (%s): expected %d, got %d',
 				src, V[src].type, map.val_type, val_size, ffi.sizeof(V[src].type))
 			error(err)
 		end
 		sp = base.__base
 	-- None of the supported value locations applies
 	else
 		error('VAR '.. src ..' is neither const-expr/register/stack/spilled')
 	end
 	-- R3 = R10 - sp, pointer to the value on stack
 	emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, 10, 0, 0)
 	emit(BPF.ALU64 + BPF.ADD + BPF.K, 3, 0, 0, -sp)
 	emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem)
 	V[stackslots].reg = nil -- Free temporary registers
 end

 -- Finally - this table translates LuaJIT bytecode into code emitter actions.
 local BC = {
 	-- Constants
 	KNUM = function(a, _, c, _) -- KNUM
 		if c < 2147483648 then
 			vset(a, nil, c, ffi.typeof('int32_t'))
 		else
 			vset(a, nil, c, ffi.typeof('uint64_t'))
 		end
 	end,
 	KSHORT = function(a, _, _, d) -- KSHORT
 		vset(a, nil, d, ffi.typeof('int16_t'))
 	end,
 	KCDATA = function(a, _, c, _) -- KCDATA
 		-- Coerce numeric types if possible
 		local ct = ffi.typeof(c)
 		if ffi.istype(ct, ffi.typeof('uint64_t')) or ffi.istype(ct, ffi.typeof('int64_t')) then
 			vset(a, nil, c, ct)
 		elseif tonumber(c) ~= nil then
 			-- TODO: this should not be possible
 			vset(a, nil, tonumber(c), ct)
 		else
 			error('NYI: cannot use CDATA constant of type ' .. ct)
 		end
 	end,
 	KPRI = function(a, _, _, d) -- KPRI
 		-- KNIL is 0, must create a special type to identify it
 		local vtype = (d < 1) and ffi.typeof('void') or ffi.typeof('uint8_t')
 		vset(a, nil, (d < 2) and 0 or 1, vtype)
 	end,
 	KSTR = function(a, _, c, _) -- KSTR
 		vset(a, nil, c, ffi.typeof('const char[?]'))
 	end,
 	MOV = function(a, _, _, d) -- MOV var, var
 		vcopy(a, d)
 	end,

 	-- Comparison ops
 	-- Note: comparisons are always followed by JMP opcode, that
 	--       will fuse following JMP to JMP+CMP instruction in BPF
 	-- Note:  we're narrowed to integers, so operand/operator inversion is legit
 	ISLT = function(a, _, _, d) return CMP_REG(d, a, 'JGE') end, -- (a < d) (inverted)
 	ISGE = function(a, _, _, d) return CMP_REG(a, d, 'JGE') end, -- (a >= d)
 	ISGT = function(a, _, _, d) return CMP_REG(a, d, 'JGT') end, -- (a > d)
 	ISEQV = function(a, _, _, d) return CMP_REG(a, d, 'JEQ') end, -- (a == d)
 	ISNEV = function(a, _, _, d) return CMP_REG(a, d, 'JNE') end, -- (a ~= d)
 	ISEQS = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == str(c))
 	ISNES = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= str(c))
 	ISEQN = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == c)
 	ISNEN = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= c)
 	IST = function(_, _, _, d) return CMP_IMM(d, 0, 'JNE') end, -- (d)
 	ISF = function(_, _, _, d) return CMP_IMM(d, 0, 'JEQ') end, -- (not d)
 	ISEQP = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- ISEQP (a == c)
 	-- Binary operations with RHS constants
 	-- In all arithmetic handlers `a` is the destination slot and `b` a variable
 	-- operand slot; `c` is the decoded constant and `d` the second variable slot.
 	-- A = B op C (constant on the right-hand side)
 	ADDVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end,
 	SUBVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'SUB') end,
 	MULVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end,
 	DIVVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'DIV') end,
 	MODVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MOD') end,
 	-- Binary operations with LHS constants
 	-- Cheat code: we're narrowed to integer arithmetic, so MUL+ADD are commutative
 	ADDNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end, -- ADDNV
 	MULNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end, -- MULNV
 	-- SUB and DIV are not commutative, so the constant is passed first to ALU_IMM_NV
 	SUBNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'SUB') end, -- SUBNV
 	DIVNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'DIV') end, -- DIVNV
 	-- Binary operations between registers
 	-- A = B op D (both operands are variable slots)
 	ADDVV = function(a, b, _, d) return ALU_REG(a, b, d, 'ADD') end,
 	SUBVV = function(a, b, _, d) return ALU_REG(a, b, d, 'SUB') end,
 	MULVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MUL') end,
 	DIVVV = function(a, b, _, d) return ALU_REG(a, b, d, 'DIV') end,
 	MODVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MOD') end,
 	-- Strings
 	CAT = function(a, b, _, d) -- CAT A = B ~ D
 		-- Concatenation is folded at compile time only: both operands must be
 		-- known constants, and both constants must be Lua strings.
 		local lhs = V[b].const
 		assert(lhs and V[d].const, 'NYI: CAT only works on compile-time expressions')
 		local rhs = V[d].const
 		local both_strings = type(lhs) == 'string' and type(rhs) == 'string'
 		assert(both_strings,
 			'NYI: CAT only works on compile-time strings')
 		-- The result is again a compile-time constant stored in slot A
 		vset(a, nil, lhs .. rhs)
 	end,
 	-- Tables
 	GGET = function (a, _, c, _) -- GGET (A = GLOBAL[c])
 		-- Globals are resolved in the emitter environment at compile time;
 		-- an unresolved name is a hard error.
 		if env[c] == nil then
 			error(string.format("undefined global '%s'", c))
 		end
 		vset(a, nil, env[c])
 	end,
 	UGET = function (a, _, c, _) -- UGET (A = UPVALUE[c])
 		-- Upvalues are looked up by name in the same emitter environment as
 		-- globals; an unresolved name is a hard error.
 		if env[c] == nil then
 			error(string.format("undefined upvalue '%s'", c))
 		end
 		vset(a, nil, env[c])
 	end,
 	TSETB = function (a, b, _, d) -- TSETB (B[D] = A)
 		-- Store slot A into B[D], where D is a literal index (not a variable slot).
 		-- B must carry a compile-time table describing one of:
 		--   * a BPF map (__map)            -> map update
 		--   * a typed buffer (__dissector) -> direct memory store, either through a
 		--     map value pointer or relative to the stack (__base)
 		--   * a plain Lua table            -> compile-time table assignment
 		-- Raises an 'NYI' error for anything else.
 		assert(V[b] and type(V[b].const) == 'table', 'NYI: B[D] where B is not Lua table, BPF map, or pointer')
 		local vinfo = V[b].const
 		if vinfo.__map then -- BPF map write (constant key)
 			return MAP_SET(b, nil, d, a) -- D is literal
 		elseif vinfo.__dissector then
 			local w = ffi.sizeof(vinfo.__dissector)
 			-- TODO: support vectorized moves larger than register width
 			assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8')
 			local src_reg, const = vscalar(a, w)
 			-- If changing map value, write to absolute address + offset
 			if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then
 				local dst_reg = vreg(b)
 				-- NOTE(review): the offset here is the raw literal D, while the stack
 				-- branch below scales it by the element width; confirm intended units
 				-- Optimization: immediate values (imm32) can be stored directly
 				if type(const) == 'number' then
 					emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, d, const)
 				else
 					emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, d, 0)
 				end
 			-- Table is already on stack, write to vinfo-relative address
 			elseif vinfo.__base then
 				-- Optimization: immediate values (imm32) can be stored directly
 				if type(const) == 'number' then
 					emit(BPF.MEM + BPF.ST + const_width[w], 10, 0, -vinfo.__base + (d * w), const)
 				else
 					emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -vinfo.__base + (d * w), 0)
 				end
 			else
 				error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
 			end
 		elseif V[a].const then
 			-- Plain Lua table known at compile time. D is a literal key for this
 			-- opcode, so it indexes the table directly; looking it up as a variable
 			-- slot (V[d]) would store under the wrong key or fail on an unused slot.
 			vinfo[d] = V[a].const
 		else
 			error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
 		end
 	end,
 	TSETV = function (a, b, _, d) -- TSETV (B[D] = A)
 		-- Store slot A into B[D], where D is a variable slot.
 		-- B must carry a compile-time table: a BPF map (__map), a typed buffer
 		-- (__dissector) behind a map value pointer or on the stack (__base),
 		-- or a plain Lua table when both key and value are compile-time constants.
 		assert(V[b] and type(V[b].const) == 'table', 'NYI: B[D] where B is not Lua table, BPF map, or pointer')
 		local vinfo = V[b].const
 		if vinfo.__map then -- BPF map read (constant)
 			return MAP_SET(b, d, nil, a) -- D is variable
 		elseif vinfo.__dissector then
 			-- Always true inside this branch; kept as a guard
 			assert(vinfo.__dissector, 'NYI: B[D] where B does not have a known element size')
 			local w = ffi.sizeof(vinfo.__dissector)
 			-- TODO: support vectorized moves larger than register width
 			assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8')
 			local src_reg, const = vscalar(a, w)
 			-- If changing map value, write to absolute address + offset
 			if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then
 				-- Calculate variable address from two registers
 				-- (the slot after `stackslots` is used as a temporary variable)
 				-- NOTE(review): this sets the slot number `d` as a constant via vset,
 				-- whereas the stack branch below copies the variable with vcopy and
 				-- scales by element width; confirm which is intended
 				local tmp_var = stackslots + 1
 				vset(tmp_var, nil, d)
 				ALU_REG(tmp_var, tmp_var, b, 'ADD')
 				local dst_reg = vreg(tmp_var)
 				V[tmp_var].reg = nil -- Only temporary allocation
 				-- Optimization: immediate values (imm32) can be stored directly
 				if type(const) == 'number' and w < 8 then
 					emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, 0, const)
 				else
 					emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, 0, 0)
 				end
 			-- Table is already on stack, write to vinfo-relative address
 			elseif vinfo.__base then
 				-- Calculate variable address from two registers
 				local tmp_var = stackslots + 1
 				vcopy(tmp_var, d)                       -- Element position
 				if w > 1 then
 					ALU_IMM(tmp_var, tmp_var, w, 'MUL') -- multiply by element size
 				end
 				local dst_reg = vreg(tmp_var)           -- add R10 (stack pointer)
 				emit(BPF.ALU64 + BPF.ADD + BPF.X, dst_reg, 10, 0, 0)
 				V[tmp_var].reg = nil -- Only temporary allocation
 				-- Optimization: immediate values (imm32) can be stored directly
 				if type(const) == 'number' and w < 8 then
 					emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, -vinfo.__base, const)
 				else
 					emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, -vinfo.__base, 0)
 				end
 			else
 				error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
 			end
 		-- Plain Lua table: both key and value must be compile-time constants
 		elseif vinfo and V[d].const and V[a].const then
 			vinfo[V[d].const] = V[a].const
 		else
 			error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
 		end
 	end,
 	TSETS = function (a, b, c, _) -- TSETS (B[C] = A)
 		-- Store slot A into field `c` (a string key) of B.
 		-- With a __dissector type the field offset and width come from the C type
 		-- and a memory store is emitted; otherwise B is treated as a plain Lua
 		-- table and the assignment is folded at compile time.
 		assert(V[b] and V[b].const, 'NYI: B[D] where B is not Lua table, BPF map, or pointer')
 		local base = V[b].const
 		if base.__dissector then
 			-- Second result of ffi.offsetof is only set for bitfields, which are rejected
 			local ofs,bpos = ffi.offsetof(base.__dissector, c)
 			assert(not bpos, 'NYI: B[C] = A, where C is a bitfield')
 			local w = builtins.sizeofattr(base.__dissector, c)
 			-- TODO: support vectorized moves larger than register width
 			assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8')
 			local src_reg, const = vscalar(a, w)
 			-- If changing map value, write to absolute address + offset
 			if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then
 				local dst_reg = vreg(b)
 				-- Optimization: immediate values (imm32) can be stored directly
 				if type(const) == 'number' and w < 8 then
 					emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, ofs, const)
 				else
 					emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, ofs, 0)
 				end
 			-- Table is already on stack, write to base-relative address
 			elseif base.__base then
 				-- Optimization: immediate values (imm32) can be stored directly
 				if type(const) == 'number' and w < 8 then
 					emit(BPF.MEM + BPF.ST + const_width[w], 10, 0, -base.__base + ofs, const)
 				else
 					emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -base.__base + ofs, 0)
 				end
 			else
 				error('NYI: B[C] where B is not Lua table, BPF map, or pointer')
 			end
 		-- Plain Lua table: the value must be a compile-time constant
 		elseif V[a].const then
 			base[c] = V[a].const
 		else
 			error('NYI: B[C] where B is not Lua table, BPF map, or pointer')
 		end
 	end,
 	TGETB = function (a, b, _, d) -- TGETB (A = B[D])
 		-- Load B[D] into slot A, where D is a literal index.
 		-- Supported sources: BPF map lookup, pointer load through a dissector,
 		-- and PTR[0] as a pointer dereference. Anything else raises 'NYI'.
 		local base = V[b].const
 		assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map')
 		-- Re-initialise the destination unless it aliases the source
 		if a ~= b then vset(a) end
 		if base.__map then -- BPF map read (constant)
 			MAP_GET(a, b, nil, d)
 		-- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM)
 		elseif V[b].source and V[b].source:find('ptr_to_') then
 			-- Element type defaults to a single byte when no dissector is attached
 			local vtype = base.__dissector and base.__dissector or ffi.typeof('uint8_t')
 			LOAD(a, b, d, vtype)
 		-- Specialise PTR[0] as dereference operator
 		elseif cdef.isptr(V[b].type) and d == 0 then
 			vcopy(a, b)
 			local dst_reg = vreg(a)
 			-- Dereference in place: source and destination register are the same
 			vderef(dst_reg, dst_reg, V[a])
 			V[a].type = V[a].const.__dissector
 		else
 			error('NYI: A = B[D], where B is not Lua table or packet dissector or pointer dereference')
 		end
 	end,
 	TGETV = function (a, b, _, d) -- TGETV (A = B[D])
 		-- Load B[D] into slot A, where D is a variable slot.
 		-- Supported sources: BPF map lookup, pointer load through a dissector,
 		-- and a plain Lua table indexed by a compile-time number.
 		local base = V[b].const
 		assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map')
 		-- Re-initialise the destination unless it aliases the source
 		if a ~= b then vset(a) end
 		if base.__map then -- BPF map read
 			MAP_GET(a, b, d)
 		-- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM)
 		elseif V[b].source and V[b].source:find('ptr_to_') then
 			-- Element type defaults to a single byte when no dissector is attached
 			local vtype = base.__dissector and base.__dissector or ffi.typeof('uint8_t')
 			LOAD(a, b, d, vtype)
 		-- Constant dereference
 		elseif type(V[d].const) == 'number' then
 			-- Folded at compile time: only the constant of slot A is updated
 			V[a].const = base[V[d].const]
 		else
 			error('NYI: A = B[D], where B is not Lua table or packet dissector or pointer dereference')
 		end
 	end,
 	TGETS = function (a, b, c, _) -- TGETS (A = B[C])
 		-- Load field `c` (a string key) of B into slot A.
 		-- With a __dissector type the key is resolved to a struct member (possibly a
 		-- bitfield), an aliased member name, or a nested protocol dissector from `proto`.
 		-- Without a dissector, B is a plain Lua table and the lookup is folded.
 		local base = V[b].const
 		assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map')
 		-- Re-initialise the destination unless it aliases the source
 		if a ~= b then vset(a) end
 		if base.__dissector then
 			-- bpos/bsize are only set when the member is a bitfield
 			local ofs,bpos,bsize = ffi.offsetof(base.__dissector, c)
 			-- Resolve table key using metatable
 			-- (a string result is treated as the name of the real member)
 			if not ofs and type(base.__dissector[c]) == 'string' then
 				c = base.__dissector[c]
 				ofs,bpos,bsize = ffi.offsetof(base.__dissector, c)
 			end
 			if not ofs and proto[c] then -- Load new dissector on given offset
 				BUILTIN(proto[c], a, b, c)
 			else
 				-- Loading register from offset is a little bit tricky as there are
 				-- several data sources and value loading modes with different restrictions
 				-- such as checking pointer values for NULL compared to using stack.
 				assert(ofs, tostring(base.__dissector)..'.'..c..' attribute not exists')
 				if a ~= b then vset(a) end
 				-- Dissected value is probably not constant anymore
 				local new_const = nil
 				local w, atype = builtins.sizeofattr(base.__dissector, c)
 				-- [SP+K] addressing using R10 (stack pointer)
 				-- Doesn't need to be checked for NULL
 				if base.__base and base.__base > 0 then
 					if cdef.isptr(atype) then -- If the member is pointer type, update base pointer with offset
 						new_const = {__base = base.__base-ofs}
 					else
 						local dst_reg = vreg(a, nil, true)
 						emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, 10, -base.__base+ofs, 0)
 					end
 				-- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM)
 				elseif V[b].source and V[b].source:find('ptr_to_') then
 					LOAD(a, b, ofs, atype)
 				else
 					error('NYI: B[C] where B is not Lua table, BPF map, or pointer')
 				end
 				-- Bitfield, must be further narrowed with a bitmask/shift
 				if bpos then
 					-- Build a mask with `bsize` bits set, counting bit positions
 					-- from the most significant bit of the w-byte container
 					local mask = 0
 					for i=bpos+1,bpos+bsize do
 						mask = bit.bor(mask, bit.lshift(1, w*8-i))
 					end
 					emit(BPF.ALU64 + BPF.AND + BPF.K, vreg(a), 0, 0, mask)
 					-- Free optimization: single-bit values need just boolean result
 					if bsize > 1 then
 						-- Shift the masked field down so it starts at bit 0
 						local shift = w*8-bsize-bpos
 						if shift > 0 then
 							emit(BPF.ALU64 + BPF.RSH + BPF.K, vreg(a), 0, 0, shift)
 						end
 					end
 				end
 				-- Destination inherits the member type and the source annotation of B
 				V[a].type = atype
 				V[a].const = new_const
 				V[a].source = V[b].source
 				-- Track direct access to skb data
 				-- see https://www.kernel.org/doc/Documentation/networking/filter.txt "Direct packet access"
 				if ffi.istype(base.__dissector, ffi.typeof('struct sk_buff')) then
 					-- Direct access to skb uses skb->data and skb->data_end
 					-- which are encoded as u32, but are actually pointers
 					if c == 'data' or c == 'data_end' then
 						V[a].const = {__dissector = ffi.typeof('uint8_t')}
 						V[a].source = 'ptr_to_skb'
 					end
 				end
 			end
 		else
 			-- Plain Lua table: fold the lookup into the compile-time constant
 			V[a].const = base[c]
 		end
 	end,
 	-- Loops and branches
 	CALLM = function (a, b, _, d) -- A = A(A+1, ..., A+D+MULTRES)
 		-- NYI: Support single result only
 		-- The operand is adjusted by 2 before delegating to the shared CALL helper
 		-- NOTE(review): presumably accounts for the single multi-result argument; confirm
 		CALL(a, b, d+2)
 	end,
 	CALL = function (a, b, _, d) -- A = A(A+1, ..., A+D-1)
 		-- Plain call: operands are forwarded unchanged to the shared CALL helper
 		CALL(a, b, d)
 	end,
 	JMP = function (a, _, c, _) -- JMP
 		-- Translate a LuaJIT JMP to bytecode position `c`. Three cases:
 		--   1. A comparison was seen just before (code.seen_cmp): the already emitted
 		--      CMP instruction is turned into the conditional jump itself.
 		--   2. The jump targets the next instruction: nothing is emitted.
 		--   3. Unconditional jump: an always-true comparison is synthesised.
 		-- Jump offsets are not known yet, so the emitted instruction position is
 		-- recorded in code.fixup[c] and patched when bytecode position `c` is reached.
 		-- Discard unused slots after jump
 		for i, _ in pairs(V) do
 			if i >= a and i < stackslots then
 				V[i] = nil
 			end
 		end
 		-- Cross basic block boundary if the jump target isn't provably unreachable
 		local val = code.fixup[c] or {}
 		if code.seen_cmp and code.seen_cmp ~= ALWAYS then
 			if code.seen_cmp ~= NEVER then -- Do not emit the jump or fixup
 				-- Store previous CMP insn for reemitting after compensation code
 				-- (copied by value, then the instruction is popped by rewinding code.pc)
 				local jmpi = ffi.new('struct bpf_insn', code.insn[code.pc-1])
 				code.pc = code.pc - 1
 				-- First branch point, emit compensation code
 				local Vcomp = Vstate[c]
 				if not Vcomp then
 					-- Select scratch register (R0-5) that isn't used as operand
 					-- in the CMP instruction, as the variable may not be live, after
 					-- the JMP, but it may be used in the JMP+CMP instruction itself
 					local tmp_reg = 0
 					for reg = 0, 5 do
 						if reg ~= jmpi.dst_reg and reg ~= jmpi.src_reg then
 							tmp_reg = reg
 							break
 						end
 					end
 					-- Force materialization of constants at the end of BB
 					for i, v in pairs(V) do
 						if not v.reg and cdef.isimmconst(v) then
 							vreg(i, tmp_reg) -- Load to TMP register (not saved)
 							reg_spill(i) -- Spill caller-saved registers
 						end
 					end
 					-- Record variable state
 					-- (the jump target keeps the current table, execution continues on a copy)
 					Vstate[c] = V
 					Vcomp = V
 					V = table_copy(V)
 				-- Variable state already set, emit specific compensation code
 				else
 					bb_end(Vcomp)
 				end
 				-- Record pointer NULL check from condition
 				-- If the condition checks pointer variable against NULL,
 				-- we can assume it will not be NULL in the fall-through block
 				if code.seen_null_guard then
 					local var = code.seen_null_guard
 					-- The null guard can have two forms:
 					--   if x == nil then goto
 					--   if x ~= nil then goto
 					-- First form guarantees that the variable will be non-nil on the following instruction
 					-- Second form guarantees that the variable will be non-nil at the jump target
 					local vinfo = code.seen_null_guard_inverse and Vcomp[var] or V[var]
 					if vinfo.source then
 						-- Strip the '_or_null' suffix (and anything after it) from the annotation
 						local pos = vinfo.source:find('_or_null', 1, true)
 						if pos then
 							vinfo.source = vinfo.source:sub(1, pos - 1)
 						end
 					end
 				end
 				-- Reemit CMP insn
 				emit(jmpi.code, jmpi.dst_reg, jmpi.src_reg, jmpi.off, jmpi.imm)
 				-- Fuse JMP into previous CMP opcode, mark JMP target for fixup
 				-- as we don't know the relative offset in generated code yet
 				table.insert(val, code.pc-1)
 				code.fixup[c] = val
 			end
 			-- The comparison state is consumed by this JMP in either case
 			code.seen_cmp = nil
 			code.seen_null_guard = nil
 			code.seen_null_guard_inverse = nil
 		elseif c == code.bc_pc + 1 then -- luacheck: ignore 542
 			-- Eliminate jumps to next immediate instruction
 			-- e.g. 0002    JMP      1 => 0003
 		else
 			-- We need to synthesise a condition that's always true, however
 			-- BPF prohibits pointer arithmetic to prevent pointer leaks
 			-- so we have to clear out one register and use it for cmp that's always true
 			local dst_reg = reg_alloc(stackslots)
 			V[stackslots].reg = nil -- Only temporary allocation
 			-- First branch point, emit compensation code
 			local Vcomp = Vstate[c]
 			if not Vcomp then
 				-- Force materialization of constants at the end of BB
 				for i, v in pairs(V) do
 					if not v.reg and cdef.isimmconst(v) then
 						vreg(i, dst_reg) -- Load to TMP register (not saved)
 						reg_spill(i) -- Spill caller-saved registers
 					end
 				end
 				-- Record variable state
 				Vstate[c] = V
 				V = table_copy(V)
 			-- Variable state already set, emit specific compensation code
 			else
 				bb_end(Vcomp)
 			end
 			-- dst_reg = 0; if dst_reg == 0 goto <target>
 			-- (0xffff is a placeholder offset that is overwritten by the fixup)
 			emit(BPF.ALU64 + BPF.MOV + BPF.K, dst_reg, 0, 0, 0)
 			emit(BPF.JMP + BPF.JEQ + BPF.K, dst_reg, 0, 0xffff, 0)
 			table.insert(val, code.pc-1) -- Fixup JMP target
 			code.reachable = false -- Code following the JMP is not reachable
 			code.fixup[c] = val
 		end
 	end,
 	RET1 = function (a, _, _, _) -- RET1
 		-- Return the value of slot A: place it in R0 and emit EXIT.
 		-- Free optimisation: spilled variable will not be filled again
 		-- (all register assignments except for the returned slot are dropped)
 		for i, v in pairs(V) do
 			if i ~= a then v.reg = nil end
 		end
 		-- Make sure the returned value lives in R0
 		if V[a].reg ~= 0 then vreg(a, 0) end
 		-- Convenience: dereference pointer variables
 		-- e.g. 'return map[k]' will return actual map value, not pointer
 		if cdef.isptr(V[a].type) then
 			vderef(0, 0, V[a])
 		end
 		emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0)
 		-- Anything emitted after EXIT is unreachable until a jump target is hit
 		code.reachable = false
 	end,
 	RET0 = function (_, _, _, _) -- RET0
 		-- Return without a value: load immediate 0 into R0 and emit EXIT
 		emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0)
 		emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0)
 		-- Anything emitted after EXIT is unreachable until a jump target is hit
 		code.reachable = false
 	end,
 	-- Not an opcode handler: hands the emitter's code object back to the caller
 	compile = function ()
 		return code
 	end
 }

 -- Composite instructions
 function BC.CALLT(a, _, _, d) -- Tailcall: return A(A+1, ..., A+D-1)
 	-- A tail call is lowered to a regular call followed by a RET1 of the result slot
 	CALL(a, 1, d)
 	BC.RET1(a)
 end

 -- Always initialize R6 with R1 context
 -- (emitted unconditionally as the first instruction of every program)
 emit(BPF.ALU64 + BPF.MOV + BPF.X, 6, 1, 0, 0)
 -- Register R6 as context variable (first argument)
 -- The type info defaults to proto.skb when no parameter type was supplied
 if params and params > 0 then
 	vset(0, 6, param_types[1] or proto.skb)
 	assert(V[0].source == V[0].const.source) -- Propagate source annotation from typeinfo
 end
 -- Register tmpvars
 -- (two extra slots past the function's own stack slots, used as scratch by handlers)
 vset(stackslots)
 vset(stackslots+1)
 -- The emitter is the handler table itself: calling it with a decoded LuaJIT
 -- instruction dispatches to the matching handler.
 return setmetatable(BC, {
 	-- Lookup of a handler that does not exist: numeric keys (raw opcodes) raise
 	-- an 'NYI' error naming the opcode, other keys yield nil.
 	__index = function (_, k, _)
 		if type(k) == 'number' then
 			-- Opcode names are read as fixed-width 6-character fields of bcnames
 			local op_str = string.sub(require('jit.vmdef').bcnames, 6*k+1, 6*k+6)
 			error(string.format("NYI: opcode '0x%02x' (%-04s)", k, op_str))
 		end
 	end,
 	-- Process one bytecode instruction: reconcile variable state at basic block
 	-- boundaries, patch pending jumps that target this instruction, then run
 	-- the handler if the instruction is reachable.
 	__call = function (t, op, a, b, c, d)
 		code.bc_pc = code.bc_pc + 1
 		-- Exitting BB straight through, emit compensation code
 		if Vstate[code.bc_pc] then
 			if code.reachable then
 				-- Instruction is reachable from previous line
 				-- so we must make the variable allocation consistent
 				-- with the variable allocation at the jump source
 				-- e.g. 0001 x:R0 = 5
 				--      0002 if rand() then goto 0005
 				--      0003 x:R0 -> x:stack
 				--      0004 y:R0 = 5
 				--      0005 x:? = 10 <-- x was in R0 before jump, and stack after jump
 				bb_end(Vstate[code.bc_pc])
 			else
 				-- Instruction isn't reachable from previous line, restore variable layout
 				-- e.g. RET or condition-less JMP on previous line
 				V = table_copy(Vstate[code.bc_pc])
 			end
 		end
 		-- Perform fixup of jump targets
 		-- We need to do this because the number of consumed and emitted
 		-- bytecode instructions is different
 		local fixup = code.fixup[code.bc_pc]
 		if fixup ~= nil then
 			-- Patch JMP source insn with relative offset
 			for _,pc in ipairs(fixup) do
 				code.insn[pc].off = code.pc - 1 - pc
 			end
 			code.fixup[code.bc_pc] = nil
 			-- A jump lands here, so the code is reachable again
 			code.reachable = true
 		end
 		-- Execute
 		-- (unreachable instructions are skipped entirely: dead code elimination)
 		if code.reachable then
 			assert(t[op], string.format('NYI: instruction %s, parameters: %s,%s,%s,%s', op,a,b,c,d))
 			return t[op](a, b, c, d)
 		end
 	end,
 })
 end

 -- Emitted code dump
 local function dump_mem(cls, ins, _, fuse)
 	-- Format one load/store instruction as 'NAME<tab>DST<tab>SRC'.
 	-- `cls` is the instruction class, `ins` the instruction; when `fuse` is set
 	-- (second half of a fused instruction) name and destination are left empty.
 	-- Refer to https://www.kernel.org/doc/Documentation/networking/filter.txt for instruction format
 	local size_suffix = {[0]='W', [8]='H', [16]='B', [24]='DW'}
 	local mnemonic = {'LD', 'LDX', 'ST', 'STX', '', 'XADD'}
 	local mode = bit.band(ins.code, 0xe0)
 	-- XADD is printed with its own mnemonic regardless of the store class
 	if mode == BPF.XADD then
 		cls = 5
 	end
 	local name = mnemonic[cls+1] .. size_suffix[bit.band(ins.code, 0x18)]
 	-- The offset field is reinterpreted as a signed 16-bit value
 	local off = tonumber(ffi.cast('int16_t', ins.off))
 	-- Loads target a register, everything else targets memory
 	local dst
 	if cls < 2 then
 		dst = 'R'..ins.dst_reg
 	else
 		dst = string.format('[R%d%+d]', ins.dst_reg, off)
 	end
 	-- Source operand, most specific addressing mode first
 	local src
 	if mode == BPF.IND then
 		src = string.format('skb[R%d%+d]', ins.src_reg, ins.imm)
 	elseif mode == BPF.ABS then
 		src = string.format('skb[%d]', ins.imm)
 	elseif cls == BPF.LDX then
 		src = string.format('[R%d%+d]', ins.src_reg, off)
 	elseif cls % 2 == 0 then
 		src = '#'..ins.imm
 	else
 		src = 'R'..ins.src_reg
 	end
 	if fuse then
 		name, dst = '', ''
 	end
 	return string.format('%s\t%s\t%s', name, dst, src)
 end

 local function dump_alu(cls, ins, pc)
 	-- Format one ALU or JMP instruction as 'NAME<tab>DST<tab>SRC[<tab>annotation]'.
 	-- Jumps are annotated with their absolute target, calls with the helper name.
 	-- Refer to https://www.kernel.org/doc/Documentation/networking/filter.txt for instruction format
 	local alu = {'ADD', 'SUB', 'MUL', 'DIV', 'OR', 'AND', 'LSH', 'RSH', 'NEG', 'MOD', 'XOR', 'MOV', 'ARSH', 'END' }
 	local jmp = {'JA', 'JEQ', 'JGT', 'JGE', 'JSET', 'JNE', 'JSGT', 'JSGE', 'CALL', 'EXIT'}
 	local helper = {'unspec', 'map_lookup_elem', 'map_update_elem', 'map_delete_elem', 'probe_read', 'ktime_get_ns',
 					'trace_printk', 'get_prandom_u32', 'get_smp_processor_id', 'skb_store_bytes',
 					'l3_csum_replace', 'l4_csum_replace', 'tail_call', 'clone_redirect', 'get_current_pid_tgid',
 					'get_current_uid_gid', 'get_current_comm', 'get_cgroup_classid', 'skb_vlan_push', 'skb_vlan_pop',
 					'skb_get_tunnel_key', 'skb_set_tunnel_key', 'perf_event_read', 'redirect', 'get_route_realm',
 					'perf_event_output', 'skb_load_bytes'}
 	local is_jmp = (cls == 5)
 	-- The operation lives in the high nibble; only codes 0..13 have a name,
 	-- anything above keeps the 'no operation found' index 0
 	local op = 0
 	local op_code = bit.rshift(bit.band(ins.code, 0xf0), 4)
 	if op_code <= 13 then
 		op = op_code + 1
 	end
 	-- Jump class falls back to the ALU name when the jump table has no entry
 	local name = alu[op]
 	if is_jmp and jmp[op] then
 		name = jmp[op]
 	end
 	-- Source is either a register or an immediate
 	local src
 	if bit.band(ins.code, 0x08) == BPF.X then
 		src = 'R'..ins.src_reg
 	else
 		src = '#'..ins.imm
 	end
 	local target = ''
 	if is_jmp then
 		if op < 9 then
 			-- Branches: show the absolute instruction index of the target
 			target = string.format('\t=> %04d', pc + ins.off + 1)
 		elseif op == 9 then
 			-- Calls: show the helper name, or the raw id if unknown
 			target = string.format('\t; %s', helper[ins.imm + 1] or tostring(ins.imm))
 		end
 	end
 	return string.format('%s\t%s\t%s%s', name, 'R'..ins.dst_reg, src, target)
 end

 local function dump_string(code, off, hide_counter)
 	-- Disassemble emitted code into a newline-separated string.
 	--   code          code object with `insn` (instruction array) and `pc` (instruction count);
 	--                 nothing is returned when it is nil/false
 	--   off           first instruction to dump (default 0)
 	--   hide_counter  when truthy, lines are not prefixed with the instruction index
 	if not code then return end
 	-- Decoder per instruction class (low 3 bits of the opcode)
 	local cls_map = {
 		[0] = dump_mem, [1] = dump_mem, [2] = dump_mem, [3] = dump_mem,
 		[4] = dump_alu, [5] = dump_alu, [7] = dump_alu,
 	}
 	local result = {}
 	local fused = false
 	for i = off or 0, code.pc - 1 do
 		local ins = code.insn[i]
 		local cls = bit.band(ins.code, 0x07)
 		local decode = cls_map[cls]
 		local line
 		if decode then
 			line = decode(cls, ins, i, fused)
 		else
 			-- No decoder for this class (e.g. class 6): print the raw opcode
 			-- instead of failing the whole dump on a nil call
 			line = string.format('.insn\t0x%02x\t; unknown instruction class %d', ins.code, cls)
 		end
 		if hide_counter then
 			table.insert(result, line)
 		else
 			table.insert(result, string.format('%04u\t%s', i, line))
 		end
 		-- A 64-bit immediate load occupies two slots; the second one is
 		-- printed without name and destination
 		fused = string.find(line, 'LDDW', 1)
 	end
 	return table.concat(result, '\n')
 end

 local function dump(code)
 	-- Print a header line followed by the disassembly of `code` to stdout;
 	-- does nothing when there is no code object.
 	if code then
 		local header = string.format('-- BPF %s:0-%u', code.insn, code.pc)
 		print(header)
 		print(dump_string(code))
 	end
 end

 local function compile(prog, params)
 	-- Compile a Lua function (or a string with Lua source) into BPF code.
 	--   prog    function or source string
 	--   params  optional list of type infos for the program arguments
 	-- Returns the emitter's code object, or nil and an error value on failure.
 	-- The failing source line, error and traceback are also printed.
 	-- Create code emitter sandbox, include caller locals
 	local env = { pkt=proto.pkt, eth=proto.pkt, BPF=BPF, ffi=ffi }
 	-- Include upvalues up to 4 nested scopes back
 	-- the narrower scope overrides broader scope
 	-- (stack levels are relative to this function, so this must stay inline)
 	for k = 5, 2, -1 do
 		local i = 1
 		while true do
 			local ok, n, v = pcall(debug.getlocal, k, i)
 			if not ok or not n then break end
 			env[n] = v
 			i = i + 1
 		end
 	end
 	-- Names not captured above fall back to protocols, builtins and globals
 	setmetatable(env, {
 		__index = function (_, k)
 			return proto[k] or builtins[k] or _G[k]
 		end
 	})
 	-- Create code emitter and compile LuaJIT bytecode
 	if type(prog) == 'string' then
 		-- Report syntax errors instead of passing nil to the bytecode decoder
 		local chunk, load_err = loadstring(prog)
 		if not chunk then return nil, load_err end
 		prog = chunk
 	end
 	-- Create error handler to print traceback
 	local funci, pc = bytecode.funcinfo(prog), 0
 	local E = create_emitter(env, funci.stackslots, funci.params, params or {})
 	local on_err = function (e)
 			-- NOTE(review): `pc` is never advanced by the loop below, so the line
 			-- looked up here is always the one for pc 0 - confirm intent
 			funci = bytecode.funcinfo(prog, pc)
 			-- Walk the source to find the bounds of the current line
 			local from, to = 0, 0
 			for _ = 1, funci.currentline do
 				from = to
 				to = string.find(funci.source, '\n', from+1, true) or 0
 			end
 			print(funci.loc..':'..string.sub(funci.source, from+1, to-1))
 			print('error: '..e)
 			print(debug.traceback())
 			-- Hand the error back to xpcall so that the caller receives it
 			return e
 	end
 	for _,op,a,b,c,d in bytecode.decoder(prog) do
 		-- On failure xpcall returns false plus the handler's result
 		local ok, err = xpcall(E,on_err,op,a,b,c,d)
 		if not ok then
 			return nil, err
 		end
 	end
 	return E:compile()
 end

 -- BPF map interface
 -- Metatable for userspace BPF map objects. A map object carries `fd`, one-element
 -- scratch buffers `key`/`val`, `val_type` and `max_entries`.
 local bpf_map_mt = {
 	-- NOTE(review): plain Lua 5.1 semantics ignore __gc on tables; confirm the
 	-- descriptor is actually released through this hook
 	__gc = function (map) S.close(map.fd) end,
 	__len = function(map) return map.max_entries end,
 	-- map[k]: string keys select helpers ('pairs', 'reader') or answer the '__map'
 	-- type probe; any other key performs a map lookup and returns a copy of the
 	-- value, or nil and an error.
 	__index = function (map, k)
 		if type(k) == 'string' then
 			-- Return iterator
 			if k == 'pairs' then
 				-- Iterator triple for the generic `for`: (next function, map, nil)
 				return function(t, key)
 					-- Get next key
 					local next_key = ffi.new(ffi.typeof(t.key))
 					local cur_key
 					if key then
 						-- Continue after the previous key, reusing the map's key buffer
 						cur_key = t.key
 						t.key[0] = key
 					else
 						-- First iteration starts from a fresh key buffer
 						cur_key = ffi.new(ffi.typeof(t.key))
 					end
 					local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_GET_NEXT_KEY, map.fd, cur_key, next_key)
 					if not ok then return nil, err end
 					-- Get next value
 					assert(S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, next_key, map.val))
 					return next_key[0], map.val[0]
 				end, map, nil
 			-- Read for perf event map
 			elseif k == 'reader' then
 				return function (pmap, pid, cpu, event_type)
 					-- Caller must either specify PID or CPU
 					if not pid or pid < 0 then
 						assert((cpu and cpu >= 0), 'NYI: creating composed reader for all CPUs')
 						pid = -1
 					end
 					-- Create BPF output reader
 					local pe = S.t.perf_event_attr1()
 					pe[0].type = 'software'
 					pe[0].config = 'sw_bpf_output'
 					pe[0].sample_type = 'raw'
 					pe[0].sample_period = 1
 					pe[0].wakeup_events = 1
 					local reader, err = S.t.perf_reader(S.perf_event_open(pe, pid, cpu or -1))
 					if not reader then return nil, tostring(err) end
 					-- Register event reader fd in BPF map
 					-- NOTE(review): `cpu` is used as the map index below, so a
 					-- PID-only reader (cpu == nil) fails here - confirm intent
 					assert(cpu < pmap.max_entries, string.format('BPF map smaller than read CPU %d', cpu))
 					pmap[cpu] = reader.fd
 					-- Open memory map and start reading
 					local ok, start_err = reader:start()
 					assert(ok, tostring(start_err))
 					ok, start_err = reader:mmap()
 					assert(ok, tostring(start_err))
 					return cdef.event_reader(reader, event_type)
 				end
 			-- Signalise this is a map type
 			end
 			return k == '__map'
 		end
 		-- Retrieve key
 		map.key[0] = k
 		local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, map.key, map.val)
 		if not ok then return nil, err end
 		-- Return a copy so the result is not overwritten by the next lookup
 		return ffi.new(map.val_type, map.val[0])
 	end,
 	-- map[k] = v updates the element; map[k] = nil deletes it
 	__newindex = function (map, k, v)
 		map.key[0] = k
 		if v == nil then
 			-- Command first, descriptor second, like every other bpf_map_op call
 			return S.bpf_map_op(S.c.BPF_CMD.MAP_DELETE_ELEM, map.fd, map.key, nil)
 		end
 		map.val[0] = v
 		return S.bpf_map_op(S.c.BPF_CMD.MAP_UPDATE_ELEM, map.fd, map.key, map.val)
 	end,
 }

 -- Linux tracing interface
 local function trace_check_enabled(path)
 	-- Check that the tracing directory (default: the debugfs tracing root) can be
 	-- stat'ed. Returns true, or nil and a hint on how to make it accessible.
 	local tracing_dir = path or '/sys/kernel/debug/tracing'
 	if not S.statfs(tracing_dir) then
 		return nil, 'debugfs not accessible: "mount -t debugfs nodev /sys/kernel/debug"? missing sudo?'
 	end
 	return true
 end

 -- Tracepoint interface
 local tracepoint_mt = {
 	__index = {
 		-- Attach a BPF program to an open tracepoint `t`.
 		-- `prog` is either an already compiled program (a table) or something
 		-- compile() accepts. Returns the loaded program descriptor, which is
 		-- also remembered in t.progs.
 		bpf = function (t, prog)
 			local compiled = prog
 			if type(compiled) ~= 'table' then
 				-- Create protocol parser with source probe
 				local probe_arg = proto.type(t.type, {source='ptr_to_probe'})
 				compiled = compile(compiled, {probe_arg})
 			end
 			-- Load the program into the kernel; loading failures are fatal
 			local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.TRACEPOINT, compiled.insn, compiled.pc)
 			assert(prog_fd, tostring(err)..': '..tostring(log))
 			-- Hook the program to the tracepoint reader and keep a reference to it
 			t.reader:setbpf(prog_fd:getfd())
 			table.insert(t.progs, prog_fd)
 			return prog_fd
 		end,
 	}
 }
 -- Open tracepoint
 local function tracepoint_open(path, pid, cpu, group_fd)
 	-- Open the tracepoint `path` (relative to the tracing events directory) and
 	-- wrap it in an object exposing :bpf(). Any failing step raises an error.
 	local events_path = '/sys/kernel/debug/tracing/events/'..path
 	local tp = assert(S.perf_tracepoint(events_path))
 	-- Type describing the tracepoint arguments, derived from its path
 	local tp_type = assert(cdef.tracepoint_type(path))
 	-- Reader the BPF programs get attached to
 	local reader = assert(S.perf_attach_tracepoint(tp, pid, cpu, group_fd))
 	local probe = {tp=tp,type=tp_type,reader=reader,progs={}}
 	return setmetatable(probe, tracepoint_mt)
 end

 local function trace_bpf(ptype, pname, pdef, retprobe, prog, pid, cpu, group_fd)
 	-- Create a probe of type `ptype` named `pname` with definition `pdef`,
 	-- load `prog` (compiled on demand) and attach it to the probe.
 	-- Returns a table {reader, prog, probe, probe_type}, or nil and an error
 	-- string. A program rejected by the kernel loader raises an error.
 	-- Load BPF program
 	if type(prog) ~= 'table' then
 		-- compile() reports failure as nil; return that instead of indexing nil below
 		local compiled, compile_err = compile(prog, {proto.pt_regs})
 		if not compiled then
 			return nil, compile_err and tostring(compile_err) or 'failed to compile BPF program'
 		end
 		prog = compiled
 	end
 	local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.KPROBE, prog.insn, prog.pc)
 	assert(prog_fd, tostring(err)..': '..tostring(log))
 	-- Open tracepoint and attach
 	local tp, probe_err = S.perf_probe(ptype, pname, pdef, retprobe)
 	if not tp then
 		prog_fd:close()
 		return nil, tostring(probe_err)
 	end
 	local reader, attach_err = S.perf_attach_tracepoint(tp, pid, cpu, group_fd, {sample_type='raw, callchain'})
 	if not reader then
 		-- Undo in reverse order: drop the program, then remove the probe
 		prog_fd:close()
 		S.perf_probe(ptype, pname, false)
 		return nil, tostring(attach_err)
 	end
 	local ok, setbpf_err = reader:setbpf(prog_fd:getfd())
 	if not ok then
 		prog_fd:close()
 		reader:close()
 		S.perf_probe(ptype, pname, false)
 		return nil, tostring(setbpf_err)..' (kernel version should be at least 4.1)'
 	end
 	-- Create GC closure for reader to close BPF program
 	-- and detach probe in correct order
 	ffi.gc(reader, function ()
 		prog_fd:close()
 		reader:close()
 		S.perf_probe(ptype, pname, false)
 	end)
 	return {reader=reader, prog=prog_fd, probe=pname, probe_type=ptype}
 end

 -- Module interface
 return setmetatable({
 	new = create_emitter,
 	dump = dump,
 	dump_string = dump_string,
 	maps = {},
 	-- Create a BPF map and return its userspace wrapper.
 	--   type         map type name, looked up in S.c.BPF_MAP
 	--   max_entries  capacity (default 4096)
 	--   key_ctype    key C type (default uint32_t)
 	--   val_ctype    value C type (default uint32_t)
 	-- Returns the map object, or nil and an error string.
 	map = function (type, max_entries, key_ctype, val_ctype)
 		key_ctype = key_ctype or ffi.typeof('uint32_t')
 		val_ctype = val_ctype or ffi.typeof('uint32_t')
 		max_entries = max_entries or 4096
 		local map_type = S.c.BPF_MAP[type]
 		-- Special case for BPF_MAP_STACK_TRACE
 		-- (key and value types are fixed, whatever the caller asked for)
 		if map_type == S.c.BPF_MAP.STACK_TRACE then
 			key_ctype = ffi.typeof('int32_t')
 			val_ctype = ffi.typeof('struct bpf_stacktrace')
 		end
 		local fd, err = S.bpf_map_create(map_type, ffi.sizeof(key_ctype), ffi.sizeof(val_ctype), max_entries)
 		if not fd then
 			return nil, tostring(err)
 		end
 		-- One-element arrays serve as scratch buffers for map operations
 		local wrapper = {
 			max_entries = max_entries,
 			key = ffi.new(ffi.typeof('$ [1]', key_ctype)),
 			val = ffi.new(ffi.typeof('$ [1]', val_ctype)),
 			map_type = map_type,
 			key_type = key_ctype,
 			val_type = val_ctype,
 			fd = fd:nogc():getfd(),
 		}
 		return setmetatable(wrapper, bpf_map_mt)
 	end,
 	-- Attach a socket filter program to a socket.
 	--   sock  interface name (a raw packet socket is opened and bound to it),
 	--         file descriptor number, or an ljsyscall fd object
 	--   prog  compiled program (table) or something compile() accepts
 	-- Returns the loaded program descriptor; an unsupported `sock` yields nil and
 	-- an error string, other failures raise.
 	socket = function (sock, prog)
 		local sock_kind = type(sock)
 		if sock_kind == 'string' then
 			-- Resolve the interface, then open and bind a raw packet socket
 			local iface = assert(S.nl.getlink())[sock]
 			assert(iface, sock..' is not interface name')
 			local sock_err
 			sock, sock_err = S.socket('packet', 'raw')
 			assert(sock, tostring(sock_err))
 			local bound, bind_err = sock:bind(S.t.sockaddr_ll({protocol='all', ifindex=iface.index}))
 			assert(bound, tostring(bind_err))
 		elseif sock_kind == 'number' then
 			-- Wrap the raw descriptor without taking ownership of it
 			sock = S.t.fd(sock):nogc()
 		elseif not ffi.istype(S.t.fd, sock) then
 			return nil, 'socket must either be an fd number, an interface name, or an ljsyscall socket'
 		end
 		-- Load program and attach it to socket
 		local compiled = prog
 		if type(compiled) ~= 'table' then
 			compiled = compile(compiled, {proto.skb})
 		end
 		local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.SOCKET_FILTER, compiled.insn, compiled.pc)
 		assert(prog_fd, tostring(err)..': '..tostring(log))
 		assert(sock:setsockopt('socket', 'attach_bpf', prog_fd:getfd()))
 		return prog_fd, err
 	end,
 	-- Open tracepoint `tp` and optionally attach `prog` to it.
 	-- Without a program the bare probe is returned, which allows
 	-- free specialisation of arg0 to the tracepoint type later via probe:bpf().
 	tracepoint = function(tp, prog, pid, cpu, group_fd)
 		assert(trace_check_enabled())
 		local probe = tracepoint_open(tp, pid, cpu, group_fd)
 		if not prog then
 			return probe
 		end
 		-- Load the BPF program
 		probe:bpf(prog)
 		return probe
 	end,
 	-- Attach `prog` to a kernel probe given as '<name>:<definition>'.
 	-- Returns whatever trace_bpf returns for the resulting probe.
 	kprobe = function(tp, prog, retprobe, pid, cpu, group_fd)
 		assert(trace_check_enabled())
 		-- Text before the first ':' names the probe, the rest defines it
 		local probe_name, probe_def = tp:match('([^:]+):(.+)')
 		return trace_bpf('kprobe', probe_name, probe_def, retprobe, prog, pid, cpu, group_fd)
 	end,
 	-- Attach `prog` to a userspace probe given as '<object path>:<symbol>'.
 	-- The symbol is resolved in the ELF object and translated to an address
 	-- relative to the object's load address.
 	-- Returns whatever trace_bpf returns, or nil and an error when the object
 	-- or the symbol cannot be found.
 	uprobe = function(tp, prog, retprobe, pid, cpu, group_fd)
 		assert(trace_check_enabled())
 		-- Translate symbol to address
 		local obj, sym_want = tp:match('([^:]+):(.+)')
 		if not S.statfs(obj) then return nil, S.t.error(S.c.E.NOENT) end
 		-- Resolve Elf object (no support for anything else)
 		local elf = require('bpf.elf').open(obj)
 		local sym = elf:resolve(sym_want)
 		if not sym then return nil, 'no such symbol' end
 		sym = sym.st_value - elf:loadaddr()
 		-- Format the 64-bit address from its two 32-bit halves
 		local addr_hi = tonumber(bit.rshift(sym, 32))
 		local addr_lo = tonumber(ffi.cast('uint32_t', sym))
 		local sym_addr
 		if addr_hi ~= 0 then
 			-- The low word must be padded to all 8 hex digits, otherwise the
 			-- concatenation denotes a different (smaller) address
 			sym_addr = string.format('%x%08x', addr_hi, addr_lo)
 		else
 			-- Addresses below 4 GiB keep the established formatting (and probe name)
 			sym_addr = string.format('%x%04x', addr_hi, addr_lo)
 		end
 		-- Convert it to expected uprobe format
 		local pname = string.format('%s_%s', obj:gsub('.*/', ''), sym_addr)
 		local pdef = obj..':0x'..sym_addr
 		return trace_bpf('uprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd)
 	end,
 	-- Open the kernel trace log for reading (default: the trace_pipe file).
 	-- Returns the results of io.open unchanged.
 	tracelog = function(path)
 		assert(trace_check_enabled())
 		local log_path = path or '/sys/kernel/debug/tracing/trace_pipe'
 		return io.open(log_path, 'r')
 	end,
 	ntoh = builtins.ntoh, hton = builtins.hton,
 }, {
 	-- Calling the module itself compiles `prog` without parameter type information
 	__call = function (_, prog) return compile(prog) end,
 })