| --[[ |
| Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com> |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ]] |
| -- LuaJIT to BPF bytecode compiler. |
| -- |
| -- The code generation phase is currently one-pass and produces: |
| -- * Compiled code in BPF bytecode format (https://www.kernel.org/doc/Documentation/networking/filter.txt) |
| -- * Variables with liveness analysis and other meta (spill information, compile-time value) |
| -- |
| -- The code generator optimises as much as possible in single pass: |
| -- * Fold compile-time expressions and constant propagation |
| -- * Basic control flow analysis with dead code elimination (based on compile-time expressions) |
| -- * Single-pass optimistic register allocation |
| -- |
| -- The first pass doesn't have variable lifetime visibility yet, so it relies on rewriter for further |
| -- optimisations such as: |
| -- * Dead store elimination (first-pass doesn't know if/when the variable is going to be used) |
| -- * Common sub-expression elimination (relies on DCE and liveness analysis) |
| -- * Orphan JMP elimination (removing this in first pass would break previous JMP targets) |
| -- * Better register allocation (needs to be recomputed after optimisations) |
| |
| local ffi = require('ffi') |
| local bit = require('bit') |
| local S = require('syscall') |
| local bytecode = require('bpf.ljbytecode') |
| local cdef = require('bpf.cdef') |
| local proto = require('bpf.proto') |
| local builtins = require('bpf.builtins') |
| |
| -- Constants |
| local ALWAYS, NEVER = -1, -2 |
| local BPF = ffi.typeof('struct bpf') |
| local HELPER = ffi.typeof('struct bpf_func_id') |
| |
| -- Symbolic table of constant expressions over numbers |
| local const_expr = { |
| ADD = function (a, b) return a + b end, |
| SUB = function (a, b) return a - b end, |
| DIV = function (a, b) return a / b end, |
| MOD = function (a, b) return a % b end, |
| JEQ = function (a, b) return a == b end, |
| JNE = function (a, b) return a ~= b end, |
| JGE = function (a, b) return a >= b end, |
| JGT = function (a, b) return a > b end, |
| } |
| |
| local const_width = { |
| [1] = BPF.B, [2] = BPF.H, [4] = BPF.W, [8] = BPF.DW, |
| } |
| |
| -- Built-ins that are strict only (never compile-time expandable) |
| local builtins_strict = { |
| [ffi.new] = true, |
| [print] = true, |
| } |
| |
| -- Deep copy a table |
| local function table_copy(t) |
| local copy = {} |
| for n,v in pairs(t) do |
| if type(v) == 'table' then |
| v = table_copy(v) |
| end |
| copy[n] = v |
| end |
| return copy |
| end |
| |
| -- Return true if the constant part is a proxy |
| local function is_proxy(x) |
| return type(x) == 'table' and (x.__dissector or x.__map or x.__base) |
| end |
| |
| -- Create compiler closure |
| local function create_emitter(env, stackslots, params, param_types) |
| |
| local V = {} -- Variable tracking / register allocator |
| local code = { -- Generated code |
| pc = 0, bc_pc = 0, |
| insn = ffi.new('struct bpf_insn[4096]'), |
| fixup = {}, |
| reachable = true, |
| seen_cmp = nil, |
| } |
| local Vstate = {} -- Track variable layout at basic block exits |
| |
| -- Anything below this stack offset is free to use by caller |
| -- @note: There is no tracking memory allocator, so the caller may |
| -- lower it for persistent objects, but such memory will never |
| -- be reclaimed and the caller is responsible for resetting stack |
| -- top whenever the memory below is free to be reused |
| local stack_top = (stackslots + 1) * ffi.sizeof('uint64_t') |
| |
| local function emit(op, dst, src, off, imm) |
| local ins = code.insn[code.pc] |
| ins.code = op |
| ins.dst_reg = dst |
| ins.src_reg = src |
| ins.off = off |
| ins.imm = imm |
| code.pc = code.pc + 1 |
| end |
| |
| local function reg_spill(var) |
| local vinfo = V[var] |
| assert(vinfo.reg, 'attempt to spill VAR that doesn\'t have an allocated register') |
| vinfo.spill = (var + 1) * ffi.sizeof('uint64_t') -- Index by (variable number) * (register width) |
| emit(BPF.MEM + BPF.STX + BPF.DW, 10, vinfo.reg, -vinfo.spill, 0) |
| vinfo.reg = nil |
| end |
| |
| local function reg_fill(var, reg) |
| local vinfo = V[var] |
| assert(reg, 'attempt to fill variable to register but not register is allocated') |
| assert(vinfo.spill, 'attempt to fill register with a VAR that isn\'t spilled') |
| emit(BPF.MEM + BPF.LDX + BPF.DW, reg, 10, -vinfo.spill, 0) |
| vinfo.reg = reg |
| vinfo.spill = nil |
| end |
| |
| -- Allocate a register (lazy simple allocator) |
| local function reg_alloc(var, reg) |
| -- Specific register requested, must spill/move existing variable |
| if reg then |
| for k,v in pairs(V) do -- Spill any variable that has this register |
| if v.reg == reg and not v.shadow then |
| reg_spill(k) |
| break |
| end |
| end |
| return reg |
| end |
| -- Find free or least recently used slot |
| local last, last_seen, used = nil, 0xffff, 0 |
| for k,v in pairs(V) do |
| if v.reg then |
| if not v.live_to or v.live_to < last_seen then |
| last, last_seen = k, v.live_to or last_seen |
| end |
| used = bit.bor(used, bit.lshift(1, v.reg)) |
| end |
| end |
| -- Attempt to select a free register from R7-R9 (callee saved) |
| local free = bit.bnot(used) |
| if bit.band(free, 0x80) ~= 0 then reg = 7 |
| elseif bit.band(free,0x100) ~= 0 then reg = 8 |
| elseif bit.band(free,0x200) ~= 0 then reg = 9 |
| end |
| -- Select another variable to be spilled |
| if not reg then |
| assert(last) |
| reg = V[last].reg |
| reg_spill(last) |
| end |
| assert(reg, 'VAR '..var..'fill/spill failed') |
| return reg |
| end |
| |
| -- Set new variable |
| local function vset(var, reg, const, vtype) |
| -- Must materialise all variables shadowing this variable slot, as it will be overwritten |
| if V[var] and V[var].reg then |
| for _, vinfo in pairs(V) do |
| -- Shadowing variable MUST share the same type and attributes, |
| -- but the register assignment may have changed |
| if vinfo.shadow == var then |
| vinfo.reg = V[var].reg |
| vinfo.shadow = nil |
| end |
| end |
| end |
| -- Get precise type for CDATA or attempt to narrow numeric constant |
| if not vtype and type(const) == 'cdata' then |
| vtype = ffi.typeof(const) |
| end |
| V[var] = {reg=reg, const=const, type=vtype} |
| -- Track variable source |
| if V[var].const and type(const) == 'table' then |
| V[var].source = V[var].const.source |
| end |
| end |
| |
| -- Materialize (or register) a variable in a register |
| -- If the register is nil, then the a new register is assigned (if not already assigned) |
| local function vreg(var, reg, reserve, vtype) |
| local vinfo = V[var] |
| assert(vinfo, 'VAR '..var..' not registered') |
| vinfo.live_to = code.pc-1 |
| if (vinfo.reg and not reg) and not vinfo.shadow then return vinfo.reg end |
| reg = reg_alloc(var, reg) |
| -- Materialize variable shadow copy |
| local src = vinfo |
| while src.shadow do src = V[src.shadow] end |
| if reserve then -- luacheck: ignore |
| -- No load to register occurs |
| elseif src.reg then |
| emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, src.reg, 0, 0) |
| elseif src.spill then |
| vinfo.spill = src.spill |
| reg_fill(var, reg) |
| elseif src.const then |
| vtype = vtype or src.type |
| if type(src.const) == 'table' and src.const.__base then |
| -- Load pointer type |
| emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, 10, 0, 0) |
| emit(BPF.ALU64 + BPF.ADD + BPF.K, reg, 0, 0, -src.const.__base) |
| elseif type(src.const) == 'table' and src.const.__dissector then |
| -- Load dissector offset (imm32), but keep the constant part (dissector proxy) |
| emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const.off or 0) |
| elseif vtype and ffi.sizeof(vtype) == 8 then |
| -- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32)) |
| emit(BPF.LD + BPF.DW, reg, 0, 0, ffi.cast('uint32_t', src.const)) |
| emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.rshift(bit.rshift(src.const, 16), 16))) |
| vinfo.const = nil -- The variable is live |
| else |
| emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const) |
| vinfo.const = nil -- The variable is live |
| end |
| else assert(false, 'VAR '..var..' has neither register nor constant value') end |
| vinfo.reg = reg |
| vinfo.shadow = nil |
| vinfo.live_from = code.pc-1 |
| vinfo.type = vtype or vinfo.type |
| return reg |
| end |
| |
| -- Copy variable |
| local function vcopy(dst, src) |
| if dst == src then return end |
| V[dst] = {reg=V[src].reg, const=V[src].const, shadow=src, source=V[src].source, type=V[src].type} |
| end |
| |
| -- Dereference variable of pointer type |
| local function vderef(dst_reg, src_reg, vinfo) |
| -- Dereference map pointers for primitive types |
| -- BPF doesn't allow pointer arithmetics, so use the entry value |
| assert(type(vinfo.const) == 'table' and vinfo.const.__dissector, 'cannot dereference a non-pointer variable') |
| local vtype = vinfo.const.__dissector |
| local w = ffi.sizeof(vtype) |
| assert(const_width[w], 'NYI: sizeof('..tostring(vtype)..') not 1/2/4/8 bytes') |
| if dst_reg ~= src_reg then |
| emit(BPF.ALU64 + BPF.MOV + BPF.X, dst_reg, src_reg, 0, 0) -- dst = src |
| end |
| -- Optimize the NULL check away if provably not NULL |
| if not vinfo.source or vinfo.source:find('_or_null', 1, true) then |
| emit(BPF.JMP + BPF.JEQ + BPF.K, src_reg, 0, 1, 0) -- if (src != NULL) |
| end |
| emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, src_reg, 0, 0) -- dst = *src; |
| end |
| |
| -- Allocate a space for variable |
| local function valloc(size, blank) |
| local base = stack_top |
| assert(stack_top + size < 512 * 1024, 'exceeded maximum stack size of 512kB') |
| stack_top = stack_top + size |
| -- Align to 8 byte boundary |
| stack_top = math.ceil(stack_top/8)*8 |
| -- Current kernel version doesn't support ARG_PTR_TO_RAW_STACK |
| -- so we always need to have memory initialized, remove this when supported |
| if blank then |
| if type(blank) == 'string' then |
| local sp = 0 |
| while sp < size do |
| -- TODO: no BPF_ST + BPF_DW instruction yet |
| local as_u32 = ffi.new('uint32_t [1]') |
| local sub = blank:sub(sp+1, sp+ffi.sizeof(as_u32)) |
| ffi.copy(as_u32, sub, #sub) |
| emit(BPF.MEM + BPF.ST + BPF.W, 10, 0, -(stack_top-sp), as_u32[0]) |
| sp = sp + ffi.sizeof(as_u32) |
| end |
| elseif type(blank) == 'boolean' then |
| reg_alloc(stackslots, 0) |
| emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0) |
| for sp = base+8,stack_top,8 do |
| emit(BPF.MEM + BPF.STX + BPF.DW, 10, 0, -sp, 0) |
| end |
| else error('NYI: will with unknown type '..type(blank)) end |
| end |
| return stack_top |
| end |
| |
| -- Turn variable into scalar in register (or constant) |
| local function vscalar(a, w) |
| assert(const_width[w], 'sizeof(scalar variable) must be 1/2/4/8') |
| local src_reg |
| -- If source is a pointer, we must dereference it first |
| if cdef.isptr(V[a].type) then |
| src_reg = vreg(a) |
| local tmp_reg = reg_alloc(stackslots, 1) -- Clone variable in tmp register |
| emit(BPF.ALU64 + BPF.MOV + BPF.X, tmp_reg, src_reg, 0, 0) |
| vderef(tmp_reg, tmp_reg, V[a]) |
| src_reg = tmp_reg -- Materialize and dereference it |
| -- Source is a value on stack, we must load it first |
| elseif type(V[a].const) == 'table' and V[a].const.__base > 0 then |
| src_reg = vreg(a) |
| emit(BPF.MEM + BPF.LDX + const_width[w], src_reg, 10, -V[a].const.__base, 0) |
| V[a].type = V[a].const.__dissector |
| V[a].const = nil -- Value is dereferenced |
| -- If source is an imm32 number, avoid register load |
| elseif type(V[a].const) == 'number' and w < 8 then |
| return nil, V[a].const |
| -- Load variable from any other source |
| else |
| src_reg = vreg(a) |
| end |
| |
| return src_reg, nil |
| end |
| |
| -- Emit compensation code at the end of basic block to unify variable set layout on all block exits |
| -- 1. we need to free registers by spilling |
| -- 2. fill registers to match other exits from this BB |
| local function bb_end(Vcomp) |
| for i,v in pairs(V) do |
| if Vcomp[i] and Vcomp[i].spill and not v.spill then |
| -- Materialize constant or shadowing variable to be able to spill |
| if not v.reg and (v.shadow or cdef.isimmconst(v)) then |
| vreg(i) |
| end |
| reg_spill(i) |
| end |
| end |
| for i,v in pairs(V) do |
| if Vcomp[i] and Vcomp[i].reg and not v.reg then |
| vreg(i, Vcomp[i].reg) |
| end |
| -- Compensate variable metadata change |
| if Vcomp[i] and Vcomp[i].source then |
| V[i].source = Vcomp[i].source |
| end |
| end |
| end |
| |
| local function CMP_STR(a, b, op) |
| assert(op == 'JEQ' or op == 'JNE', 'NYI: only equivallence stack/string only supports == or ~=') |
| -- I have no better idea how to implement it than unrolled XOR loop, as we can fixup only one JMP |
| -- So: X(a,b) = a[0] ^ b[0] | a[1] ^ b[1] | ... |
| -- EQ(a,b) <=> X == 0 |
| -- This could be optimised by placing early exits by rewriter in second phase for long strings |
| local base, size = V[a].const.__base, math.min(#b, ffi.sizeof(V[a].type)) |
| local acc, tmp = reg_alloc(stackslots, 0), reg_alloc(stackslots+1, 1) |
| local sp = 0 |
| emit(BPF.ALU64 + BPF.MOV + BPF.K, acc, 0, 0, 0) |
| while sp < size do |
| -- Load string chunk as imm32 |
| local as_u32 = ffi.new('uint32_t [1]') |
| local sub = b:sub(sp+1, sp+ffi.sizeof(as_u32)) |
| ffi.copy(as_u32, sub, #sub) |
| -- TODO: make this faster by interleaved load/compare steps with DW length |
| emit(BPF.MEM + BPF.LDX + BPF.W, tmp, 10, -(base-sp), 0) |
| emit(BPF.ALU64 + BPF.XOR + BPF.K, tmp, 0, 0, as_u32[0]) |
| emit(BPF.ALU64 + BPF.OR + BPF.X, acc, tmp, 0, 0) |
| sp = sp + ffi.sizeof(as_u32) |
| end |
| emit(BPF.JMP + BPF[op] + BPF.K, acc, 0, 0xffff, 0) |
| code.seen_cmp = code.pc-1 |
| end |
| |
| local function CMP_REG(a, b, op) |
| -- Fold compile-time expressions |
| if V[a].const and V[b].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then |
| code.seen_cmp = const_expr[op](V[a].const, V[b].const) and ALWAYS or NEVER |
| else |
| -- Comparison against compile-time string or stack memory |
| if V[b].const and type(V[b].const) == 'string' then |
| return CMP_STR(a, V[b].const, op) |
| end |
| -- The 0xFFFF target here has no significance, it's just a placeholder for |
| -- compiler to replace it's absolute offset to LJ bytecode insn with a relative |
| -- offset in BPF program code, verifier will accept only programs with valid JMP targets |
| local a_reg, b_reg = vreg(a), vreg(b) |
| emit(BPF.JMP + BPF[op] + BPF.X, a_reg, b_reg, 0xffff, 0) |
| code.seen_cmp = code.pc-1 |
| end |
| end |
| |
| local function CMP_IMM(a, b, op) |
| local c = V[a].const |
| if c and not is_proxy(c) then -- Fold compile-time expressions |
| code.seen_cmp = const_expr[op](c, b) and ALWAYS or NEVER |
| else |
| -- Convert imm32 to number |
| if type(b) == 'string' then |
| if #b == 1 then b = b:byte() |
| elseif cdef.isptr(V[a].type) then |
| -- String comparison between stack/constant string |
| return CMP_STR(a, b, op) |
| elseif #b <= 4 then |
| -- Convert to u32 with network byte order |
| local imm = ffi.new('uint32_t[1]') |
| ffi.copy(imm, b, #b) |
| b = builtins.hton(imm[0]) |
| else error('NYI: compare register with string, where #string > sizeof(u32)') end |
| end |
| -- The 0xFFFF target here has no significance, it's just a placeholder for |
| -- compiler to replace it's absolute offset to LJ bytecode insn with a relative |
| -- offset in BPF program code, verifier will accept only programs with valid JMP targets |
| local reg = vreg(a) |
| emit(BPF.JMP + BPF[op] + BPF.K, reg, 0, 0xffff, b) |
| code.seen_cmp = code.pc-1 |
| -- Remember NULL pointer checks as BPF prohibits pointer comparisons |
| -- and repeated checks wouldn't pass the verifier, only comparisons |
| -- against constants are checked. |
| if op == 'JEQ' and tonumber(b) == 0 and V[a].source then |
| local pos = V[a].source:find('_or_null', 1, true) |
| if pos then |
| code.seen_null_guard = a |
| end |
| -- Inverse NULL pointer check (if a ~= nil) |
| elseif op == 'JNE' and tonumber(b) == 0 and V[a].source then |
| local pos = V[a].source:find('_or_null', 1, true) |
| if pos then |
| code.seen_null_guard = a |
| code.seen_null_guard_inverse = true |
| end |
| end |
| end |
| end |
| |
| local function ALU_IMM(dst, a, b, op) |
| -- Fold compile-time expressions |
| if V[a].const and not is_proxy(V[a].const) then |
| assert(cdef.isimmconst(V[a]), 'VAR '..a..' must be numeric') |
| vset(dst, nil, const_expr[op](V[a].const, b)) |
| -- Now we need to materialize dissected value at DST, and add it |
| else |
| vcopy(dst, a) |
| local dst_reg = vreg(dst) |
| if cdef.isptr(V[a].type) then |
| vderef(dst_reg, dst_reg, V[a]) |
| V[dst].type = V[a].const.__dissector |
| else |
| V[dst].type = V[a].type |
| end |
| emit(BPF.ALU64 + BPF[op] + BPF.K, dst_reg, 0, 0, b) |
| end |
| end |
| |
| local function ALU_REG(dst, a, b, op) |
| -- Fold compile-time expressions |
| if V[a].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then |
| assert(cdef.isimmconst(V[a]), 'VAR '..a..' must be numeric') |
| assert(cdef.isimmconst(V[b]), 'VAR '..b..' must be numeric') |
| if type(op) == 'string' then op = const_expr[op] end |
| vcopy(dst, a) |
| V[dst].const = op(V[a].const, V[b].const) |
| else |
| local src_reg = b and vreg(b) or 0 -- SRC is optional for unary operations |
| if b and cdef.isptr(V[b].type) then |
| -- We have to allocate a temporary register for dereferencing to preserve |
| -- pointer in source variable that MUST NOT be altered |
| reg_alloc(stackslots, 2) |
| vderef(2, src_reg, V[b]) |
| src_reg = 2 |
| end |
| vcopy(dst, a) -- DST may alias B, so copy must occur after we materialize B |
| local dst_reg = vreg(dst) |
| if cdef.isptr(V[a].type) then |
| vderef(dst_reg, dst_reg, V[a]) |
| V[dst].type = V[a].const.__dissector |
| end |
| emit(BPF.ALU64 + BPF[op] + BPF.X, dst_reg, src_reg, 0, 0) |
| V[stackslots].reg = nil -- Free temporary registers |
| end |
| end |
| |
| local function ALU_IMM_NV(dst, a, b, op) |
| -- Do DST = IMM(a) op VAR(b) where we can't invert because |
| -- the registers are u64 but immediates are u32, so complement |
| -- arithmetics wouldn't work |
| vset(stackslots+1, nil, a) |
| ALU_REG(dst, stackslots+1, b, op) |
| end |
| |
| local function LD_ABS(dst, w, off) |
| assert(off, 'LD_ABS called without offset') |
| if w < 8 then |
| local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0 |
| emit(BPF.LD + BPF.ABS + const_width[w], dst_reg, 0, 0, off) |
| if w > 1 and ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse |
| emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, w * 8) |
| end |
| elseif w == 8 then |
| -- LD_ABS|IND prohibits DW, we need to do two W loads and combine them |
| local tmp_reg = vreg(stackslots, 0, true, builtins.width_type(w)) -- Reserve R0 |
| emit(BPF.LD + BPF.ABS + const_width[4], tmp_reg, 0, 0, off + 4) |
| if ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse |
| emit(BPF.ALU + BPF.END + BPF.TO_BE, tmp_reg, 0, 0, 32) |
| end |
| ALU_IMM(stackslots, stackslots, 32, 'LSH') |
| local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0, spill tmp variable |
| emit(BPF.LD + BPF.ABS + const_width[4], dst_reg, 0, 0, off) |
| if ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse |
| emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, 32) |
| end |
| ALU_REG(dst, dst, stackslots, 'OR') |
| V[stackslots].reg = nil -- Free temporary registers |
| else |
| assert(w < 8, 'NYI: only LD_ABS of 1/2/4/8 is supported') |
| end |
| end |
| |
| local function LD_IND(dst, src, w, off) |
| local src_reg = vreg(src) -- Must materialize first in case dst == src |
| local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0 |
| emit(BPF.LD + BPF.IND + const_width[w], dst_reg, src_reg, 0, off or 0) |
| if w > 1 and ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse |
| emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, w * 8) |
| end |
| end |
| |
| local function LD_MEM(dst, src, w, off) |
| local src_reg = vreg(src) -- Must materialize first in case dst == src |
| local dst_reg = vreg(dst, nil, true, builtins.width_type(w)) -- Reserve R0 |
| emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, src_reg, off or 0, 0) |
| end |
| |
| -- @note: This is specific now as it expects registers reserved |
| local function LD_IMM_X(dst_reg, src_type, imm, w) |
| if w == 8 then -- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32)) |
| emit(BPF.LD + const_width[w], dst_reg, src_type, 0, ffi.cast('uint32_t', imm)) |
| -- Must shift in two steps as bit.lshift supports [0..31] |
| emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.lshift(bit.lshift(imm, 16), 16))) |
| else |
| emit(BPF.LD + const_width[w], dst_reg, src_type, 0, imm) |
| end |
| end |
| |
| local function BUILTIN(func, ...) |
| local builtin_export = { |
| -- Compiler primitives (work with variable slots, emit instructions) |
| V=V, vreg=vreg, vset=vset, vcopy=vcopy, vderef=vderef, valloc=valloc, emit=emit, |
| reg_alloc=reg_alloc, reg_spill=reg_spill, tmpvar=stackslots, const_width=const_width, |
| -- Extensions and helpers (use with care) |
| LD_IMM_X = LD_IMM_X, |
| } |
| func(builtin_export, ...) |
| end |
| |
| local function LOAD(dst, src, off, vtype) |
| local base = V[src].const |
| assert(base and base.__dissector, 'NYI: load() on variable that doesn\'t have dissector') |
| assert(V[src].source, 'NYI: load() on variable with unknown source') |
| -- Cast to different type if requested |
| vtype = vtype or base.__dissector |
| local w = ffi.sizeof(vtype) |
| assert(const_width[w], 'NYI: load() supports 1/2/4/8 bytes at a time only, wanted ' .. tostring(w)) |
| -- Packet access with a dissector (use BPF_LD) |
| if V[src].source:find('ptr_to_pkt', 1, true) then |
| if base.off then -- Absolute address to payload |
| LD_ABS(dst, w, off + base.off) |
| else -- Indirect address to payload |
| LD_IND(dst, src, w, off) |
| end |
| -- Direct access to first argument (skb fields, pt regs, ...) |
| elseif V[src].source:find('ptr_to_ctx', 1, true) then |
| LD_MEM(dst, src, w, off) |
| -- Direct skb access with a dissector (use BPF_MEM) |
| elseif V[src].source:find('ptr_to_skb', 1, true) then |
| LD_MEM(dst, src, w, off) |
| -- Pointer to map-backed memory (use BPF_MEM) |
| elseif V[src].source:find('ptr_to_map_value', 1, true) then |
| LD_MEM(dst, src, w, off) |
| -- Indirect read using probe (uprobe or kprobe, uses helper) |
| elseif V[src].source:find('ptr_to_probe', 1, true) then |
| BUILTIN(builtins[builtins.probe_read], nil, dst, src, vtype, off) |
| V[dst].source = V[src].source -- Builtin handles everything |
| else |
| error('NYI: load() on variable from ' .. V[src].source) |
| end |
| V[dst].type = vtype |
| V[dst].const = nil -- Dissected value is not constant anymore |
| end |
| |
| local function CALL(a, b, d) |
| assert(b-1 <= 1, 'NYI: CALL with >1 return values') |
| -- Perform either compile-time, helper, or builtin |
| local func = V[a].const |
| -- Gather all arguments and check if they're constant |
| local args, const, nargs = {}, true, d - 1 |
| for i = a+1, a+d-1 do |
| table.insert(args, V[i].const) |
| if not V[i].const or is_proxy(V[i].const) then const = false end |
| end |
| local builtin = builtins[func] |
| if not const or nargs == 0 then |
| if builtin and type(builtin) == 'function' then |
| args = {a} |
| for i = a+1, a+nargs do table.insert(args, i) end |
| BUILTIN(builtin, unpack(args)) |
| elseif V[a+2] and V[a+2].const then -- var OP imm |
| ALU_IMM(a, a+1, V[a+2].const, builtin) |
| elseif nargs <= 2 then -- var OP var |
| ALU_REG(a, a+1, V[a+2] and a+2, builtin) |
| else |
| error('NYI: CALL non-builtin with 3 or more arguments') |
| end |
| -- Call on dissector implies slice retrieval |
| elseif type(func) == 'table' and func.__dissector then |
| assert(nargs >= 2, 'NYI: <dissector>.slice(a, b) must have at least two arguments') |
| assert(V[a+1].const and V[a+2].const, 'NYI: slice() arguments must be constant') |
| local off = V[a+1].const |
| local vtype = builtins.width_type(V[a+2].const - off) |
| -- Access to packet via packet (use BPF_LD) |
| if V[a].source and V[a].source:find('ptr_to_', 1, true) then |
| LOAD(a, a, off, vtype) |
| else |
| error('NYI: <dissector>.slice(a, b) on non-pointer memory ' .. (V[a].source or 'unknown')) |
| end |
| -- Strict builtins cannot be expanded on compile-time |
| elseif builtins_strict[func] and builtin then |
| args = {a} |
| for i = a+1, a+nargs do table.insert(args, i) end |
| BUILTIN(builtin, unpack(args)) |
| -- Attempt compile-time call expansion (expects all argument compile-time known) |
| else |
| assert(const, 'NYI: CALL attempted on constant arguments, but at least one argument is not constant') |
| V[a].const = func(unpack(args)) |
| end |
| end |
| |
| local function MAP_INIT(map_var, key, imm) |
| local map = V[map_var].const |
| vreg(map_var, 1, true, ffi.typeof('uint64_t')) |
| -- Reserve R1 and load ptr for process-local map fd |
| LD_IMM_X(1, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof(V[map_var].type)) |
| V[map_var].reg = nil -- R1 will be invalidated after CALL, forget register allocation |
| -- Reserve R2 and load R2 = key pointer |
| local key_size = ffi.sizeof(map.key_type) |
| local w = const_width[key_size] or BPF.DW |
| local pod_type = const_width[key_size] |
| local sp = stack_top + key_size -- Must use stack below spill slots |
| -- Store immediate value on stack |
| reg_alloc(stackslots, 2) -- Spill anything in R2 (unnamed tmp variable) |
| local key_base = key and V[key].const |
| imm = imm or key_base |
| if imm and (not key or not is_proxy(key_base)) then |
| assert(pod_type, 'NYI: map[const K], K width must be 1/2/4/8') |
| emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, imm) |
| -- Key is in register, spill it |
| elseif V[key].reg and pod_type then |
| if cdef.isptr(V[key].type) then |
| -- There is already pointer in register, dereference before spilling |
| emit(BPF.MEM + BPF.LDX + w, 2, V[key].reg, 0, 0) |
| emit(BPF.MEM + BPF.STX + w, 10, 2, -sp, 0) |
| else -- Variable in register is POD, spill it on the stack |
| emit(BPF.MEM + BPF.STX + w, 10, V[key].reg, -sp, 0) |
| end |
| -- Key is spilled from register to stack |
| elseif V[key].spill then |
| sp = V[key].spill |
| -- Key is already on stack, write to base-relative address |
| elseif key_base.__base then |
| assert(key_size == ffi.sizeof(V[key].type), 'VAR '..key..' type incompatible with BPF map key type') |
| sp = key_base.__base |
| else |
| error('VAR '..key..' is neither const-expr/register/stack/spilled') |
| end |
| -- If [FP+K] addressing, emit it |
| if sp then |
| emit(BPF.ALU64 + BPF.MOV + BPF.X, 2, 10, 0, 0) |
| emit(BPF.ALU64 + BPF.ADD + BPF.K, 2, 0, 0, -sp) |
| end |
| end |
| |
| local function MAP_GET(dst, map_var, key, imm) |
| local map = V[map_var].const |
| MAP_INIT(map_var, key, imm) |
| -- Flag as pointer type and associate dissector for map value type |
| vreg(dst, 0, true, ffi.typeof('uint8_t *')) |
| V[dst].const = {__dissector=map.val_type} |
| V[dst].source = 'ptr_to_map_value_or_null' |
| emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_lookup_elem) |
| V[stackslots].reg = nil -- Free temporary registers |
| end |
| |
| local function MAP_DEL(map_var, key, key_imm) |
| -- Set R0, R1 (map fd, preempt R0) |
| reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable) |
| MAP_INIT(map_var, key, key_imm) |
| emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_delete_elem) |
| V[stackslots].reg = nil -- Free temporary registers |
| end |
| |
| local function MAP_SET(map_var, key, key_imm, src) |
| local map = V[map_var].const |
| -- Delete when setting nil |
| if V[src].type == ffi.typeof('void') then |
| return MAP_DEL(map_var, key, key_imm) |
| end |
| -- Set R0, R1 (map fd, preempt R0) |
| reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable) |
| MAP_INIT(map_var, key, key_imm) |
| reg_alloc(stackslots, 4) -- Spill anything in R4 (unnamed tmp variable) |
| emit(BPF.ALU64 + BPF.MOV + BPF.K, 4, 0, 0, 0) -- BPF_ANY, create new element or update existing |
| -- Reserve R3 for value pointer |
| reg_alloc(stackslots, 3) -- Spill anything in R3 (unnamed tmp variable) |
| local val_size = ffi.sizeof(map.val_type) |
| local w = const_width[val_size] or BPF.DW |
| local pod_type = const_width[val_size] |
| -- Stack pointer must be aligned to both key/value size and have enough headroom for (key, value) |
| local sp = stack_top + ffi.sizeof(map.key_type) + val_size |
| sp = sp + (sp % val_size) |
| local base = V[src].const |
| if base and not is_proxy(base) then |
| assert(pod_type, 'NYI: MAP[K] = imm V; V width must be 1/2/4/8') |
| emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, base) |
| -- Value is in register, spill it |
| elseif V[src].reg and pod_type then |
| -- Value is a pointer, derefernce it and spill it |
| if cdef.isptr(V[src].type) then |
| vderef(3, V[src].reg, V[src]) |
| emit(BPF.MEM + BPF.STX + w, 10, 3, -sp, 0) |
| else |
| emit(BPF.MEM + BPF.STX + w, 10, V[src].reg, -sp, 0) |
| end |
| -- We get a pointer to spilled register on stack |
| elseif V[src].spill then |
| -- If variable is a pointer, we can load it to R3 directly (save "LEA") |
| if cdef.isptr(V[src].type) then |
| reg_fill(src, 3) |
| -- If variable is a stack pointer, we don't have to check it |
| if base.__base then |
| emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem) |
| return |
| end |
| vderef(3, V[src].reg, V[src]) |
| emit(BPF.MEM + BPF.STX + w, 10, 3, -sp, 0) |
| else |
| sp = V[src].spill |
| end |
| -- Value is already on stack, write to base-relative address |
| elseif base.__base then |
| if val_size ~= ffi.sizeof(V[src].type) then |
| local err = string.format('VAR %d type (%s) incompatible with BPF map value type (%s): expected %d, got %d', |
| src, V[src].type, map.val_type, val_size, ffi.sizeof(V[src].type)) |
| error(err) |
| end |
| sp = base.__base |
| -- Value is constant, materialize it on stack |
| else |
| error('VAR '.. src ..' is neither const-expr/register/stack/spilled') |
| end |
| emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, 10, 0, 0) |
| emit(BPF.ALU64 + BPF.ADD + BPF.K, 3, 0, 0, -sp) |
| emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem) |
| V[stackslots].reg = nil -- Free temporary registers |
| end |
| |
| -- Finally - this table translates LuaJIT bytecode into code emitter actions. |
| local BC = { |
| -- Constants |
| KNUM = function(a, _, c, _) -- KNUM |
| if c < 2147483648 then |
| vset(a, nil, c, ffi.typeof('int32_t')) |
| else |
| vset(a, nil, c, ffi.typeof('uint64_t')) |
| end |
| end, |
| KSHORT = function(a, _, _, d) -- KSHORT |
| vset(a, nil, d, ffi.typeof('int16_t')) |
| end, |
| KCDATA = function(a, _, c, _) -- KCDATA |
| -- Coerce numeric types if possible |
| local ct = ffi.typeof(c) |
| if ffi.istype(ct, ffi.typeof('uint64_t')) or ffi.istype(ct, ffi.typeof('int64_t')) then |
| vset(a, nil, c, ct) |
| elseif tonumber(c) ~= nil then |
| -- TODO: this should not be possible |
| vset(a, nil, tonumber(c), ct) |
| else |
| error('NYI: cannot use CDATA constant of type ' .. ct) |
| end |
| end, |
| KPRI = function(a, _, _, d) -- KPRI |
| -- KNIL is 0, must create a special type to identify it |
| local vtype = (d < 1) and ffi.typeof('void') or ffi.typeof('uint8_t') |
| vset(a, nil, (d < 2) and 0 or 1, vtype) |
| end, |
| KSTR = function(a, _, c, _) -- KSTR |
| vset(a, nil, c, ffi.typeof('const char[?]')) |
| end, |
| MOV = function(a, _, _, d) -- MOV var, var |
| vcopy(a, d) |
| end, |
| |
| -- Comparison ops |
| -- Note: comparisons are always followed by JMP opcode, that |
| -- will fuse following JMP to JMP+CMP instruction in BPF |
| -- Note: we're narrowed to integers, so operand/operator inversion is legit |
| ISLT = function(a, _, _, d) return CMP_REG(d, a, 'JGE') end, -- (a < d) (inverted) |
| ISGE = function(a, _, _, d) return CMP_REG(a, d, 'JGE') end, -- (a >= d) |
| ISGT = function(a, _, _, d) return CMP_REG(a, d, 'JGT') end, -- (a > d) |
| ISEQV = function(a, _, _, d) return CMP_REG(a, d, 'JEQ') end, -- (a == d) |
| ISNEV = function(a, _, _, d) return CMP_REG(a, d, 'JNE') end, -- (a ~= d) |
| ISEQS = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == str(c)) |
| ISNES = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= str(c)) |
| ISEQN = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == c) |
| ISNEN = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= c) |
| IST = function(_, _, _, d) return CMP_IMM(d, 0, 'JNE') end, -- (d) |
| ISF = function(_, _, _, d) return CMP_IMM(d, 0, 'JEQ') end, -- (not d) |
| ISEQP = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- ISEQP (a == c) |
| -- Binary operations with RHS constants |
| ADDVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end, |
| SUBVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'SUB') end, |
| MULVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end, |
| DIVVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'DIV') end, |
| MODVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MOD') end, |
| -- Binary operations with LHS constants |
| -- Cheat code: we're narrowed to integer arithmetic, so MUL+ADD are commutative |
| ADDNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end, -- ADDNV |
| MULNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end, -- MULNV |
| SUBNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'SUB') end, -- SUBNV |
| DIVNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'DIV') end, -- DIVNV |
| -- Binary operations between registers |
| ADDVV = function(a, b, _, d) return ALU_REG(a, b, d, 'ADD') end, |
| SUBVV = function(a, b, _, d) return ALU_REG(a, b, d, 'SUB') end, |
| MULVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MUL') end, |
| DIVVV = function(a, b, _, d) return ALU_REG(a, b, d, 'DIV') end, |
| MODVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MOD') end, |
| -- Strings |
| CAT = function(a, b, _, d) -- CAT A = B ~ D |
| assert(V[b].const and V[d].const, 'NYI: CAT only works on compile-time expressions') |
| assert(type(V[b].const) == 'string' and type(V[d].const) == 'string', |
| 'NYI: CAT only works on compile-time strings') |
| vset(a, nil, V[b].const .. V[d].const) |
| end, |
| -- Tables |
| GGET = function (a, _, c, _) -- GGET (A = GLOBAL[c]) |
| if env[c] ~= nil then |
| vset(a, nil, env[c]) |
| else error(string.format("undefined global '%s'", c)) end |
| end, |
| UGET = function (a, _, c, _) -- UGET (A = UPVALUE[c]) |
| if env[c] ~= nil then |
| vset(a, nil, env[c]) |
| else error(string.format("undefined upvalue '%s'", c)) end |
| end, |
| TSETB = function (a, b, _, d) -- TSETB (B[D] = A) |
| assert(V[b] and type(V[b].const) == 'table', 'NYI: B[D] where B is not Lua table, BPF map, or pointer') |
| local vinfo = V[b].const |
| if vinfo.__map then -- BPF map read (constant) |
| return MAP_SET(b, nil, d, a) -- D is literal |
| elseif vinfo.__dissector then |
| assert(vinfo.__dissector, 'NYI: B[D] where B does not have a known element size') |
| local w = ffi.sizeof(vinfo.__dissector) |
| -- TODO: support vectorized moves larger than register width |
| assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8') |
| local src_reg, const = vscalar(a, w) |
| -- If changing map value, write to absolute address + offset |
| if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then |
| local dst_reg = vreg(b) |
| -- Optimization: immediate values (imm32) can be stored directly |
| if type(const) == 'number' then |
| emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, d, const) |
| else |
| emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, d, 0) |
| end |
| -- Table is already on stack, write to vinfo-relative address |
| elseif vinfo.__base then |
| -- Optimization: immediate values (imm32) can be stored directly |
| if type(const) == 'number' then |
| emit(BPF.MEM + BPF.ST + const_width[w], 10, 0, -vinfo.__base + (d * w), const) |
| else |
| emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -vinfo.__base + (d * w), 0) |
| end |
| else |
| error('NYI: B[D] where B is not Lua table, BPF map, or pointer') |
| end |
| elseif vinfo and vinfo and V[a].const then |
| vinfo[V[d].const] = V[a].const |
| else |
| error('NYI: B[D] where B is not Lua table, BPF map, or pointer') |
| end |
| end, |
| TSETV = function (a, b, _, d) -- TSETV (B[D] = A) |
| assert(V[b] and type(V[b].const) == 'table', 'NYI: B[D] where B is not Lua table, BPF map, or pointer') |
| local vinfo = V[b].const |
| if vinfo.__map then -- BPF map read (constant) |
| return MAP_SET(b, d, nil, a) -- D is variable |
| elseif vinfo.__dissector then |
| assert(vinfo.__dissector, 'NYI: B[D] where B does not have a known element size') |
| local w = ffi.sizeof(vinfo.__dissector) |
| -- TODO: support vectorized moves larger than register width |
| assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8') |
| local src_reg, const = vscalar(a, w) |
| -- If changing map value, write to absolute address + offset |
| if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then |
| -- Calculate variable address from two registers |
| local tmp_var = stackslots + 1 |
| vset(tmp_var, nil, d) |
| ALU_REG(tmp_var, tmp_var, b, 'ADD') |
| local dst_reg = vreg(tmp_var) |
| V[tmp_var].reg = nil -- Only temporary allocation |
| -- Optimization: immediate values (imm32) can be stored directly |
| if type(const) == 'number' and w < 8 then |
| emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, 0, const) |
| else |
| emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, 0, 0) |
| end |
| -- Table is already on stack, write to vinfo-relative address |
| elseif vinfo.__base then |
| -- Calculate variable address from two registers |
| local tmp_var = stackslots + 1 |
| vcopy(tmp_var, d) -- Element position |
| if w > 1 then |
| ALU_IMM(tmp_var, tmp_var, w, 'MUL') -- multiply by element size |
| end |
| local dst_reg = vreg(tmp_var) -- add R10 (stack pointer) |
| emit(BPF.ALU64 + BPF.ADD + BPF.X, dst_reg, 10, 0, 0) |
| V[tmp_var].reg = nil -- Only temporary allocation |
| -- Optimization: immediate values (imm32) can be stored directly |
| if type(const) == 'number' and w < 8 then |
| emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, -vinfo.__base, const) |
| else |
| emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, -vinfo.__base, 0) |
| end |
| else |
| error('NYI: B[D] where B is not Lua table, BPF map, or pointer') |
| end |
| elseif vinfo and V[d].const and V[a].const then |
| vinfo[V[d].const] = V[a].const |
| else |
| error('NYI: B[D] where B is not Lua table, BPF map, or pointer') |
| end |
| end, |
| TSETS = function (a, b, c, _) -- TSETS (B[C] = A) |
| assert(V[b] and V[b].const, 'NYI: B[D] where B is not Lua table, BPF map, or pointer') |
| local base = V[b].const |
| if base.__dissector then |
| local ofs,bpos = ffi.offsetof(base.__dissector, c) |
| assert(not bpos, 'NYI: B[C] = A, where C is a bitfield') |
| local w = builtins.sizeofattr(base.__dissector, c) |
| -- TODO: support vectorized moves larger than register width |
| assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8') |
| local src_reg, const = vscalar(a, w) |
| -- If changing map value, write to absolute address + offset |
| if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then |
| local dst_reg = vreg(b) |
| -- Optimization: immediate values (imm32) can be stored directly |
| if type(const) == 'number' and w < 8 then |
| emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, ofs, const) |
| else |
| emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, ofs, 0) |
| end |
| -- Table is already on stack, write to base-relative address |
| elseif base.__base then |
| -- Optimization: immediate values (imm32) can be stored directly |
| if type(const) == 'number' and w < 8 then |
| emit(BPF.MEM + BPF.ST + const_width[w], 10, 0, -base.__base + ofs, const) |
| else |
| emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -base.__base + ofs, 0) |
| end |
| else |
| error('NYI: B[C] where B is not Lua table, BPF map, or pointer') |
| end |
| elseif V[a].const then |
| base[c] = V[a].const |
| else |
| error('NYI: B[C] where B is not Lua table, BPF map, or pointer') |
| end |
| end, |
| TGETB = function (a, b, _, d) -- TGETB (A = B[D]) |
| local base = V[b].const |
| assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map') |
| if a ~= b then vset(a) end |
| if base.__map then -- BPF map read (constant) |
| MAP_GET(a, b, nil, d) |
| -- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM) |
| elseif V[b].source and V[b].source:find('ptr_to_') then |
| local vtype = base.__dissector and base.__dissector or ffi.typeof('uint8_t') |
| LOAD(a, b, d, vtype) |
| -- Specialise PTR[0] as dereference operator |
| elseif cdef.isptr(V[b].type) and d == 0 then |
| vcopy(a, b) |
| local dst_reg = vreg(a) |
| vderef(dst_reg, dst_reg, V[a]) |
| V[a].type = V[a].const.__dissector |
| else |
| error('NYI: A = B[D], where B is not Lua table or packet dissector or pointer dereference') |
| end |
| end, |
| TGETV = function (a, b, _, d) -- TGETV (A = B[D]) |
| local base = V[b].const |
| assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map') |
| if a ~= b then vset(a) end |
| if base.__map then -- BPF map read |
| MAP_GET(a, b, d) |
| -- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM) |
| elseif V[b].source and V[b].source:find('ptr_to_') then |
| local vtype = base.__dissector and base.__dissector or ffi.typeof('uint8_t') |
| LOAD(a, b, d, vtype) |
| -- Constant dereference |
| elseif type(V[d].const) == 'number' then |
| V[a].const = base[V[d].const] |
| else |
| error('NYI: A = B[D], where B is not Lua table or packet dissector or pointer dereference') |
| end |
| end, |
| TGETS = function (a, b, c, _) -- TGETS (A = B[C]) |
| local base = V[b].const |
| assert(type(base) == 'table', 'NYI: B[C] where C is string and B not Lua table or BPF map') |
| if a ~= b then vset(a) end |
| if base.__dissector then |
| local ofs,bpos,bsize = ffi.offsetof(base.__dissector, c) |
| -- Resolve table key using metatable |
| if not ofs and type(base.__dissector[c]) == 'string' then |
| c = base.__dissector[c] |
| ofs,bpos,bsize = ffi.offsetof(base.__dissector, c) |
| end |
| if not ofs and proto[c] then -- Load new dissector on given offset |
| BUILTIN(proto[c], a, b, c) |
| else |
| -- Loading register from offset is a little bit tricky as there are |
| -- several data sources and value loading modes with different restrictions |
| -- such as checking pointer values for NULL compared to using stack. |
| assert(ofs, tostring(base.__dissector)..'.'..c..' attribute not exists') |
| if a ~= b then vset(a) end |
| -- Dissected value is probably not constant anymore |
| local new_const = nil |
| local w, atype = builtins.sizeofattr(base.__dissector, c) |
| -- [SP+K] addressing using R10 (stack pointer) |
| -- Doesn't need to be checked for NULL |
| if base.__base and base.__base > 0 then |
| if cdef.isptr(atype) then -- If the member is pointer type, update base pointer with offset |
| new_const = {__base = base.__base-ofs} |
| else |
| local dst_reg = vreg(a, nil, true) |
| emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, 10, -base.__base+ofs, 0) |
| end |
| -- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM) |
| elseif V[b].source and V[b].source:find('ptr_to_') then |
| LOAD(a, b, ofs, atype) |
| else |
| error('NYI: B[C] where B is not Lua table, BPF map, or pointer') |
| end |
| -- Bitfield, must be further narrowed with a bitmask/shift |
| if bpos then |
| local mask = 0 |
| for i=bpos+1,bpos+bsize do |
| mask = bit.bor(mask, bit.lshift(1, w*8-i)) |
| end |
| emit(BPF.ALU64 + BPF.AND + BPF.K, vreg(a), 0, 0, mask) |
| -- Free optimization: single-bit values need just boolean result |
| if bsize > 1 then |
| local shift = w*8-bsize-bpos |
| if shift > 0 then |
| emit(BPF.ALU64 + BPF.RSH + BPF.K, vreg(a), 0, 0, shift) |
| end |
| end |
| end |
| V[a].type = atype |
| V[a].const = new_const |
| V[a].source = V[b].source |
| -- Track direct access to skb data |
| -- see https://www.kernel.org/doc/Documentation/networking/filter.txt "Direct packet access" |
| if ffi.istype(base.__dissector, ffi.typeof('struct sk_buff')) then |
| -- Direct access to skb uses skb->data and skb->data_end |
| -- which are encoded as u32, but are actually pointers |
| if c == 'data' or c == 'data_end' then |
| V[a].const = {__dissector = ffi.typeof('uint8_t')} |
| V[a].source = 'ptr_to_skb' |
| end |
| end |
| end |
| else |
| V[a].const = base[c] |
| end |
| end, |
| -- Loops and branches |
| CALLM = function (a, b, _, d) -- A = A(A+1, ..., A+D+MULTRES) |
| -- NYI: Support single result only |
| CALL(a, b, d+2) |
| end, |
| CALL = function (a, b, _, d) -- A = A(A+1, ..., A+D-1) |
| CALL(a, b, d) |
| end, |
| JMP = function (a, _, c, _) -- JMP |
| -- Discard unused slots after jump |
| for i, _ in pairs(V) do |
| if i >= a and i < stackslots then |
| V[i] = nil |
| end |
| end |
| -- Cross basic block boundary if the jump target isn't provably unreachable |
| local val = code.fixup[c] or {} |
| if code.seen_cmp and code.seen_cmp ~= ALWAYS then |
| if code.seen_cmp ~= NEVER then -- Do not emit the jump or fixup |
| -- Store previous CMP insn for reemitting after compensation code |
| local jmpi = ffi.new('struct bpf_insn', code.insn[code.pc-1]) |
| code.pc = code.pc - 1 |
| -- First branch point, emit compensation code |
| local Vcomp = Vstate[c] |
| if not Vcomp then |
| -- Select scratch register (R0-5) that isn't used as operand |
| -- in the CMP instruction, as the variable may not be live, after |
| -- the JMP, but it may be used in the JMP+CMP instruction itself |
| local tmp_reg = 0 |
| for reg = 0, 5 do |
| if reg ~= jmpi.dst_reg and reg ~= jmpi.src_reg then |
| tmp_reg = reg |
| break |
| end |
| end |
| -- Force materialization of constants at the end of BB |
| for i, v in pairs(V) do |
| if not v.reg and cdef.isimmconst(v) then |
| vreg(i, tmp_reg) -- Load to TMP register (not saved) |
| reg_spill(i) -- Spill caller-saved registers |
| end |
| end |
| -- Record variable state |
| Vstate[c] = V |
| Vcomp = V |
| V = table_copy(V) |
| -- Variable state already set, emit specific compensation code |
| else |
| bb_end(Vcomp) |
| end |
| -- Record pointer NULL check from condition |
| -- If the condition checks pointer variable against NULL, |
| -- we can assume it will not be NULL in the fall-through block |
| if code.seen_null_guard then |
| local var = code.seen_null_guard |
| -- The null guard can have two forms: |
| -- if x == nil then goto |
| -- if x ~= nil then goto |
| -- First form guarantees that the variable will be non-nil on the following instruction |
| -- Second form guarantees that the variable will be non-nil at the jump target |
| local vinfo = code.seen_null_guard_inverse and Vcomp[var] or V[var] |
| if vinfo.source then |
| local pos = vinfo.source:find('_or_null', 1, true) |
| if pos then |
| vinfo.source = vinfo.source:sub(1, pos - 1) |
| end |
| end |
| end |
| -- Reemit CMP insn |
| emit(jmpi.code, jmpi.dst_reg, jmpi.src_reg, jmpi.off, jmpi.imm) |
| -- Fuse JMP into previous CMP opcode, mark JMP target for fixup |
| -- as we don't knot the relative offset in generated code yet |
| table.insert(val, code.pc-1) |
| code.fixup[c] = val |
| end |
| code.seen_cmp = nil |
| code.seen_null_guard = nil |
| code.seen_null_guard_inverse = nil |
| elseif c == code.bc_pc + 1 then -- luacheck: ignore 542 |
| -- Eliminate jumps to next immediate instruction |
| -- e.g. 0002 JMP 1 => 0003 |
| else |
| -- We need to synthesise a condition that's always true, however |
| -- BPF prohibits pointer arithmetic to prevent pointer leaks |
| -- so we have to clear out one register and use it for cmp that's always true |
| local dst_reg = reg_alloc(stackslots) |
| V[stackslots].reg = nil -- Only temporary allocation |
| -- First branch point, emit compensation code |
| local Vcomp = Vstate[c] |
| if not Vcomp then |
| -- Force materialization of constants at the end of BB |
| for i, v in pairs(V) do |
| if not v.reg and cdef.isimmconst(v) then |
| vreg(i, dst_reg) -- Load to TMP register (not saved) |
| reg_spill(i) -- Spill caller-saved registers |
| end |
| end |
| -- Record variable state |
| Vstate[c] = V |
| V = table_copy(V) |
| -- Variable state already set, emit specific compensation code |
| else |
| bb_end(Vcomp) |
| end |
| emit(BPF.ALU64 + BPF.MOV + BPF.K, dst_reg, 0, 0, 0) |
| emit(BPF.JMP + BPF.JEQ + BPF.K, dst_reg, 0, 0xffff, 0) |
| table.insert(val, code.pc-1) -- Fixup JMP target |
| code.reachable = false -- Code following the JMP is not reachable |
| code.fixup[c] = val |
| end |
| end, |
| RET1 = function (a, _, _, _) -- RET1 |
| -- Free optimisation: spilled variable will not be filled again |
| for i, v in pairs(V) do |
| if i ~= a then v.reg = nil end |
| end |
| if V[a].reg ~= 0 then vreg(a, 0) end |
| -- Convenience: dereference pointer variables |
| -- e.g. 'return map[k]' will return actual map value, not pointer |
| if cdef.isptr(V[a].type) then |
| vderef(0, 0, V[a]) |
| end |
| emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0) |
| code.reachable = false |
| end, |
| RET0 = function (_, _, _, _) -- RET0 |
| emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0) |
| emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0) |
| code.reachable = false |
| end, |
| compile = function () |
| return code |
| end |
| } |
| |
| -- Composite instructions |
| function BC.CALLT(a, _, _, d) -- Tailcall: return A(A+1, ..., A+D-1) |
| CALL(a, 1, d) |
| BC.RET1(a) |
| end |
| |
| -- Always initialize R6 with R1 context |
| emit(BPF.ALU64 + BPF.MOV + BPF.X, 6, 1, 0, 0) |
| -- Register R6 as context variable (first argument) |
| if params and params > 0 then |
| vset(0, 6, param_types[1] or proto.skb) |
| assert(V[0].source == V[0].const.source) -- Propagate source annotation from typeinfo |
| end |
| -- Register tmpvars |
| vset(stackslots) |
| vset(stackslots+1) |
| return setmetatable(BC, { |
| __index = function (_, k, _) |
| if type(k) == 'number' then |
| local op_str = string.sub(require('jit.vmdef').bcnames, 6*k+1, 6*k+6) |
| error(string.format("NYI: opcode '0x%02x' (%-04s)", k, op_str)) |
| end |
| end, |
| __call = function (t, op, a, b, c, d) |
| code.bc_pc = code.bc_pc + 1 |
| -- Exitting BB straight through, emit compensation code |
| if Vstate[code.bc_pc] then |
| if code.reachable then |
| -- Instruction is reachable from previous line |
| -- so we must make the variable allocation consistent |
| -- with the variable allocation at the jump source |
| -- e.g. 0001 x:R0 = 5 |
| -- 0002 if rand() then goto 0005 |
| -- 0003 x:R0 -> x:stack |
| -- 0004 y:R0 = 5 |
| -- 0005 x:? = 10 <-- x was in R0 before jump, and stack after jump |
| bb_end(Vstate[code.bc_pc]) |
| else |
| -- Instruction isn't reachable from previous line, restore variable layout |
| -- e.g. RET or condition-less JMP on previous line |
| V = table_copy(Vstate[code.bc_pc]) |
| end |
| end |
| -- Perform fixup of jump targets |
| -- We need to do this because the number of consumed and emitted |
| -- bytecode instructions is different |
| local fixup = code.fixup[code.bc_pc] |
| if fixup ~= nil then |
| -- Patch JMP source insn with relative offset |
| for _,pc in ipairs(fixup) do |
| code.insn[pc].off = code.pc - 1 - pc |
| end |
| code.fixup[code.bc_pc] = nil |
| code.reachable = true |
| end |
| -- Execute |
| if code.reachable then |
| assert(t[op], string.format('NYI: instruction %s, parameters: %s,%s,%s,%s', op,a,b,c,d)) |
| return t[op](a, b, c, d) |
| end |
| end, |
| }) |
| end |
| |
| -- Emitted code dump |
| local function dump_mem(cls, ins, _, fuse) |
| -- This is a very dense MEM instruction decoder without much explanation |
| -- Refer to https://www.kernel.org/doc/Documentation/networking/filter.txt for instruction format |
| local mode = bit.band(ins.code, 0xe0) |
| if mode == BPF.XADD then cls = 5 end -- The only mode |
| local op_1 = {'LD', 'LDX', 'ST', 'STX', '', 'XADD'} |
| local op_2 = {[0]='W', [8]='H', [16]='B', [24]='DW'} |
| local name = op_1[cls+1] .. op_2[bit.band(ins.code, 0x18)] |
| local off = tonumber(ffi.cast('int16_t', ins.off)) -- Reinterpret as signed |
| local dst = cls < 2 and 'R'..ins.dst_reg or string.format('[R%d%+d]', ins.dst_reg, off) |
| local src = cls % 2 == 0 and '#'..ins.imm or 'R'..ins.src_reg |
| if cls == BPF.LDX then src = string.format('[R%d%+d]', ins.src_reg, off) end |
| if mode == BPF.ABS then src = string.format('skb[%d]', ins.imm) end |
| if mode == BPF.IND then src = string.format('skb[R%d%+d]', ins.src_reg, ins.imm) end |
| return string.format('%s\t%s\t%s', fuse and '' or name, fuse and '' or dst, src) |
| end |
| |
| local function dump_alu(cls, ins, pc) |
| local alu = {'ADD', 'SUB', 'MUL', 'DIV', 'OR', 'AND', 'LSH', 'RSH', 'NEG', 'MOD', 'XOR', 'MOV', 'ARSH', 'END' } |
| local jmp = {'JA', 'JEQ', 'JGT', 'JGE', 'JSET', 'JNE', 'JSGT', 'JSGE', 'CALL', 'EXIT'} |
| local helper = {'unspec', 'map_lookup_elem', 'map_update_elem', 'map_delete_elem', 'probe_read', 'ktime_get_ns', |
| 'trace_printk', 'get_prandom_u32', 'get_smp_processor_id', 'skb_store_bytes', |
| 'l3_csum_replace', 'l4_csum_replace', 'tail_call', 'clone_redirect', 'get_current_pid_tgid', |
| 'get_current_uid_gid', 'get_current_comm', 'get_cgroup_classid', 'skb_vlan_push', 'skb_vlan_pop', |
| 'skb_get_tunnel_key', 'skb_set_tunnel_key', 'perf_event_read', 'redirect', 'get_route_realm', |
| 'perf_event_output', 'skb_load_bytes'} |
| local op = 0 |
| -- This is a very dense ALU instruction decoder without much explanation |
| -- Refer to https://www.kernel.org/doc/Documentation/networking/filter.txt for instruction format |
| for i = 0,13 do if 0x10 * i == bit.band(ins.code, 0xf0) then op = i + 1 break end end |
| local name = (cls == 5) and jmp[op] or alu[op] |
| local src = (bit.band(ins.code, 0x08) == BPF.X) and 'R'..ins.src_reg or '#'..ins.imm |
| local target = (cls == 5 and op < 9) and string.format('\t=> %04d', pc + ins.off + 1) or '' |
| if cls == 5 and op == 9 then target = string.format('\t; %s', helper[ins.imm + 1] or tostring(ins.imm)) end |
| return string.format('%s\t%s\t%s%s', name, 'R'..ins.dst_reg, src, target) |
| end |
| |
| local function dump_string(code, off, hide_counter) |
| if not code then return end |
| local cls_map = { |
| [0] = dump_mem, [1] = dump_mem, [2] = dump_mem, [3] = dump_mem, |
| [4] = dump_alu, [5] = dump_alu, [7] = dump_alu, |
| } |
| local result = {} |
| local fused = false |
| for i = off or 0, code.pc - 1 do |
| local ins = code.insn[i] |
| local cls = bit.band(ins.code, 0x07) |
| local line = cls_map[cls](cls, ins, i, fused) |
| if hide_counter then |
| table.insert(result, line) |
| else |
| table.insert(result, string.format('%04u\t%s', i, line)) |
| end |
| fused = string.find(line, 'LDDW', 1) |
| end |
| return table.concat(result, '\n') |
| end |
| |
| local function dump(code) |
| if not code then return end |
| print(string.format('-- BPF %s:0-%u', code.insn, code.pc)) |
| print(dump_string(code)) |
| end |
| |
| local function compile(prog, params) |
| -- Create code emitter sandbox, include caller locals |
| local env = { pkt=proto.pkt, eth=proto.pkt, BPF=BPF, ffi=ffi } |
| -- Include upvalues up to 4 nested scopes back |
| -- the narrower scope overrides broader scope |
| for k = 5, 2, -1 do |
| local i = 1 |
| while true do |
| local ok, n, v = pcall(debug.getlocal, k, i) |
| if not ok or not n then break end |
| env[n] = v |
| i = i + 1 |
| end |
| end |
| setmetatable(env, { |
| __index = function (_, k) |
| return proto[k] or builtins[k] or _G[k] |
| end |
| }) |
| -- Create code emitter and compile LuaJIT bytecode |
| if type(prog) == 'string' then prog = loadstring(prog) end |
| -- Create error handler to print traceback |
| local funci, pc = bytecode.funcinfo(prog), 0 |
| local E = create_emitter(env, funci.stackslots, funci.params, params or {}) |
| local on_err = function (e) |
| funci = bytecode.funcinfo(prog, pc) |
| local from, to = 0, 0 |
| for _ = 1, funci.currentline do |
| from = to |
| to = string.find(funci.source, '\n', from+1, true) or 0 |
| end |
| print(funci.loc..':'..string.sub(funci.source, from+1, to-1)) |
| print('error: '..e) |
| print(debug.traceback()) |
| end |
| for _,op,a,b,c,d in bytecode.decoder(prog) do |
| local ok, _, err = xpcall(E,on_err,op,a,b,c,d) |
| if not ok then |
| return nil, err |
| end |
| end |
| return E:compile() |
| end |
| |
| -- BPF map interface |
| local bpf_map_mt = { |
| __gc = function (map) S.close(map.fd) end, |
| __len = function(map) return map.max_entries end, |
| __index = function (map, k) |
| if type(k) == 'string' then |
| -- Return iterator |
| if k == 'pairs' then |
| return function(t, key) |
| -- Get next key |
| local next_key = ffi.new(ffi.typeof(t.key)) |
| local cur_key |
| if key then |
| cur_key = t.key |
| t.key[0] = key |
| else |
| cur_key = ffi.new(ffi.typeof(t.key)) |
| end |
| local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_GET_NEXT_KEY, map.fd, cur_key, next_key) |
| if not ok then return nil, err end |
| -- Get next value |
| assert(S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, next_key, map.val)) |
| return next_key[0], map.val[0] |
| end, map, nil |
| -- Read for perf event map |
| elseif k == 'reader' then |
| return function (pmap, pid, cpu, event_type) |
| -- Caller must either specify PID or CPU |
| if not pid or pid < 0 then |
| assert((cpu and cpu >= 0), 'NYI: creating composed reader for all CPUs') |
| pid = -1 |
| end |
| -- Create BPF output reader |
| local pe = S.t.perf_event_attr1() |
| pe[0].type = 'software' |
| pe[0].config = 'sw_bpf_output' |
| pe[0].sample_type = 'raw' |
| pe[0].sample_period = 1 |
| pe[0].wakeup_events = 1 |
| local reader, err = S.t.perf_reader(S.perf_event_open(pe, pid, cpu or -1)) |
| if not reader then return nil, tostring(err) end |
| -- Register event reader fd in BPF map |
| assert(cpu < pmap.max_entries, string.format('BPF map smaller than read CPU %d', cpu)) |
| pmap[cpu] = reader.fd |
| -- Open memory map and start reading |
| local ok, err = reader:start() |
| assert(ok, tostring(err)) |
| ok, err = reader:mmap() |
| assert(ok, tostring(err)) |
| return cdef.event_reader(reader, event_type) |
| end |
| -- Signalise this is a map type |
| end |
| return k == '__map' |
| end |
| -- Retrieve key |
| map.key[0] = k |
| local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, map.key, map.val) |
| if not ok then return nil, err end |
| return ffi.new(map.val_type, map.val[0]) |
| end, |
| __newindex = function (map, k, v) |
| map.key[0] = k |
| if v == nil then |
| return S.bpf_map_op(map.fd, S.c.BPF_CMD.MAP_DELETE_ELEM, map.key, nil) |
| end |
| map.val[0] = v |
| return S.bpf_map_op(S.c.BPF_CMD.MAP_UPDATE_ELEM, map.fd, map.key, map.val) |
| end, |
| } |
| |
| -- Linux tracing interface |
| local function trace_check_enabled(path) |
| path = path or '/sys/kernel/debug/tracing' |
| if S.statfs(path) then return true end |
| return nil, 'debugfs not accessible: "mount -t debugfs nodev /sys/kernel/debug"? missing sudo?' |
| end |
| |
| -- Tracepoint interface |
| local tracepoint_mt = { |
| __index = { |
| bpf = function (t, prog) |
| if type(prog) ~= 'table' then |
| -- Create protocol parser with source probe |
| prog = compile(prog, {proto.type(t.type, {source='ptr_to_probe'})}) |
| end |
| -- Load the BPF program |
| local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.TRACEPOINT, prog.insn, prog.pc) |
| assert(prog_fd, tostring(err)..': '..tostring(log)) |
| -- Open tracepoint and attach |
| t.reader:setbpf(prog_fd:getfd()) |
| table.insert(t.progs, prog_fd) |
| return prog_fd |
| end, |
| } |
| } |
| -- Open tracepoint |
| local function tracepoint_open(path, pid, cpu, group_fd) |
| -- Open tracepoint and compile tracepoint type |
| local tp = assert(S.perf_tracepoint('/sys/kernel/debug/tracing/events/'..path)) |
| local tp_type = assert(cdef.tracepoint_type(path)) |
| -- Open tracepoint reader and create interface |
| local reader = assert(S.perf_attach_tracepoint(tp, pid, cpu, group_fd)) |
| return setmetatable({tp=tp,type=tp_type,reader=reader,progs={}}, tracepoint_mt) |
| end |
| |
| local function trace_bpf(ptype, pname, pdef, retprobe, prog, pid, cpu, group_fd) |
| -- Load BPF program |
| if type(prog) ~= 'table' then |
| prog = compile(prog, {proto.pt_regs}) |
| end |
| local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.KPROBE, prog.insn, prog.pc) |
| assert(prog_fd, tostring(err)..': '..tostring(log)) |
| -- Open tracepoint and attach |
| local tp, err = S.perf_probe(ptype, pname, pdef, retprobe) |
| if not tp then |
| prog_fd:close() |
| return nil, tostring(err) |
| end |
| local reader, err = S.perf_attach_tracepoint(tp, pid, cpu, group_fd, {sample_type='raw, callchain'}) |
| if not reader then |
| prog_fd:close() |
| S.perf_probe(ptype, pname, false) |
| return nil, tostring(err) |
| end |
| local ok, err = reader:setbpf(prog_fd:getfd()) |
| if not ok then |
| prog_fd:close() |
| reader:close() |
| S.perf_probe(ptype, pname, false) |
| return nil, tostring(err)..' (kernel version should be at least 4.1)' |
| end |
| -- Create GC closure for reader to close BPF program |
| -- and detach probe in correct order |
| ffi.gc(reader, function () |
| prog_fd:close() |
| reader:close() |
| S.perf_probe(ptype, pname, false) |
| end) |
| return {reader=reader, prog=prog_fd, probe=pname, probe_type=ptype} |
| end |
| |
| -- Module interface |
| return setmetatable({ |
| new = create_emitter, |
| dump = dump, |
| dump_string = dump_string, |
| maps = {}, |
| map = function (type, max_entries, key_ctype, val_ctype) |
| if not key_ctype then key_ctype = ffi.typeof('uint32_t') end |
| if not val_ctype then val_ctype = ffi.typeof('uint32_t') end |
| if not max_entries then max_entries = 4096 end |
| -- Special case for BPF_MAP_STACK_TRACE |
| if S.c.BPF_MAP[type] == S.c.BPF_MAP.STACK_TRACE then |
| key_ctype = ffi.typeof('int32_t') |
| val_ctype = ffi.typeof('struct bpf_stacktrace') |
| end |
| local fd, err = S.bpf_map_create(S.c.BPF_MAP[type], ffi.sizeof(key_ctype), ffi.sizeof(val_ctype), max_entries) |
| if not fd then return nil, tostring(err) end |
| local map = setmetatable({ |
| max_entries = max_entries, |
| key = ffi.new(ffi.typeof('$ [1]', key_ctype)), |
| val = ffi.new(ffi.typeof('$ [1]', val_ctype)), |
| map_type = S.c.BPF_MAP[type], |
| key_type = key_ctype, |
| val_type = val_ctype, |
| fd = fd:nogc():getfd(), |
| }, bpf_map_mt) |
| return map |
| end, |
| socket = function (sock, prog) |
| -- Expect socket type, if sock is string then assume it's |
| -- an interface name (e.g. 'lo'), if it's a number then typecast it as a socket |
| local ok, err |
| if type(sock) == 'string' then |
| local iface = assert(S.nl.getlink())[sock] |
| assert(iface, sock..' is not interface name') |
| sock, err = S.socket('packet', 'raw') |
| assert(sock, tostring(err)) |
| ok, err = sock:bind(S.t.sockaddr_ll({protocol='all', ifindex=iface.index})) |
| assert(ok, tostring(err)) |
| elseif type(sock) == 'number' then |
| sock = S.t.fd(sock):nogc() |
| elseif ffi.istype(S.t.fd, sock) then -- luacheck: ignore |
| -- No cast required |
| else |
| return nil, 'socket must either be an fd number, an interface name, or an ljsyscall socket' |
| end |
| -- Load program and attach it to socket |
| if type(prog) ~= 'table' then |
| prog = compile(prog, {proto.skb}) |
| end |
| local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.SOCKET_FILTER, prog.insn, prog.pc) |
| assert(prog_fd, tostring(err)..': '..tostring(log)) |
| assert(sock:setsockopt('socket', 'attach_bpf', prog_fd:getfd())) |
| return prog_fd, err |
| end, |
| tracepoint = function(tp, prog, pid, cpu, group_fd) |
| assert(trace_check_enabled()) |
| -- Return tracepoint instance if no program specified |
| -- this allows free specialisation of arg0 to tracepoint type |
| local probe = tracepoint_open(tp, pid, cpu, group_fd) |
| -- Load the BPF program |
| if prog then |
| probe:bpf(prog) |
| end |
| return probe |
| end, |
| kprobe = function(tp, prog, retprobe, pid, cpu, group_fd) |
| assert(trace_check_enabled()) |
| -- Open tracepoint and attach |
| local pname, pdef = tp:match('([^:]+):(.+)') |
| return trace_bpf('kprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd) |
| end, |
| uprobe = function(tp, prog, retprobe, pid, cpu, group_fd) |
| assert(trace_check_enabled()) |
| -- Translate symbol to address |
| local obj, sym_want = tp:match('([^:]+):(.+)') |
| if not S.statfs(obj) then return nil, S.t.error(S.c.E.NOENT) end |
| -- Resolve Elf object (no support for anything else) |
| local elf = require('bpf.elf').open(obj) |
| local sym = elf:resolve(sym_want) |
| if not sym then return nil, 'no such symbol' end |
| sym = sym.st_value - elf:loadaddr() |
| local sym_addr = string.format('%x%04x', tonumber(bit.rshift(sym, 32)), |
| tonumber(ffi.cast('uint32_t', sym))) |
| -- Convert it to expected uprobe format |
| local pname = string.format('%s_%s', obj:gsub('.*/', ''), sym_addr) |
| local pdef = obj..':0x'..sym_addr |
| return trace_bpf('uprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd) |
| end, |
| tracelog = function(path) |
| assert(trace_check_enabled()) |
| path = path or '/sys/kernel/debug/tracing/trace_pipe' |
| return io.open(path, 'r') |
| end, |
| ntoh = builtins.ntoh, hton = builtins.hton, |
| }, { |
| __call = function (_, prog) return compile(prog) end, |
| }) |